commit f213c8db68e0a328ac300544cc12b9c41bac9f05
Author: maderix <maderix@users.noreply.github.com>
Date:   Sat Feb 28 00:22:06 2026 -0800

    Initial release

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..2e0bef4
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2026 maderix
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d2c7bb2
--- /dev/null
+++ b/README.md
@@ -0,0 +1,111 @@
+# ANE Training — Backpropagation on Apple Neural Engine
+
+Training neural networks directly on Apple's Neural Engine (ANE) via reverse-engineered private APIs. No CoreML training APIs, no Metal, no GPU — pure ANE compute.
+
+## What This Is
+
+A from-scratch implementation of transformer training (forward + backward pass) running on the ANE in Apple Silicon. The ANE is a 15.8 TFLOPS (M4) inference accelerator that Apple does not expose for training. This project reverse-engineers the `_ANEClient` / `_ANECompiler` private APIs and the MIL (Model Intermediate Language) format to run custom compute graphs — including backpropagation — directly on ANE hardware.
+
+**Current results (M4, single transformer layer, dim=768, seq=512):**
+- 9.3 ms/step, 11.2% ANE utilization (1.78 TFLOPS sustained)
+- 6 ANE kernel dispatches per training step
+- All forward and backward dx passes on ANE, dW gradients on CPU (Accelerate cblas)
+- Adam optimizer, gradient accumulation, checkpoint/resume
+
+## Architecture
+
+The training loop uses 6 ANE kernels per step:
+
+| Kernel | Function | Weights |
+|--------|----------|---------|
+| `kFwdAttn` | RMSNorm + QKV projection + SDPA + output projection | Wq, Wk, Wv, Wo, rms1, mask |
+| `kFwdFFN` | RMSNorm + SwiGLU FFN (W1, W3, SiLU, W2) | W1, W2, W3, rms2 |
+| `kFFNBwd` | FFN backward (W2^T + SiLU_bwd + W1^T + W3^T) | W2^T, W1^T, W3^T |
+| `kSdpaBwd1` | Wo^T + SDPA backward part 1 (dV, probs, dp) | Wo^T, mask |
+| `kSdpaBwd2` | SDPA backward part 2 (softmax grad, dQ, dK) | — |
+| `kQKVb` | QKV backward (Wq^T + Wk^T + Wv^T → dx) | Wq^T, Wk^T, Wv^T |
+
+CPU handles: RMSNorm backward, residual connections, loss computation, dW gradient accumulation (cblas_sgemm), Adam optimizer updates.
+
+Key optimizations:
+- **Channel-first CPU layout** — matches ANE IOSurface `[1,C,1,S]` format, eliminates all transpose overhead
+- **vDSP vectorized RMSNorm** — 10x faster than naive (6.7ms → 0.7ms)
+- **GCD async cblas overlap** — dW gradient sgemms run in parallel with ANE evals on a serial dispatch queue
+- **Deferred cblas wait** — wait pushed into next step's forward pass for maximum overlap
+- **ANE RMSNorm fusion** — RMSNorm folded into forward kernels as MIL ops (reduce_sum + pow + mul)
+- **Wo^T fusion** — output projection backward merged into SDPA backward kernel
+- **Forward taps** — Q, K, V, attention scores, hidden states exposed via concat outputs, avoiding CPU recompute
+- **exec() restart** — bypasses ~119 ANE compile limit per process
+
+## File Structure
+
+```
+├── api_exploration.m       # Initial ANE API discovery
+├── inmem_basic.m           # In-memory MIL compilation proof-of-concept
+├── inmem_bench.m           # ANE dispatch latency benchmarks
+├── inmem_peak.m            # Peak TFLOPS measurement (2048x2048 matmul)
+├── sram_bench.m            # ANE SRAM bandwidth probing
+├── sram_probe.m            # SRAM size/layout exploration
+└── training/
+    ├── ane_runtime.h       # ANE private API wrapper (compile, eval, IOSurface)
+    ├── ane_mil_gen.h       # MIL program generation helpers
+    ├── model.h             # Model weight initialization and blob builders
+    ├── forward.h           # Forward pass MIL generators
+    ├── backward.h          # Backward pass MIL generators
+    ├── train.m             # Minimal training loop (early prototype)
+    ├── tiny_train.m        # 2-layer tiny model training
+    ├── train_large.m       # Main: single-layer dim=768 training (optimized)
+    ├── test_*.m            # Unit tests for individual kernels
+    └── Makefile
+```
+
+## Building
+
+Requires macOS 15+ on Apple Silicon (tested on M4).
+
+```bash
+# Build the main training program
+xcrun clang -O2 -framework Foundation -framework IOSurface \
+  -framework CoreML -framework Accelerate -ldl -lobjc \
+  -o train_large training/train_large.m
+
+# Run
+./train_large
+```
+
+No external dependencies. Uses only system frameworks + private ANE APIs resolved at runtime via `objc_msgSend`.
+
+## How It Works
+
+1. **MIL generation** — Objective-C code constructs MIL program text at runtime, specifying convolutions (for linear layers), matmul (for attention), softmax, element-wise ops
+2. **In-memory compilation** — `_ANEInMemoryModelDescriptor` compiles MIL text + weight blobs directly to ANE programs, no disk mlmodelc needed
+3. **IOSurface I/O** — Input/output tensors passed via IOSurface shared memory in `[1, channels, 1, spatial]` format (fp16)
+4. **Weight embedding** — Weights baked into ANE programs as BLOBFILE constants; recompiled each batch when weights change
+5. **Gradient flow** — Forward taps expose intermediates needed for backward; backward kernels compute dx (input gradients) on ANE; dW (weight gradients) computed on CPU via cblas
+
+## Limitations
+
+- **SDPA causal masking** — ANE hardware ignores `attn_mask` in SDPA ops; causal attention is decomposed into separate Q@K^T (ANE) → mask+softmax (ANE via add+softmax) → scores@V (ANE)
+- **~119 compile limit** — ANE compiler leaks resources; worked around via `exec()` restart with checkpoint
+- **Single layer** — Currently trains one transformer layer; multi-layer would need pipeline scheduling
+- **Synthetic data** — Currently uses random data for benchmarking; real tokenized data support is WIP
+
+## Performance History
+
+| Optimization | ms/step | ANE util |
+|---|---|---|
+| Baseline (vDSP transpose) | 33.5 | 3.1% |
+| Channel-first layout | 20.3 | 5.2% |
+| vDSP vectorized RMSNorm | 14.2 | 7.4% |
+| GCD async cblas overlap | 11.4 | 9.2% |
+| ANE RMSNorm fusion | 11.4 | 9.2% |
+| Wo^T fusion (7→6 kernels) | 11.4 | 9.2% |
+| Deferred cblas wait | **9.3** | **11.2%** |
+
+## Disclaimer
+
+This project is independent research into Apple Neural Engine architecture. It uses undocumented APIs discovered through runtime introspection for research and educational purposes under fair use and interoperability provisions (see *Sega v. Accolade*, 1992; DMCA §1201(f)). No Apple proprietary code or binaries are included in this repository. This project is not affiliated with or endorsed by Apple Inc. Use at your own risk.
+
+## License
+
+MIT — see [LICENSE](LICENSE)
diff --git a/api_exploration.m b/api_exploration.m
new file mode 100644
index 0000000..a778def
--- /dev/null
+++ b/api_exploration.m
@@ -0,0 +1,167 @@
+#import <Foundation/Foundation.h>
+#import <CoreML/CoreML.h>
+#import <objc/runtime.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <mach/mach_time.h>
+#import <IOSurface/IOSurface.h>
+
+static mach_timebase_info_data_t g_tb;
+static double ticksToMs(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }
+
+int main() {
+    @autoreleasepool {
+        mach_timebase_info(&g_tb);
+        dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
+
+        // === Approach 1: MLModelAsset from compiled .mlmodelc data ===
+        // First compile a known-working model to .mlmodelc
+        printf("=== Approach 1: MLModelAsset in-memory ===\n");
+        NSError *e = nil;
+        NSURL *src = [NSURL fileURLWithPath:@"/tmp/ane_sram_1024ch_64sp.mlpackage"];
+        NSURL *compiled = [MLModel compileModelAtURL:src error:&e];
+        if (e) { printf("Compile failed: %s\n", [[e description] UTF8String]); return 1; }
+        printf("Compiled to: %s\n", [[compiled path] UTF8String]);
+
+        // Read the model.mlmodel spec from the compiled bundle
+        // The spec is typically in coremldata.bin or model.mlmodel
+        NSFileManager *fm = [NSFileManager defaultManager];
+        NSArray *files = [fm contentsOfDirectoryAtPath:[compiled path] error:nil];
+        printf("Files in .mlmodelc:\n");
+        for (NSString *f in files) printf("  %s\n", [f UTF8String]);
+
+        // Try loading with MLModelAsset
+        // MLModelAsset has modelAssetWithURL: on macOS 15
+        if (@available(macOS 14.0, *)) {
+            // Read the spec data
+            NSString *specPath = [[compiled path] stringByAppendingPathComponent:@"coremldata.bin"];
+            if (![fm fileExistsAtPath:specPath]) {
+                specPath = [[compiled path] stringByAppendingPathComponent:@"model.mlmodel"];
+            }
+            NSData *specData = [NSData dataWithContentsOfFile:specPath];
+            printf("Spec data: %lu bytes from %s\n", (unsigned long)[specData length],
+                   [[specPath lastPathComponent] UTF8String]);
+
+            // Try MLModelAsset
+            Class assetClass = NSClassFromString(@"MLModelAsset");
+            if (assetClass) {
+                printf("MLModelAsset class found\n");
+                // List methods
+                unsigned int count;
+                Method *methods = class_copyMethodList(object_getClass(assetClass), &count);
+                for (unsigned int i = 0; i < count; i++)
+                    printf("  + %s\n", sel_getName(method_getName(methods[i])));
+                free(methods);
+            }
+        }
+
+        // === Approach 2: Read a .mlmodelc, extract MIL, feed to _ANEInMemoryModelDescriptor ===
+        printf("\n=== Approach 2: Inspect MIL in compiled model ===\n");
+        // Look for model.mil or any MIL file
+        NSDirectoryEnumerator *en = [fm enumeratorAtPath:[compiled path]];
+        NSString *f;
+        while ((f = [en nextObject])) {
+            NSString *full = [[compiled path] stringByAppendingPathComponent:f];
+            BOOL isDir;
+            [fm fileExistsAtPath:full isDirectory:&isDir];
+            if (!isDir) {
+                NSDictionary *attrs = [fm attributesOfItemAtPath:full error:nil];
+                printf("  %s (%llu bytes)\n", [f UTF8String],
+                       [[attrs objectForKey:NSFileSize] unsignedLongLongValue]);
+            }
+        }
+
+        // Try to find and read model.mil
+        NSString *milPath = [[compiled path] stringByAppendingPathComponent:@"model.mil"];
+        if ([fm fileExistsAtPath:milPath]) {
+            NSString *milText = [NSString stringWithContentsOfFile:milPath encoding:NSUTF8StringEncoding error:nil];
+            printf("\n=== model.mil contents (first 2000 chars) ===\n");
+            printf("%s\n", [[milText substringToIndex:MIN(2000, [milText length])] UTF8String]);
+        }
+
+        // Also check for mlmodelc structure
+        NSString *aneDir = nil;
+        en = [fm enumeratorAtPath:[compiled path]];
+        while ((f = [en nextObject])) {
+            if ([f hasSuffix:@".espresso.net"] || [f hasSuffix:@".hwx"] || [f hasSuffix:@".mil"]) {
+                printf("  FOUND: %s\n", [f UTF8String]);
+            }
+        }
+
+        // === Approach 3: Try _ANEInMemoryModelDescriptor with actual MIL from compiled model ===
+        printf("\n=== Approach 3: _ANEInMemoryModelDescriptor ===\n");
+        Class ANEInMemDesc = NSClassFromString(@"_ANEInMemoryModelDescriptor");
+        if (ANEInMemDesc) {
+            printf("Class exists. Methods:\n");
+            unsigned int count;
+            Method *methods = class_copyMethodList(object_getClass(ANEInMemDesc), &count);
+            for (unsigned int i = 0; i < count; i++) {
+                SEL s = method_getName(methods[i]);
+                printf("  + %s  (args: %d)\n", sel_getName(s), method_getNumberOfArguments(methods[i]));
+            }
+            free(methods);
+            methods = class_copyMethodList(ANEInMemDesc, &count);
+            printf("Instance methods:\n");
+            for (unsigned int i = 0; i < count; i++) {
+                SEL s = method_getName(methods[i]);
+                const char *enc = method_getTypeEncoding(methods[i]);
+                printf("  - %s  [%s]\n", sel_getName(s), enc ? enc : "?");
+            }
+            free(methods);
+
+            // If model.mil exists, try feeding it
+            if ([fm fileExistsAtPath:milPath]) {
+                NSString *milText = [NSString stringWithContentsOfFile:milPath encoding:NSUTF8StringEncoding error:nil];
+                printf("\nTrying modelWithMILText: with actual model.mil...\n");
+                id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(
+                    ANEInMemDesc, @selector(modelWithMILText:weights:optionsPlist:),
+                    milText, nil, nil);
+                printf("Result: %s\n", desc ? [[desc description] UTF8String] : "nil");
+
+                // Try with NSData
+                NSData *milData = [milText dataUsingEncoding:NSUTF8StringEncoding];
+                desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(
+                    ANEInMemDesc, @selector(modelWithMILText:weights:optionsPlist:),
+                    milData, nil, nil);
+                printf("Result (NSData): %s\n", desc ? [[desc description] UTF8String] : "nil");
+            }
+        } else {
+            printf("_ANEInMemoryModelDescriptor NOT FOUND\n");
+        }
+
+        // === Approach 4: Hook into what CoreML actually sends to ANE ===
+        printf("\n=== Approach 4: Trace CoreML -> ANE path ===\n");
+        // Load the model the normal working way and inspect the _ANEModel
+        MLModelConfiguration *config = [[MLModelConfiguration alloc] init];
+        config.computeUnits = MLComputeUnitsAll;
+        MLModel *model = [MLModel modelWithContentsOfURL:compiled configuration:config error:&e];
+        if (e) { printf("MLModel load failed: %s\n", [[e description] UTF8String]); return 1; }
+
+        // Try to get internal model object
+        printf("MLModel: %s\n", [[model description] UTF8String]);
+
+        // Check if we can access the ANE model through the MLModel
+        // Try KVC for internal properties
+        @try {
+            id engine = [model valueForKey:@"engine"];
+            printf("engine: %s\n", engine ? [[engine description] UTF8String] : "nil");
+        } @catch(NSException *ex) {
+            printf("No 'engine' key\n");
+        }
+        @try {
+            id proxy = [model valueForKey:@"proxy"];
+            printf("proxy: %s\n", proxy ? [NSStringFromClass([proxy class]) UTF8String] : "nil");
+        } @catch(NSException *ex) {
+            printf("No 'proxy' key\n");
+        }
+
+        // Check MLNeuralNetworkEngine or MLANEEngine
+        Class aneEngine = NSClassFromString(@"MLANEEngine");
+        Class nnEngine = NSClassFromString(@"MLNeuralNetworkEngine");
+        Class milEngine = NSClassFromString(@"MLMILComputeEngine");
+        printf("MLANEEngine: %s\n", aneEngine ? "exists" : "not found");
+        printf("MLNeuralNetworkEngine: %s\n", nnEngine ? "exists" : "not found");
+        printf("MLMILComputeEngine: %s\n", milEngine ? "exists" : "not found");
+    }
+    return 0;
+}
diff --git a/inmem_basic.m b/inmem_basic.m
new file mode 100644
index 0000000..2a54505
--- /dev/null
+++ b/inmem_basic.m
@@ -0,0 +1,129 @@
+#import <Foundation/Foundation.h>
+#import <CoreML/CoreML.h>
+#import <objc/runtime.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <mach/mach_time.h>
+#import <IOSurface/IOSurface.h>
+
+static mach_timebase_info_data_t g_tb;
+static double ticksToMs(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }
+
+int main() {
+    @autoreleasepool {
+        mach_timebase_info(&g_tb);
+        dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
+
+        NSError *e = nil;
+        int ch = 256, sp = 64;
+
+        // Get MIL and weights from a compiled model
+        NSURL *compiled = [MLModel compileModelAtURL:
+            [NSURL fileURLWithPath:@"/tmp/ane_sram_256ch_64sp.mlpackage"] error:&e];
+        if (e) { printf("Compile failed\n"); return 1; }
+
+        NSData *milData = [[NSString stringWithContentsOfFile:
+            [[compiled path] stringByAppendingPathComponent:@"model.mil"]
+            encoding:NSUTF8StringEncoding error:nil] dataUsingEncoding:NSUTF8StringEncoding];
+        NSData *weightBlob = [NSData dataWithContentsOfFile:
+            [[compiled path] stringByAppendingPathComponent:@"weights/weight.bin"]];
+
+        Class Desc = NSClassFromString(@"_ANEInMemoryModelDescriptor");
+        Class IMM = NSClassFromString(@"_ANEInMemoryModel");
+        Class AR = NSClassFromString(@"_ANERequest");
+        Class AIO = NSClassFromString(@"_ANEIOSurfaceObject");
+
+        NSDictionary *wdict = @{
+            @"@model_path/weights/weight.bin": @{@"offset": @64, @"data": weightBlob}
+        };
+        id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(
+            Desc, @selector(modelWithMILText:weights:optionsPlist:),
+            milData, wdict, nil);
+        id model = ((id(*)(Class,SEL,id))objc_msgSend)(IMM, @selector(inMemoryModelWithDescriptor:), desc);
+
+        // Get the hex identifier to pre-populate the temp dir
+        id hexId = ((id(*)(id,SEL))objc_msgSend)(model, @selector(hexStringIdentifier));
+        NSString *tmpDir = [NSTemporaryDirectory() stringByAppendingPathComponent:hexId];
+        NSFileManager *fm = [NSFileManager defaultManager];
+
+        // Pre-create dir with MIL and weights
+        [fm createDirectoryAtPath:[tmpDir stringByAppendingPathComponent:@"weights"]
+            withIntermediateDirectories:YES attributes:nil error:nil];
+        [milData writeToFile:[tmpDir stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+        [weightBlob writeToFile:[tmpDir stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
+        printf("Pre-created: %s\n", [tmpDir UTF8String]);
+
+        // Compile
+        printf("Compiling...\n");
+        BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
+            model, @selector(compileWithQoS:options:error:), 21, @{}, &e);
+        printf("compile: %s\n", ok ? "YES" : "NO");
+        if (e) { printf("  err: %s\n", [[e description] UTF8String]); e=nil; }
+        if (!ok) return 1;
+
+        // Load
+        printf("Loading...\n");
+        ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
+            model, @selector(loadWithQoS:options:error:), 21, @{}, &e);
+        printf("load: %s\n", ok ? "YES" : "NO");
+        if (e) { printf("  err: %s\n", [[e description] UTF8String]); e=nil; }
+        if (!ok) return 1;
+
+        printf("state: %lu\n", ((NSUInteger(*)(id,SEL))objc_msgSend)(model, @selector(state)));
+
+        // Create IO surfaces
+        NSUInteger bytes = ch * sp * 4;
+        IOSurfaceRef ioIn = IOSurfaceCreate((__bridge CFDictionaryRef)@{
+            (id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,
+            (id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),
+            (id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0});
+        IOSurfaceRef ioOut = IOSurfaceCreate((__bridge CFDictionaryRef)@{
+            (id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,
+            (id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),
+            (id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0});
+        id wIn = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO, @selector(objectWithIOSurface:), ioIn);
+        id wOut = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO, @selector(objectWithIOSurface:), ioOut);
+        id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(AR,
+            @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+            @[wIn], @[@0], @[wOut], @[@0], nil, nil, @0);
+
+        // Evaluate
+        printf("Evaluating...\n");
+        ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+            model, @selector(evaluateWithQoS:options:request:error:),
+            21, @{}, req, &e);
+        printf("evaluate: %s\n", ok ? "YES" : "NO");
+        if (e) { printf("  err: %s\n", [[e description] UTF8String]); e=nil; }
+
+        if (ok) {
+            // Warmup
+            for (int i = 0; i < 10; i++)
+                ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+                    model, @selector(evaluateWithQoS:options:request:error:),
+                    21, @{}, req, &e);
+
+            // Benchmark
+            int iters = 100;
+            uint64_t t0 = mach_absolute_time();
+            for (int i = 0; i < iters; i++)
+                ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+                    model, @selector(evaluateWithQoS:options:request:error:),
+                    21, @{}, req, &e);
+            double ms = ticksToMs(mach_absolute_time() - t0) / iters;
+            double gf = 2.0*ch*ch*sp/1e9;
+            double tflops = gf / ms;
+
+            printf("\n========================================\n");
+            printf("IN-MEMORY ANE EXECUTION SUCCESSFUL!\n");
+            printf("  %.3f ms/eval, %.2f TFLOPS\n", ms, tflops);
+            printf("========================================\n");
+        }
+
+        // Cleanup
+        ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(
+            model, @selector(unloadWithQoS:error:), 21, &e);
+        CFRelease(ioIn); CFRelease(ioOut);
+        [fm removeItemAtPath:tmpDir error:nil];
+    }
+    return 0;
+}
diff --git a/inmem_bench.m b/inmem_bench.m
new file mode 100644
index 0000000..8a5af33
--- /dev/null
+++ b/inmem_bench.m
@@ -0,0 +1,111 @@
+#import <Foundation/Foundation.h>
+#import <CoreML/CoreML.h>
+#import <objc/runtime.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <mach/mach_time.h>
+#import <IOSurface/IOSurface.h>
+
+static mach_timebase_info_data_t g_tb;
+static double ticksToMs(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }
+
+double benchInMem(int ch, int sp) {
+    @autoreleasepool {
+        NSError *e = nil;
+        NSString *path = [NSString stringWithFormat:@"/tmp/ane_sram_%dch_%dsp.mlpackage", ch, sp];
+        NSURL *compiled = [MLModel compileModelAtURL:[NSURL fileURLWithPath:path] error:&e];
+        if (e) return -1;
+
+        NSData *milData = [[NSString stringWithContentsOfFile:
+            [[compiled path] stringByAppendingPathComponent:@"model.mil"]
+            encoding:NSUTF8StringEncoding error:nil] dataUsingEncoding:NSUTF8StringEncoding];
+        NSData *weightBlob = [NSData dataWithContentsOfFile:
+            [[compiled path] stringByAppendingPathComponent:@"weights/weight.bin"]];
+
+        Class Desc = NSClassFromString(@"_ANEInMemoryModelDescriptor");
+        Class IMM = NSClassFromString(@"_ANEInMemoryModel");
+        Class AR = NSClassFromString(@"_ANERequest");
+        Class AIO = NSClassFromString(@"_ANEIOSurfaceObject");
+
+        NSDictionary *wdict = @{
+            @"@model_path/weights/weight.bin": @{@"offset": @64, @"data": weightBlob}
+        };
+        id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(
+            Desc, @selector(modelWithMILText:weights:optionsPlist:),
+            milData, wdict, nil);
+        if (!desc) return -2;
+        id model = ((id(*)(Class,SEL,id))objc_msgSend)(IMM, @selector(inMemoryModelWithDescriptor:), desc);
+        if (!model) return -3;
+
+        id hexId = ((id(*)(id,SEL))objc_msgSend)(model, @selector(hexStringIdentifier));
+        NSString *tmpDir = [NSTemporaryDirectory() stringByAppendingPathComponent:hexId];
+        NSFileManager *fm = [NSFileManager defaultManager];
+        [fm createDirectoryAtPath:[tmpDir stringByAppendingPathComponent:@"weights"]
+            withIntermediateDirectories:YES attributes:nil error:nil];
+        [milData writeToFile:[tmpDir stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+        [weightBlob writeToFile:[tmpDir stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
+
+        BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
+            model, @selector(compileWithQoS:options:error:), 21, @{}, &e);
+        if (!ok) { [fm removeItemAtPath:tmpDir error:nil]; return -4; }
+
+        ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
+            model, @selector(loadWithQoS:options:error:), 21, @{}, &e);
+        if (!ok) { [fm removeItemAtPath:tmpDir error:nil]; return -5; }
+
+        NSUInteger bytes = ch * sp * 4;
+        IOSurfaceRef ioIn = IOSurfaceCreate((__bridge CFDictionaryRef)@{
+            (id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,
+            (id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),
+            (id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0});
+        IOSurfaceRef ioOut = IOSurfaceCreate((__bridge CFDictionaryRef)@{
+            (id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,
+            (id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),
+            (id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0});
+        id wIn = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO, @selector(objectWithIOSurface:), ioIn);
+        id wOut = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO, @selector(objectWithIOSurface:), ioOut);
+        id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(AR,
+            @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+            @[wIn], @[@0], @[wOut], @[@0], nil, nil, @0);
+
+        for (int i = 0; i < 5; i++)
+            ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+                model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
+
+        int iters = 50;
+        uint64_t t0 = mach_absolute_time();
+        for (int i = 0; i < iters; i++)
+            ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+                model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
+        double ms = ticksToMs(mach_absolute_time() - t0) / iters;
+
+        ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(model, @selector(unloadWithQoS:error:), 21, &e);
+        CFRelease(ioIn); CFRelease(ioOut);
+        [fm removeItemAtPath:tmpDir error:nil];
+        return ms;
+    }
+}
+
+int main() {
+    mach_timebase_info(&g_tb);
+    dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
+
+    printf("=== In-Memory ANE Benchmark ===\n\n");
+    printf("%-12s %8s %10s %8s\n", "Config", "W (MB)", "ms/eval", "TFLOPS");
+    printf("---------------------------------------------\n");
+
+    int chs[] = {256, 512, 1024, 2048, 3072, 4096};
+    int sps[] = {64,  64,  64,   64,   64,   64};
+    for (int i = 0; i < 6; i++) {
+        int ch = chs[i], sp = sps[i];
+        double w_mb = (double)ch*ch*2/1024/1024;
+        double gf = 2.0*ch*ch*sp/1e9;
+        double ms = benchInMem(ch, sp);
+        double tflops = (ms > 0) ? gf/ms : 0;
+        if (ms > 0)
+            printf("%4dch x%2dsp %7.1f  %8.3f ms %7.2f\n", ch, sp, w_mb, ms, tflops);
+        else
+            printf("%4dch x%2dsp %7.1f  FAIL(%.0f)\n", ch, sp, w_mb, ms);
+    }
+    return 0;
+}
diff --git a/inmem_peak.m b/inmem_peak.m
new file mode 100644
index 0000000..87b8163
--- /dev/null
+++ b/inmem_peak.m
@@ -0,0 +1,111 @@
+#import <Foundation/Foundation.h>
+#import <CoreML/CoreML.h>
+#import <objc/runtime.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <mach/mach_time.h>
+#import <IOSurface/IOSurface.h>
+
+static mach_timebase_info_data_t g_tb;
+static double ticksToMs(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }
+
+NSData *buildWeightBlob(int ch, int depth) {
+    NSUInteger wsize = ch * ch * 2;
+    NSUInteger chunkSize = 64 + wsize;
+    NSUInteger total = 64 + chunkSize * depth;
+    uint8_t *buf = calloc(total, 1);
+    buf[0] = 0x01; buf[4] = 0x02;
+    for (int i = 0; i < depth; i++) {
+        uint8_t *chunk = buf + 64 + i * chunkSize;
+        chunk[0]=0xEF; chunk[1]=0xBE; chunk[2]=0xAD; chunk[3]=0xDE;
+        chunk[4]=0x01; chunk[10]=0x08;
+        uint16_t *fp16 = (uint16_t*)(chunk + 64);
+        for (NSUInteger j = 0; j < wsize/2; j++) fp16[j] = (arc4random()&0x03FF)|0x2000;
+    }
+    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
+}
+
+NSString *genMIL(int ch, int sp, int depth) {
+    NSMutableString *m = [NSMutableString string];
+    [m appendString:@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, {\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, {\"coremltools-version\", \"9.0\"}})]\n{\n"];
+    [m appendFormat:@"    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n", ch, sp];
+    [m appendString:@"            string c_pad_type_0 = const()[name = string(\"c_pad_type_0\"), val = string(\"valid\")];\n"
+        @"            tensor<int32, [2]> c_strides_0 = const()[name = string(\"c_strides_0\"), val = tensor<int32, [2]>([1, 1])];\n"
+        @"            tensor<int32, [4]> c_pad_0 = const()[name = string(\"c_pad_0\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+        @"            tensor<int32, [2]> c_dilations_0 = const()[name = string(\"c_dilations_0\"), val = tensor<int32, [2]>([1, 1])];\n"
+        @"            int32 c_groups_0 = const()[name = string(\"c_groups_0\"), val = int32(1)];\n"
+        @"            string x_to_fp16_dtype_0 = const()[name = string(\"x_to_fp16_dtype_0\"), val = string(\"fp16\")];\n"];
+    [m appendFormat:@"            tensor<fp16, [1, %d, 1, %d]> x_to_fp16 = cast(dtype = x_to_fp16_dtype_0, x = x)[name = string(\"cast_in\")];\n", ch, sp];
+    NSUInteger cs = 64 + ch*ch*2;
+    NSString *prev = @"x_to_fp16";
+    for (int i = 0; i < depth; i++) {
+        [m appendFormat:@"            tensor<fp16, [%d, %d, 1, 1]> W%d = const()[name = string(\"W%d\"), val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n",
+            ch, ch, i, i, ch, ch, (unsigned long)(64 + i*cs)];
+        NSString *out = [NSString stringWithFormat:@"c%d", i];
+        [m appendFormat:@"            tensor<fp16, [1, %d, 1, %d]> %@ = conv(dilations = c_dilations_0, groups = c_groups_0, pad = c_pad_0, pad_type = c_pad_type_0, strides = c_strides_0, weight = W%d, x = %@)[name = string(\"%@\")];\n",
+            ch, sp, out, i, prev, out];
+        prev = out;
+    }
+    [m appendString:@"            string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"];
+    [m appendFormat:@"            tensor<fp32, [1, %d, 1, %d]> c = cast(dtype = to_fp32, x = %@)[name = string(\"cast_out\")];\n", ch, sp, prev];
+    [m appendString:@"        } -> (c);\n}\n"];
+    return m;
+}
+
+double bench(int ch, int sp, int depth) {
+    @autoreleasepool {
+        NSError *e = nil;
+        NSData *milData = [[genMIL(ch,sp,depth) dataUsingEncoding:NSUTF8StringEncoding] copy];
+        NSData *wb = buildWeightBlob(ch, depth);
+        Class D=NSClassFromString(@"_ANEInMemoryModelDescriptor"), I=NSClassFromString(@"_ANEInMemoryModel");
+        Class AR=NSClassFromString(@"_ANERequest"), AIO=NSClassFromString(@"_ANEIOSurfaceObject");
+        id desc=((id(*)(Class,SEL,id,id,id))objc_msgSend)(D,@selector(modelWithMILText:weights:optionsPlist:),milData,@{@"@model_path/weights/weight.bin":@{@"offset":@0,@"data":wb}},nil);
+        if(!desc)return -1;
+        id mdl=((id(*)(Class,SEL,id))objc_msgSend)(I,@selector(inMemoryModelWithDescriptor:),desc);
+        id hx=((id(*)(id,SEL))objc_msgSend)(mdl,@selector(hexStringIdentifier));
+        NSString *td=[NSTemporaryDirectory() stringByAppendingPathComponent:hx];
+        NSFileManager *fm=[NSFileManager defaultManager];
+        [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];
+        [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+        [wb writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
+        if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(compileWithQoS:options:error:),21,@{},&e)){[fm removeItemAtPath:td error:nil];return -3;}
+        if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(loadWithQoS:options:error:),21,@{},&e)){[fm removeItemAtPath:td error:nil];return -4;}
+        NSUInteger bytes=ch*sp*4;
+        IOSurfaceRef ioI=IOSurfaceCreate((__bridge CFDictionaryRef)@{(id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,(id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),(id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0});
+        IOSurfaceRef ioO=IOSurfaceCreate((__bridge CFDictionaryRef)@{(id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,(id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),(id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0});
+        id wI=((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO,@selector(objectWithIOSurface:),ioI);
+        id wO=((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO,@selector(objectWithIOSurface:),ioO);
+        id req=((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(AR,@selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),@[wI],@[@0],@[wO],@[@0],nil,nil,@0);
+        for(int i=0;i<10;i++)((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(mdl,@selector(evaluateWithQoS:options:request:error:),21,@{},req,&e);
+        int it=50; uint64_t t0=mach_absolute_time();
+        for(int i=0;i<it;i++)((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(mdl,@selector(evaluateWithQoS:options:request:error:),21,@{},req,&e);
+        double ms=ticksToMs(mach_absolute_time()-t0)/it;
+        ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl,@selector(unloadWithQoS:error:),21,&e);
+        CFRelease(ioI);CFRelease(ioO);[fm removeItemAtPath:td error:nil];
+        return ms;
+    }
+}
+
+int main() {
+    mach_timebase_info(&g_tb);
+    dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine",RTLD_NOW);
+    printf("=== Programmatic MIL → In-Memory ANE Peak ===\n\n");
+    printf("%-28s %7s %7s %9s %7s %6s\n","Config","W(MB)","GFLOP","ms/eval","TFLOPS","%%peak");
+    printf("----------------------------------------------------------------------\n");
+    typedef struct{int c,s,d;}C;
+    C cf[]={
+        {512,64,32},{512,64,48},{512,64,64},{512,64,96},{512,64,128},
+        {256,64,64},{256,64,128},{256,64,256},
+        {384,64,64},{384,64,128},
+    };
+    for(int i=0;i<10;i++){
+        int c=cf[i].c,s=cf[i].s,d=cf[i].d;
+        double w=(double)c*c*2*d/1024/1024, gf=2.0*c*c*s*d/1e9;
+        char l[64]; snprintf(l,64,"%dx conv %dch sp%d",d,c,s);
+        double ms=bench(c,s,d);
+        double tf=ms>0?gf/ms:0;
+        if(ms>0)printf("%-28s %6.1f  %6.2f  %7.3f ms %6.2f  %5.1f%%\n",l,w,gf,ms,tf,tf/0.019*100);
+        else printf("%-28s %6.1f  %6.2f  FAIL(%.0f)\n",l,w,gf,ms);
+    }
+    return 0;
+}
diff --git a/sram_bench.m b/sram_bench.m
new file mode 100644
index 0000000..9dc3a35
--- /dev/null
+++ b/sram_bench.m
@@ -0,0 +1,101 @@
+#import <Foundation/Foundation.h>
+#import <CoreML/CoreML.h>
+#import <objc/runtime.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <mach/mach_time.h>
+#import <IOSurface/IOSurface.h>
+
+static mach_timebase_info_data_t g_tb;
+static double ticksToMs(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }
+static id g_client;
+static Class AM, AR, AIO;
+
+double bench(const char *path, int ch, int sp) {
+    @autoreleasepool {
+        NSError *e = nil;
+        NSURL *compiled = [MLModel compileModelAtURL:
+            [NSURL fileURLWithPath:[NSString stringWithUTF8String:path]] error:&e];
+        if (e) return -1;
+        id model = ((id(*)(Class,SEL,id,id))objc_msgSend)(AM, @selector(modelAtURL:key:), compiled, @"s");
+        BOOL ok = ((BOOL(*)(id,SEL,id,id,NSUInteger,NSError**))objc_msgSend)(
+            g_client, @selector(compileModel:options:qos:error:), model,
+            @{@"kANEFModelType":@"kANEFModelMIL",@"kANEFNetPlistFilenameKey":@"model.mil"}, 21, &e);
+        if (!ok) return -2;
+        ok = ((BOOL(*)(id,SEL,id,id,NSUInteger,NSError**))objc_msgSend)(
+            g_client, @selector(loadModel:options:qos:error:), model, @{}, 21, &e);
+        if (!ok) return -3;
+
+        NSUInteger bytes = ch * sp * 4; // FP32 input
+        IOSurfaceRef ioIn = IOSurfaceCreate((__bridge CFDictionaryRef)@{
+            (id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,
+            (id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),
+            (id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0});
+        IOSurfaceRef ioOut = IOSurfaceCreate((__bridge CFDictionaryRef)@{
+            (id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,
+            (id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),
+            (id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0});
+
+        id wIn = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO, @selector(objectWithIOSurface:), ioIn);
+        id wOut = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO, @selector(objectWithIOSurface:), ioOut);
+        id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(AR,
+            @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+            @[wIn], @[@0], @[wOut], @[@0], nil, nil, @0);
+
+        for (int i = 0; i < 5; i++)
+            ((BOOL(*)(id,SEL,id,id,id,NSUInteger,NSError**))objc_msgSend)(
+                g_client, @selector(evaluateWithModel:options:request:qos:error:), model, @{}, req, 21, &e);
+
+        int iters = 30;
+        uint64_t t0 = mach_absolute_time();
+        for (int i = 0; i < iters; i++)
+            ((BOOL(*)(id,SEL,id,id,id,NSUInteger,NSError**))objc_msgSend)(
+                g_client, @selector(evaluateWithModel:options:request:qos:error:), model, @{}, req, 21, &e);
+        double ms = ticksToMs(mach_absolute_time() - t0) / iters;
+
+        ((void(*)(id,SEL,id,id,NSUInteger,NSError**))objc_msgSend)(
+            g_client, @selector(unloadModel:options:qos:error:), model, @{}, 21, &e);
+        CFRelease(ioIn); CFRelease(ioOut);
+        return ms;
+    }
+}
+
+int main() {
+    mach_timebase_info(&g_tb);
+    dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
+    g_client = [NSClassFromString(@"_ANEClient") performSelector:@selector(sharedConnection)];
+    AM = NSClassFromString(@"_ANEModel");
+    AR = NSClassFromString(@"_ANERequest");
+    AIO = NSClassFromString(@"_ANEIOSurfaceObject");
+
+    printf("=== ANE SRAM Probe: 1x1 Conv with Increasing Weight Size ===\n\n");
+    printf("%-25s %8s %8s %8s %10s %8s\n", "Config", "W (MB)", "Act(MB)", "Tot(MB)", "ms/eval", "TFLOPS");
+    printf("--------------------------------------------------------------------------\n");
+
+    typedef struct { int ch; int sp; } S;
+    S sizes[] = {{256,64},{512,64},{1024,64},{2048,64},{3072,64},{4096,64},{5120,64},{6144,64},{8192,32}};
+
+    for (int i = 0; i < 9; i++) {
+        int ch = sizes[i].ch, sp = sizes[i].sp;
+        double w_mb = (double)ch * ch * 2 / 1024 / 1024;  // FP16 weights
+        double a_mb = (double)ch * sp * 2 / 1024 / 1024;   // FP16 activations
+        double tot = w_mb + 2 * a_mb;
+        double gflop = 2.0 * ch * ch * sp / 1e9;
+
+        char path[256];
+        snprintf(path, sizeof(path), "/tmp/ane_sram_%dch_%dsp.mlpackage", ch, sp);
+        double ms = bench(path, ch, sp);
+
+        double tflops = (ms > 0) ? gflop / ms : -1;
+        char label[64];
+        snprintf(label, sizeof(label), "%dch x %dsp", ch, sp);
+
+        if (ms > 0)
+            printf("%-25s %7.1f  %7.2f  %7.1f  %8.3f ms %7.2f\n", label, w_mb, a_mb, tot, ms, tflops);
+        else
+            printf("%-25s %7.1f  %7.2f  %7.1f  FAIL(%.0f)\n", label, w_mb, a_mb, tot, ms);
+    }
+
+    printf("\nLook for the performance cliff to estimate SRAM size.\n");
+    return 0;
+}
diff --git a/sram_probe.m b/sram_probe.m
new file mode 100644
index 0000000..0766187
--- /dev/null
+++ b/sram_probe.m
@@ -0,0 +1,83 @@
+#import <Foundation/Foundation.h>
+#import <CoreML/CoreML.h>
+#import <objc/runtime.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <mach/mach_time.h>
+#import <IOSurface/IOSurface.h>
+
+static mach_timebase_info_data_t g_tb;
+static double ticksToMs(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }
+static id g_client; static Class AM, AR, AIO;
+
+double bench(const char *path, int ch, int sp) {
+    @autoreleasepool {
+        NSError *e = nil;
+        NSURL *compiled = [MLModel compileModelAtURL:
+            [NSURL fileURLWithPath:[NSString stringWithUTF8String:path]] error:&e];
+        if (e) return -1;
+        id model = ((id(*)(Class,SEL,id,id))objc_msgSend)(AM, @selector(modelAtURL:key:), compiled, @"s");
+        ((BOOL(*)(id,SEL,id,id,NSUInteger,NSError**))objc_msgSend)(
+            g_client, @selector(compileModel:options:qos:error:), model,
+            @{@"kANEFModelType":@"kANEFModelMIL",@"kANEFNetPlistFilenameKey":@"model.mil"}, 21, &e);
+        ((BOOL(*)(id,SEL,id,id,NSUInteger,NSError**))objc_msgSend)(
+            g_client, @selector(loadModel:options:qos:error:), model, @{}, 21, &e);
+        NSUInteger bytes = ch * sp * 4;
+        IOSurfaceRef ioIn = IOSurfaceCreate((__bridge CFDictionaryRef)@{
+            (id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,
+            (id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),
+            (id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0});
+        IOSurfaceRef ioOut = IOSurfaceCreate((__bridge CFDictionaryRef)@{
+            (id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,
+            (id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),
+            (id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0});
+        id wIn = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO, @selector(objectWithIOSurface:), ioIn);
+        id wOut = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO, @selector(objectWithIOSurface:), ioOut);
+        id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(AR,
+            @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+            @[wIn], @[@0], @[wOut], @[@0], nil, nil, @0);
+        for (int i = 0; i < 5; i++)
+            ((BOOL(*)(id,SEL,id,id,id,NSUInteger,NSError**))objc_msgSend)(
+                g_client, @selector(evaluateWithModel:options:request:qos:error:), model, @{}, req, 21, &e);
+        int iters = 50;
+        uint64_t t0 = mach_absolute_time();
+        for (int i = 0; i < iters; i++)
+            ((BOOL(*)(id,SEL,id,id,id,NSUInteger,NSError**))objc_msgSend)(
+                g_client, @selector(evaluateWithModel:options:request:qos:error:), model, @{}, req, 21, &e);
+        double ms = ticksToMs(mach_absolute_time() - t0) / iters;
+        ((void(*)(id,SEL,id,id,NSUInteger,NSError**))objc_msgSend)(
+            g_client, @selector(unloadModel:options:qos:error:), model, @{}, 21, &e);
+        CFRelease(ioIn); CFRelease(ioOut);
+        return ms;
+    }
+}
+
+int main() {
+    mach_timebase_info(&g_tb);
+    dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
+    g_client = [NSClassFromString(@"_ANEClient") performSelector:@selector(sharedConnection)];
+    AM = NSClassFromString(@"_ANEModel"); AR = NSClassFromString(@"_ANERequest");
+    AIO = NSClassFromString(@"_ANEIOSurfaceObject");
+
+    printf("=== ANE SRAM Fine Probe (weights only vary, spatial=64) ===\n\n");
+    printf("%-12s %8s %10s %8s %12s\n", "Channels", "W (MB)", "ms/eval", "TFLOPS", "GFLOPS/MB");
+    printf("--------------------------------------------------------------\n");
+
+    int chs[] = {256, 512, 1024, 1536, 2048, 2560, 3072, 3584, 4096, 4608, 5120, 6144, 8192};
+    int sps[] = {64,  64,  64,   64,   64,   64,   64,   64,   64,   64,   64,   64,   32};
+
+    for (int i = 0; i < 13; i++) {
+        int ch = chs[i], sp = sps[i];
+        double w_mb = (double)ch * ch * 2 / 1024 / 1024;
+        double gf = 2.0 * ch * ch * sp / 1e9;
+        char path[256];
+        snprintf(path, sizeof(path), "/tmp/ane_sram_%dch_%dsp.mlpackage", ch, sp);
+        double ms = bench(path, ch, sp);
+        double tf = (ms > 0) ? gf / ms : 0;
+        double eff = (ms > 0) ? tf * 1000 / w_mb : 0;
+        printf("%6d ch   %7.1f  %8.3f ms %7.2f  %10.1f %s\n",
+               ch, w_mb, ms, tf, eff,
+               (i > 0 && eff < 100) ? " <-- spilling?" : "");
+    }
+    return 0;
+}
diff --git a/training/Makefile b/training/Makefile
new file mode 100644
index 0000000..90c2977
--- /dev/null
+++ b/training/Makefile
@@ -0,0 +1,12 @@
+CC = xcrun clang
+CFLAGS = -O2 -Wall -Wno-deprecated-declarations -fobjc-arc
+FRAMEWORKS = -framework Foundation -framework CoreML -framework IOSurface
+LDFLAGS = $(FRAMEWORKS) -ldl
+
+train: train.m ane_runtime.h ane_mil_gen.h model.h forward.h backward.h
+	$(CC) $(CFLAGS) -o $@ train.m $(LDFLAGS)
+
+clean:
+	rm -f train
+
+.PHONY: clean
diff --git a/training/ane_mil_gen.h b/training/ane_mil_gen.h
new file mode 100644
index 0000000..97fc451
--- /dev/null
+++ b/training/ane_mil_gen.h
@@ -0,0 +1,208 @@
+// ane_mil_gen.h — Generate MIL text for conv-based linear ops + weight blobs
+#pragma once
+#import <Foundation/Foundation.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+// Build an FP16 weight blob with the required header structure.
+// weights_f32: source weights in row-major [out_ch, in_ch]
+// Returns NSData with header + FP16 weights
+static NSData *mil_build_weight_blob(const float *weights_f32, int out_ch, int in_ch) {
+    NSUInteger wsize = (NSUInteger)out_ch * in_ch * 2; // FP16
+    NSUInteger total = 64 + 64 + wsize; // global header + chunk header + data
+    uint8_t *buf = (uint8_t*)calloc(total, 1);
+    buf[0] = 0x01; buf[4] = 0x02;
+    uint8_t *chunk = buf + 64;
+    chunk[0] = 0xEF; chunk[1] = 0xBE; chunk[2] = 0xAD; chunk[3] = 0xDE;
+    chunk[4] = 0x01;
+    *(uint32_t*)(chunk + 8) = (uint32_t)wsize;   // data_size
+    *(uint32_t*)(chunk + 16) = 128;               // data_offset (from file start)
+    // Convert f32 → fp16 (simple truncation via _Float16)
+    _Float16 *fp16 = (_Float16*)(buf + 128);
+    for (NSUInteger i = 0; i < (NSUInteger)out_ch * in_ch; i++)
+        fp16[i] = (_Float16)weights_f32[i];
+    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
+}
+
+// Generate MIL for a single matmul: y = W @ x (using matmul op, weights as input)
+// Input x: [1, in_ch, spatial] fp32
+// Input W: [1, out_ch, in_ch] fp32
+// Output:  [1, out_ch, spatial] fp32
+static NSString *mil_gen_matmul(int in_ch, int out_ch, int spatial) {
+    return [NSString stringWithFormat:
+        @"program(1.3)\n"
+        "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
+        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
+        "{\"coremltools-version\", \"9.0\"}})]\n"
+        "{\n"
+        "    func main<ios18>(tensor<fp32, [1, %d, %d]> x, tensor<fp32, [1, %d, %d]> W) {\n"
+        "        string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n"
+        "        tensor<fp16, [1, %d, %d]> x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_x\")];\n"
+        "        tensor<fp16, [1, %d, %d]> W16 = cast(dtype = to_fp16, x = W)[name = string(\"cast_W\")];\n"
+        "        bool tx = const()[name = string(\"tx\"), val = bool(false)];\n"
+        "        bool ty = const()[name = string(\"ty\"), val = bool(false)];\n"
+        "        tensor<fp16, [1, %d, %d]> y16 = matmul(transpose_x = tx, transpose_y = ty, x = W16, y = x16)[name = string(\"mm\")];\n"
+        "        string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"
+        "        tensor<fp32, [1, %d, %d]> y = cast(dtype = to_fp32, x = y16)[name = string(\"cast_out\")];\n"
+        "    } -> (y);\n"
+        "}\n",
+        in_ch, spatial, out_ch, in_ch,
+        in_ch, spatial, out_ch, in_ch,
+        out_ch, spatial, out_ch, spatial];
+}
+
+// Keep the baked-weight version for reference (used in inference-only scenarios)
+static NSString *mil_gen_conv(int in_ch, int out_ch, int spatial) {
+    return [NSString stringWithFormat:
+        @"program(1.3)\n"
+        "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
+        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
+        "{\"coremltools-version\", \"9.0\"}})]\n"
+        "{\n"
+        "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+        "        string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n"
+        "        tensor<int32, [2]> c_strides = const()[name = string(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, [4]> c_pad = const()[name = string(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+        "        tensor<int32, [2]> c_dilations = const()[name = string(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n"
+        "        string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> y16 = conv(dilations = c_dilations, groups = c_groups, "
+        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W, x = x16)[name = string(\"conv\")];\n"
+        "        string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = to_fp32, x = y16)[name = string(\"cast_out\")];\n"
+        "    } -> (y);\n"
+        "}\n",
+        in_ch, spatial, in_ch, spatial,
+        out_ch, in_ch, out_ch, in_ch,
+        out_ch, spatial, out_ch, spatial];
+}
+
+// Generate MIL for fused QKV: 3 parallel convs from same input
+// Input:  [1, dim, 1, S]
+// Outputs: Q[1, dim, 1, S], K[1, dim, 1, S], V[1, dim, 1, S]
+// Weight blob layout: Wq[dim,dim] @ offset 64, Wk @ offset 64+cs, Wv @ offset 64+2*cs
+// where cs = 64 + dim*dim*2
+static NSString *mil_gen_qkv(int dim, int spatial) {
+    NSUInteger cs = 64 + (NSUInteger)dim * dim * 2;
+    return [NSString stringWithFormat:
+        @"program(1.3)\n"
+        "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
+        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
+        "{\"coremltools-version\", \"9.0\"}})]\n"
+        "{\n"
+        "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+        "        string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n"
+        "        tensor<int32, [2]> c_strides = const()[name = string(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, [4]> c_pad = const()[name = string(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+        "        tensor<int32, [2]> c_dilations = const()[name = string(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n"
+        "        string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = string(\"Wq\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = string(\"Wk\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = string(\"Wv\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> q16 = conv(dilations = c_dilations, groups = c_groups, "
+        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wq, x = x16)[name = string(\"conv_q\")];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> k16 = conv(dilations = c_dilations, groups = c_groups, "
+        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wk, x = x16)[name = string(\"conv_k\")];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> v16 = conv(dilations = c_dilations, groups = c_groups, "
+        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wv, x = x16)[name = string(\"conv_v\")];\n"
+        "        string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> q = cast(dtype = to_fp32, x = q16)[name = string(\"cast_q\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> k = cast(dtype = to_fp32, x = k16)[name = string(\"cast_k\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> v = cast(dtype = to_fp32, x = v16)[name = string(\"cast_v\")];\n"
+        "    } -> (q, k, v);\n"
+        "}\n",
+        dim, spatial, dim, spatial,
+        dim, dim, dim, dim,
+        dim, dim, dim, dim, (unsigned long)(64 + cs),
+        dim, dim, dim, dim, (unsigned long)(64 + 2*cs),
+        dim, spatial, dim, spatial, dim, spatial,
+        dim, spatial, dim, spatial, dim, spatial];
+}
+
+// Build weight blob for fused QKV (3 weight matrices concatenated)
+static NSData *mil_build_qkv_weight_blob(const float *wq, const float *wk, const float *wv, int dim) {
+    NSUInteger wsize = (NSUInteger)dim * dim * 2;
+    NSUInteger cs = 64 + wsize;
+    NSUInteger total = 64 + 3 * cs;
+    uint8_t *buf = (uint8_t*)calloc(total, 1);
+    buf[0] = 0x01; buf[4] = 0x02;
+    const float *ws[3] = {wq, wk, wv};
+    for (int w = 0; w < 3; w++) {
+        uint8_t *chunk = buf + 64 + w * cs;
+        chunk[0]=0xEF; chunk[1]=0xBE; chunk[2]=0xAD; chunk[3]=0xDE;
+        chunk[4]=0x01;
+        *(uint32_t*)(chunk + 8) = (uint32_t)wsize;
+        *(uint32_t*)(chunk + 16) = (uint32_t)(64 + w * cs + 64); // absolute data offset
+        _Float16 *fp16 = (_Float16*)(chunk + 64);
+        for (NSUInteger i = 0; i < (NSUInteger)dim * dim; i++)
+            fp16[i] = (_Float16)ws[w][i];
+    }
+    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
+}
+
+// Build weight blob for fused FFN up (w1 + w3, both [hidden_dim, dim])
+static NSData *mil_build_ffn_up_weight_blob(const float *w1, const float *w3, int hidden_dim, int dim) {
+    NSUInteger wsize = (NSUInteger)hidden_dim * dim * 2;
+    NSUInteger cs = 64 + wsize;
+    NSUInteger total = 64 + 2 * cs;
+    uint8_t *buf = (uint8_t*)calloc(total, 1);
+    buf[0] = 0x01; buf[4] = 0x02;
+    const float *ws[2] = {w1, w3};
+    for (int w = 0; w < 2; w++) {
+        uint8_t *chunk = buf + 64 + w * cs;
+        chunk[0]=0xEF; chunk[1]=0xBE; chunk[2]=0xAD; chunk[3]=0xDE;
+        chunk[4]=0x01;
+        *(uint32_t*)(chunk + 8) = (uint32_t)wsize;
+        *(uint32_t*)(chunk + 16) = (uint32_t)(64 + w * cs + 64); // absolute data offset
+        _Float16 *fp16 = (_Float16*)(chunk + 64);
+        for (NSUInteger i = 0; i < (NSUInteger)hidden_dim * dim; i++)
+            fp16[i] = (_Float16)ws[w][i];
+    }
+    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
+}
+
+// Generate MIL for fused FFN up: w1 + w3 parallel convs
+static NSString *mil_gen_ffn_up(int dim, int hidden_dim, int spatial) {
+    NSUInteger cs = 64 + (NSUInteger)hidden_dim * dim * 2;
+    return [NSString stringWithFormat:
+        @"program(1.3)\n"
+        "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
+        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
+        "{\"coremltools-version\", \"9.0\"}})]\n"
+        "{\n"
+        "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+        "        string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n"
+        "        tensor<int32, [2]> c_strides = const()[name = string(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, [4]> c_pad = const()[name = string(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+        "        tensor<int32, [2]> c_dilations = const()[name = string(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n"
+        "        string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> W1 = const()[name = string(\"W1\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> W3 = const()[name = string(\"W3\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> h1 = conv(dilations = c_dilations, groups = c_groups, "
+        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W1, x = x16)[name = string(\"conv_w1\")];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> h3 = conv(dilations = c_dilations, groups = c_groups, "
+        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W3, x = x16)[name = string(\"conv_w3\")];\n"
+        "        string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> out1 = cast(dtype = to_fp32, x = h1)[name = string(\"cast_h1\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> out3 = cast(dtype = to_fp32, x = h3)[name = string(\"cast_h3\")];\n"
+        "    } -> (out1, out3);\n"
+        "}\n",
+        dim, spatial, dim, spatial,
+        hidden_dim, dim, hidden_dim, dim,
+        hidden_dim, dim, hidden_dim, dim, (unsigned long)(64 + cs),
+        hidden_dim, spatial, hidden_dim, spatial,
+        hidden_dim, spatial, hidden_dim, spatial];
+}
diff --git a/training/ane_runtime.h b/training/ane_runtime.h
new file mode 100644
index 0000000..585d0f0
--- /dev/null
+++ b/training/ane_runtime.h
@@ -0,0 +1,160 @@
+// ane_runtime.h — Reusable ANE in-memory compile/load/eval wrapper
+// Uses _ANEInMemoryModel via private AppleNeuralEngine.framework
+#pragma once
+#import <Foundation/Foundation.h>
+#import <objc/runtime.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <IOSurface/IOSurface.h>
+
+typedef struct {
+    id model;               // _ANEInMemoryModel
+    IOSurfaceRef *ioInputs;
+    IOSurfaceRef *ioOutputs;
+    id request;             // _ANERequest
+    NSString *tmpDir;
+    int nInputs, nOutputs;
+    size_t *inputBytes;
+    size_t *outputBytes;
+} ANEKernel;
+
+static Class g_ANEDesc, g_ANEInMem, g_ANEReq, g_ANEIO;
+static bool g_ane_loaded = false;
+
+static void ane_init(void) {
+    if (g_ane_loaded) return;
+    dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
+    g_ANEDesc  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
+    g_ANEInMem = NSClassFromString(@"_ANEInMemoryModel");
+    g_ANEReq   = NSClassFromString(@"_ANERequest");
+    g_ANEIO    = NSClassFromString(@"_ANEIOSurfaceObject");
+    g_ane_loaded = true;
+}
+
+static IOSurfaceRef ane_create_surface(size_t bytes) {
+    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
+        (id)kIOSurfaceWidth: @(bytes),
+        (id)kIOSurfaceHeight: @1,
+        (id)kIOSurfaceBytesPerElement: @1,
+        (id)kIOSurfaceBytesPerRow: @(bytes),
+        (id)kIOSurfaceAllocSize: @(bytes),
+        (id)kIOSurfacePixelFormat: @0
+    });
+}
+
+// Compile a MIL graph with weight blob into an ANE kernel.
+// milText: NSData of MIL text
+// weightData: NSData of raw weight blob (can be nil)
+// inputSizes/outputSizes: arrays of byte sizes for each I/O tensor
+static ANEKernel *ane_compile(NSData *milText, NSData *weightData,
+                               int nInputs, size_t *inputSizes,
+                               int nOutputs, size_t *outputSizes) {
+    ane_init();
+    NSError *e = nil;
+
+    NSDictionary *wdict = nil;
+    if (weightData) {
+        wdict = @{@"@model_path/weights/weight.bin": @{@"offset": @0, @"data": weightData}};
+    }
+    id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(
+        g_ANEDesc, @selector(modelWithMILText:weights:optionsPlist:),
+        milText, wdict, nil);
+    if (!desc) return NULL;
+
+    id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(
+        g_ANEInMem, @selector(inMemoryModelWithDescriptor:), desc);
+
+    // Pre-populate temp dir with MIL + weights
+    id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
+    NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
+    NSFileManager *fm = [NSFileManager defaultManager];
+    [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"]
+        withIntermediateDirectories:YES attributes:nil error:nil];
+    [milText writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+    if (weightData)
+        [weightData writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
+
+    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
+            mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
+        fprintf(stderr, "ANE compile failed: %s\n", [[e description] UTF8String]);
+        [fm removeItemAtPath:td error:nil];
+        return NULL;
+    }
+    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
+            mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) {
+        fprintf(stderr, "ANE load failed: %s\n", [[e description] UTF8String]);
+        [fm removeItemAtPath:td error:nil];
+        return NULL;
+    }
+
+    ANEKernel *k = calloc(1, sizeof(ANEKernel));
+    k->model = mdl;
+    k->tmpDir = td;
+    k->nInputs = nInputs;
+    k->nOutputs = nOutputs;
+    k->inputBytes = malloc(nInputs * sizeof(size_t));
+    k->outputBytes = malloc(nOutputs * sizeof(size_t));
+    memcpy(k->inputBytes, inputSizes, nInputs * sizeof(size_t));
+    memcpy(k->outputBytes, outputSizes, nOutputs * sizeof(size_t));
+
+    // Create IOSurfaces
+    k->ioInputs = malloc(nInputs * sizeof(IOSurfaceRef));
+    k->ioOutputs = malloc(nOutputs * sizeof(IOSurfaceRef));
+    for (int i = 0; i < nInputs; i++)
+        k->ioInputs[i] = ane_create_surface(inputSizes[i]);
+    for (int i = 0; i < nOutputs; i++)
+        k->ioOutputs[i] = ane_create_surface(outputSizes[i]);
+
+    // Build request
+    NSMutableArray *wIns = [NSMutableArray arrayWithCapacity:nInputs];
+    NSMutableArray *iIdx = [NSMutableArray arrayWithCapacity:nInputs];
+    for (int i = 0; i < nInputs; i++) {
+        [wIns addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(
+            g_ANEIO, @selector(objectWithIOSurface:), k->ioInputs[i])];
+        [iIdx addObject:@(i)];
+    }
+    NSMutableArray *wOuts = [NSMutableArray arrayWithCapacity:nOutputs];
+    NSMutableArray *oIdx = [NSMutableArray arrayWithCapacity:nOutputs];
+    for (int i = 0; i < nOutputs; i++) {
+        [wOuts addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(
+            g_ANEIO, @selector(objectWithIOSurface:), k->ioOutputs[i])];
+        [oIdx addObject:@(i)];
+    }
+    k->request = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(
+        g_ANEReq, @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+        wIns, iIdx, wOuts, oIdx, nil, nil, @0);
+
+    return k;
+}
+
+static void ane_write_input(ANEKernel *k, int idx, const void *data, size_t bytes) {
+    IOSurfaceLock(k->ioInputs[idx], 0, NULL);
+    memcpy(IOSurfaceGetBaseAddress(k->ioInputs[idx]), data, bytes);
+    IOSurfaceUnlock(k->ioInputs[idx], 0, NULL);
+}
+
+static void ane_read_output(ANEKernel *k, int idx, void *data, size_t bytes) {
+    IOSurfaceLock(k->ioOutputs[idx], kIOSurfaceLockReadOnly, NULL);
+    memcpy(data, IOSurfaceGetBaseAddress(k->ioOutputs[idx]), bytes);
+    IOSurfaceUnlock(k->ioOutputs[idx], kIOSurfaceLockReadOnly, NULL);
+}
+
+static bool ane_eval(ANEKernel *k) {
+    NSError *e = nil;
+    return ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+        k->model, @selector(evaluateWithQoS:options:request:error:),
+        21, @{}, k->request, &e);
+}
+
+static void ane_free(ANEKernel *k) {
+    if (!k) return;
+    NSError *e = nil;
+    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(
+        k->model, @selector(unloadWithQoS:error:), 21, &e);
+    for (int i = 0; i < k->nInputs; i++) CFRelease(k->ioInputs[i]);
+    for (int i = 0; i < k->nOutputs; i++) CFRelease(k->ioOutputs[i]);
+    [[NSFileManager defaultManager] removeItemAtPath:k->tmpDir error:nil];
+    free(k->ioInputs); free(k->ioOutputs);
+    free(k->inputBytes); free(k->outputBytes);
+    free(k);
+}
diff --git a/training/backward.h b/training/backward.h
new file mode 100644
index 0000000..138ea7c
--- /dev/null
+++ b/training/backward.h
@@ -0,0 +1,308 @@
+// backward.h — Backward pass using CPU matmul (correct gradients) + ANE optional
+#pragma once
+#include "model.h"
+#include "forward.h"
+#include <math.h>
+#include <string.h>
+
+// dW += dy @ x^T — dy: [S, out_dim], x: [S, in_dim], dW: [out_dim, in_dim]
+static void cpu_accum_dW(float *dW, const float *dy, const float *x, int S, int out_dim, int in_dim) {
+    for (int t = 0; t < S; t++)
+        for (int i = 0; i < out_dim; i++)
+            for (int j = 0; j < in_dim; j++)
+                dW[i*in_dim+j] += dy[t*out_dim+i] * x[t*in_dim+j];
+}
+
+// dx = W^T @ dy — W: [out_dim, in_dim], dy: [S, out_dim] → dx: [S, in_dim]
+static void cpu_matmul_backward_dx(const float *W, const float *dy, float *dx,
+                                    int S, int out_dim, int in_dim) {
+    for (int t = 0; t < S; t++)
+        for (int j = 0; j < in_dim; j++) {
+            float sum = 0;
+            for (int i = 0; i < out_dim; i++)
+                sum += W[i*in_dim+j] * dy[t*out_dim+i];
+            dx[t*in_dim+j] = sum;
+        }
+}
+
+static void cpu_rmsnorm_backward(float *dx, const float *dy, const float *x, const float *w,
+                                  int S, int D) {
+    for (int t = 0; t < S; t++) {
+        float ss = 0;
+        for (int i = 0; i < D; i++) ss += x[t*D+i] * x[t*D+i];
+        float rms = sqrtf(ss / D + 1e-5f);
+        float inv_rms = 1.0f / rms;
+        float dot = 0;
+        for (int i = 0; i < D; i++)
+            dot += dy[t*D+i] * w[i] * x[t*D+i];
+        dot /= (D * rms * rms);
+        for (int i = 0; i < D; i++)
+            dx[t*D+i] = dy[t*D+i] * w[i] * inv_rms - x[t*D+i] * dot;
+    }
+}
+
+static inline float silu_backward(float x) {
+    float s = 1.0f / (1.0f + expf(-x));
+    return s * (1.0f + x * (1.0f - s));
+}
+
+static void cpu_attention_backward(float *dq, float *dk, float *dv,
+                                    const float *d_out, const float *q, const float *k, const float *v,
+                                    int S, int n_heads, int head_dim) {
+    float scale = 1.0f / sqrtf((float)head_dim);
+    int D = n_heads * head_dim;
+    float *scores = (float*)malloc(S * sizeof(float));
+    float *dscores = (float*)malloc(S * sizeof(float));
+
+    memset(dq, 0, S * D * sizeof(float));
+    memset(dk, 0, S * D * sizeof(float));
+    memset(dv, 0, S * D * sizeof(float));
+
+    for (int h = 0; h < n_heads; h++) {
+        for (int t = 0; t < S; t++) {
+            // Recompute softmax for this row
+            float mx = -1e9f;
+            for (int s = 0; s <= t; s++) {
+                float dot = 0;
+                for (int i = 0; i < head_dim; i++)
+                    dot += q[t*D + h*head_dim + i] * k[s*D + h*head_dim + i];
+                scores[s] = dot * scale;
+                if (scores[s] > mx) mx = scores[s];
+            }
+            float sm = 0;
+            for (int s = 0; s <= t; s++) { scores[s] = expf(scores[s] - mx); sm += scores[s]; }
+            for (int s = 0; s <= t; s++) scores[s] /= sm;
+
+            // dscores = d_out · v
+            float ds_sum = 0;
+            for (int s = 0; s <= t; s++) {
+                float dot = 0;
+                for (int i = 0; i < head_dim; i++)
+                    dot += d_out[t*D + h*head_dim + i] * v[s*D + h*head_dim + i];
+                dscores[s] = dot;
+                ds_sum += scores[s] * dot;
+            }
+
+            // Softmax backward + scale
+            for (int s = 0; s <= t; s++) {
+                float ds = scores[s] * (dscores[s] - ds_sum) * scale;
+                // dq[t] += ds * k[s]
+                for (int i = 0; i < head_dim; i++)
+                    dq[t*D + h*head_dim + i] += ds * k[s*D + h*head_dim + i];
+                // dk[s] += ds * q[t]
+                for (int i = 0; i < head_dim; i++)
+                    dk[s*D + h*head_dim + i] += ds * q[t*D + h*head_dim + i];
+                // dv[s] += scores[t,s] * d_out[t]
+                for (int i = 0; i < head_dim; i++)
+                    dv[s*D + h*head_dim + i] += scores[s] * d_out[t*D + h*head_dim + i];
+            }
+        }
+    }
+    free(scores); free(dscores);
+}
+
+static void cpu_rope_backward(float *dq, float *dk, int S, int n_heads, int head_dim) {
+    for (int t = 0; t < S; t++)
+        for (int h = 0; h < n_heads; h++)
+            for (int i = 0; i < head_dim; i += 2) {
+                float freq = 1.0f / powf(10000.0f, (float)i / head_dim);
+                float val = t * freq;
+                float cos_v = cosf(val), sin_v = sinf(val);
+                int off = t * n_heads * head_dim + h * head_dim + i;
+                float dq0 = dq[off], dq1 = dq[off+1];
+                dq[off]   = dq0 * cos_v + dq1 * sin_v;
+                dq[off+1] = -dq0 * sin_v + dq1 * cos_v;
+                float dk0 = dk[off], dk1 = dk[off+1];
+                dk[off]   = dk0 * cos_v + dk1 * sin_v;
+                dk[off+1] = -dk0 * sin_v + dk1 * cos_v;
+            }
+}
+
+static void model_clip_gradients(Model *m, float max_norm) {
+    int d = m->cfg.dim, hd = m->cfg.hidden_dim, vs = m->cfg.vocab_size;
+    double total_norm_sq = 0;
+    #define ACCUM_NORM(grad, size) do { \
+        for (size_t _i = 0; _i < (size_t)(size); _i++) total_norm_sq += (double)(grad)[_i] * (grad)[_i]; \
+    } while(0)
+    for (int l = 0; l < N_LAYERS; l++) {
+        ACCUM_NORM(m->grad_wq[l], d*d); ACCUM_NORM(m->grad_wk[l], d*d);
+        ACCUM_NORM(m->grad_wv[l], d*d); ACCUM_NORM(m->grad_wo[l], d*d);
+        ACCUM_NORM(m->grad_w1[l], hd*d); ACCUM_NORM(m->grad_w2[l], d*hd);
+        ACCUM_NORM(m->grad_w3[l], hd*d);
+    }
+    ACCUM_NORM(m->grad_wcls, vs*d); ACCUM_NORM(m->grad_emb, vs*d);
+    #undef ACCUM_NORM
+    float total_norm = sqrtf((float)total_norm_sq);
+    if (total_norm > max_norm) {
+        float scale = max_norm / total_norm;
+        #define SCALE_GRAD(grad, size) do { \
+            for (size_t _i = 0; _i < (size_t)(size); _i++) (grad)[_i] *= scale; \
+        } while(0)
+        for (int l = 0; l < N_LAYERS; l++) {
+            SCALE_GRAD(m->grad_wq[l], d*d); SCALE_GRAD(m->grad_wk[l], d*d);
+            SCALE_GRAD(m->grad_wv[l], d*d); SCALE_GRAD(m->grad_wo[l], d*d);
+            SCALE_GRAD(m->grad_w1[l], hd*d); SCALE_GRAD(m->grad_w2[l], d*hd);
+            SCALE_GRAD(m->grad_w3[l], hd*d);
+        }
+        SCALE_GRAD(m->grad_wcls, vs*d); SCALE_GRAD(m->grad_emb, vs*d);
+        #undef SCALE_GRAD
+    }
+}
+
+static void model_backward(Model *m, const int *tokens) {
+    int S = m->seq_len, d = m->cfg.dim, hd = m->cfg.hidden_dim;
+    int nh = m->cfg.n_heads, hdim = HEAD_DIM, vs = m->cfg.vocab_size;
+
+    // Zero gradients
+    for (int l = 0; l < N_LAYERS; l++) {
+        memset(m->grad_wq[l], 0, d*d*sizeof(float));
+        memset(m->grad_wk[l], 0, d*d*sizeof(float));
+        memset(m->grad_wv[l], 0, d*d*sizeof(float));
+        memset(m->grad_wo[l], 0, d*d*sizeof(float));
+        memset(m->grad_w1[l], 0, hd*d*sizeof(float));
+        memset(m->grad_w2[l], 0, d*hd*sizeof(float));
+        memset(m->grad_w3[l], 0, hd*d*sizeof(float));
+    }
+    memset(m->grad_wcls, 0, (size_t)vs*d*sizeof(float));
+    memset(m->grad_emb, 0, (size_t)vs*d*sizeof(float));
+
+    // dLogits from cross-entropy
+    float *dlogits = (float*)calloc(S * vs, sizeof(float));
+    for (int t = 0; t < S - 1; t++) {
+        float mx = -1e9f;
+        for (int i = 0; i < vs; i++) if (m->logits[t*vs+i] > mx) mx = m->logits[t*vs+i];
+        float sm = 0;
+        for (int i = 0; i < vs; i++) sm += expf(m->logits[t*vs+i] - mx);
+        for (int i = 0; i < vs; i++)
+            dlogits[t*vs+i] = expf(m->logits[t*vs+i] - mx) / sm;
+        dlogits[t*vs + tokens[t+1]] -= 1.0f;
+        for (int i = 0; i < vs; i++)
+            dlogits[t*vs+i] /= (S - 1);
+    }
+
+    // Classifier backward
+    cpu_accum_dW(m->grad_wcls, dlogits, m->act_final, S, vs, d);
+    float *dx = (float*)calloc(S * d, sizeof(float));
+    cpu_matmul_backward_dx(m->wcls, dlogits, dx, S, vs, d);
+    free(dlogits);
+
+    // Final RMSNorm backward
+    float *dx_norm = (float*)malloc(S * d * sizeof(float));
+    cpu_rmsnorm_backward(dx_norm, dx, m->act_pre_final, m->rms_final_w, S, d);
+    memcpy(dx, dx_norm, S * d * sizeof(float));
+    free(dx_norm);
+
+    // Layers in reverse
+    for (int l = N_LAYERS - 1; l >= 0; l--) {
+        // FFN down backward
+        float *d_silu = (float*)calloc(S * hd, sizeof(float));
+        cpu_matmul_backward_dx(m->w2[l], dx, d_silu, S, d, hd);
+        cpu_accum_dW(m->grad_w2[l], dx, m->act_silu[l], S, d, hd);
+
+        // SiLU backward
+        float *d_h1 = (float*)malloc(S * hd * sizeof(float));
+        float *d_h3 = (float*)malloc(S * hd * sizeof(float));
+        for (int t = 0; t < S; t++)
+            for (int i = 0; i < hd; i++) {
+                d_h1[t*hd+i] = d_silu[t*hd+i] * m->act_h3[l][t*hd+i] * silu_backward(m->act_h1[l][t*hd+i]);
+                d_h3[t*hd+i] = d_silu[t*hd+i] * silu_f(m->act_h1[l][t*hd+i]);
+            }
+        free(d_silu);
+
+        // FFN up backward
+        cpu_accum_dW(m->grad_w1[l], d_h1, m->act_ffn_in[l], S, hd, d);
+        cpu_accum_dW(m->grad_w3[l], d_h3, m->act_ffn_in[l], S, hd, d);
+
+        float *dx_ffn_in = (float*)calloc(S * d, sizeof(float));
+        float *dx_w1 = (float*)malloc(S * d * sizeof(float));
+        float *dx_w3 = (float*)malloc(S * d * sizeof(float));
+        cpu_matmul_backward_dx(m->w1[l], d_h1, dx_w1, S, hd, d);
+        cpu_matmul_backward_dx(m->w3[l], d_h3, dx_w3, S, hd, d);
+        for (int i = 0; i < S * d; i++) dx_ffn_in[i] = dx_w1[i] + dx_w3[i];
+        free(d_h1); free(d_h3); free(dx_w1); free(dx_w3);
+
+        // RMSNorm FFN backward
+        float *dx_ffn_norm = (float*)malloc(S * d * sizeof(float));
+        // The input to FFN rmsnorm was the residual after attention = act_x[l] + attn_residual
+        // We saved act_x[l] but the actual input to ffn_rmsnorm is x after attention residual
+        // For a proper implementation we'd save this. Approximate with act_x[l].
+        cpu_rmsnorm_backward(dx_ffn_norm, dx_ffn_in, m->act_x[l], m->rms_ffn_w[l], S, d);
+        for (int i = 0; i < S * d; i++) dx[i] += dx_ffn_norm[i];
+        free(dx_ffn_in); free(dx_ffn_norm);
+
+        // O projection backward
+        float *d_attn_out = (float*)calloc(S * d, sizeof(float));
+        cpu_matmul_backward_dx(m->wo[l], dx, d_attn_out, S, d, d);
+        cpu_accum_dW(m->grad_wo[l], dx, m->act_attn_out[l], S, d, d);
+
+        // Attention backward
+        float *dq = (float*)calloc(S * d, sizeof(float));
+        float *dk = (float*)calloc(S * d, sizeof(float));
+        float *dv = (float*)calloc(S * d, sizeof(float));
+        cpu_attention_backward(dq, dk, dv, d_attn_out, m->act_q[l], m->act_k[l], m->act_v[l], S, nh, hdim);
+        free(d_attn_out);
+
+        cpu_rope_backward(dq, dk, S, nh, hdim);
+
+        // QKV backward
+        cpu_accum_dW(m->grad_wq[l], dq, m->act_xnorm[l], S, d, d);
+        cpu_accum_dW(m->grad_wk[l], dk, m->act_xnorm[l], S, d, d);
+        cpu_accum_dW(m->grad_wv[l], dv, m->act_xnorm[l], S, d, d);
+
+        float *dx_qkv = (float*)calloc(S * d, sizeof(float));
+        float *tmp = (float*)malloc(S * d * sizeof(float));
+        cpu_matmul_backward_dx(m->wq[l], dq, tmp, S, d, d);
+        for (int i = 0; i < S*d; i++) dx_qkv[i] += tmp[i];
+        cpu_matmul_backward_dx(m->wk[l], dk, tmp, S, d, d);
+        for (int i = 0; i < S*d; i++) dx_qkv[i] += tmp[i];
+        cpu_matmul_backward_dx(m->wv[l], dv, tmp, S, d, d);
+        for (int i = 0; i < S*d; i++) dx_qkv[i] += tmp[i];
+        free(tmp); free(dq); free(dk); free(dv);
+
+        // RMSNorm attention backward
+        float *dx_att_norm = (float*)malloc(S * d * sizeof(float));
+        cpu_rmsnorm_backward(dx_att_norm, dx_qkv, m->act_x[l], m->rms_att_w[l], S, d);
+        for (int i = 0; i < S * d; i++) dx[i] += dx_att_norm[i];
+        free(dx_qkv); free(dx_att_norm);
+    }
+
+    // Embedding gradient
+    for (int t = 0; t < S; t++)
+        for (int i = 0; i < d; i++)
+            m->grad_emb[tokens[t]*d + i] += dx[t*d + i];
+
+    free(dx);
+}
+
+static void model_adam_step(Model *m, float lr, float beta1, float beta2, float eps) {
+    m->adam_step++;
+    float bc1 = 1.0f - powf(beta1, m->adam_step);
+    float bc2 = 1.0f - powf(beta2, m->adam_step);
+    size_t idx = 0;
+
+    #define ADAM_UPDATE(param, grad, size) do { \
+        for (size_t _i = 0; _i < (size_t)(size); _i++) { \
+            float g = (grad)[_i]; \
+            m->adam_m[idx] = beta1 * m->adam_m[idx] + (1-beta1) * g; \
+            m->adam_v[idx] = beta2 * m->adam_v[idx] + (1-beta2) * g * g; \
+            float m_hat = m->adam_m[idx] / bc1; \
+            float v_hat = m->adam_v[idx] / bc2; \
+            (param)[_i] -= lr * m_hat / (sqrtf(v_hat) + eps); \
+            idx++; \
+        } \
+    } while(0)
+
+    int d = m->cfg.dim, hd = m->cfg.hidden_dim, vs = m->cfg.vocab_size;
+    for (int l = 0; l < N_LAYERS; l++) {
+        ADAM_UPDATE(m->wq[l], m->grad_wq[l], d*d);
+        ADAM_UPDATE(m->wk[l], m->grad_wk[l], d*d);
+        ADAM_UPDATE(m->wv[l], m->grad_wv[l], d*d);
+        ADAM_UPDATE(m->wo[l], m->grad_wo[l], d*d);
+        ADAM_UPDATE(m->w1[l], m->grad_w1[l], hd*d);
+        ADAM_UPDATE(m->w2[l], m->grad_w2[l], d*hd);
+        ADAM_UPDATE(m->w3[l], m->grad_w3[l], hd*d);
+    }
+    ADAM_UPDATE(m->wcls, m->grad_wcls, vs*d);
+    ADAM_UPDATE(m->token_embedding, m->grad_emb, vs*d);
+    #undef ADAM_UPDATE
+}
diff --git a/training/forward.h b/training/forward.h
new file mode 100644
index 0000000..adcf898
--- /dev/null
+++ b/training/forward.h
@@ -0,0 +1,179 @@
+// forward.h — Forward pass: ANE baked-weight conv for linears, CPU for element-wise
+#pragma once
+#include "model.h"
+#include <math.h>
+#include <string.h>
+
+// ANE conv eval: input [S, in_dim] row-major → transpose to [in_dim, S] channels-first
+// ANE computes conv(W, x) with baked W → output [out_dim, S]
+// Transpose back to [S, out_dim] row-major
+static void ane_conv_eval(ANEKernel *kernel, const float *x, float *y,
+                          int S, int in_dim, int out_dim) {
+    float *x_t = (float*)malloc(S * in_dim * sizeof(float));
+    for (int t = 0; t < S; t++)
+        for (int i = 0; i < in_dim; i++)
+            x_t[i*S + t] = x[t*in_dim + i];
+
+    ane_write_input(kernel, 0, x_t, S * in_dim * sizeof(float));
+    ane_eval(kernel);
+
+    float *y_t = (float*)malloc(S * out_dim * sizeof(float));
+    ane_read_output(kernel, 0, y_t, S * out_dim * sizeof(float));
+
+    for (int t = 0; t < S; t++)
+        for (int i = 0; i < out_dim; i++)
+            y[t*out_dim + i] = y_t[i*S + t];
+
+    free(x_t); free(y_t);
+}
+
+// CPU matmul fallback: y = W @ x, W[out_dim, in_dim], x[S, in_dim] → y[S, out_dim]
+static void cpu_matmul(const float *W, const float *x, float *y, int S, int in_dim, int out_dim) {
+    for (int t = 0; t < S; t++)
+        for (int i = 0; i < out_dim; i++) {
+            float sum = 0;
+            for (int j = 0; j < in_dim; j++)
+                sum += W[i*in_dim + j] * x[t*in_dim + j];
+            y[t*out_dim + i] = sum;
+        }
+}
+
+static void cpu_rmsnorm(float *out, const float *x, const float *w, int S, int D) {
+    for (int t = 0; t < S; t++) {
+        float ss = 0;
+        for (int i = 0; i < D; i++) ss += x[t*D+i] * x[t*D+i];
+        ss = 1.0f / sqrtf(ss / D + 1e-5f);
+        for (int i = 0; i < D; i++) out[t*D+i] = x[t*D+i] * ss * w[i];
+    }
+}
+
+static void cpu_rope(float *q, float *k, int S, int n_heads, int head_dim) {
+    for (int t = 0; t < S; t++)
+        for (int h = 0; h < n_heads; h++)
+            for (int i = 0; i < head_dim; i += 2) {
+                float freq = 1.0f / powf(10000.0f, (float)i / head_dim);
+                float val = t * freq;
+                float cos_v = cosf(val), sin_v = sinf(val);
+                int off = t * n_heads * head_dim + h * head_dim + i;
+                float q0 = q[off], q1 = q[off+1];
+                q[off]   = q0 * cos_v - q1 * sin_v;
+                q[off+1] = q0 * sin_v + q1 * cos_v;
+                float k0 = k[off], k1 = k[off+1];
+                k[off]   = k0 * cos_v - k1 * sin_v;
+                k[off+1] = k0 * sin_v + k1 * cos_v;
+            }
+}
+
+static void cpu_attention(float *out, const float *q, const float *k, const float *v,
+                          int S, int n_heads, int head_dim) {
+    float scale = 1.0f / sqrtf((float)head_dim);
+    float *scores = (float*)malloc(S * S * sizeof(float));
+    for (int h = 0; h < n_heads; h++) {
+        int D = n_heads * head_dim;
+        for (int t = 0; t < S; t++) {
+            float mx = -1e9f;
+            for (int s = 0; s <= t; s++) {
+                float dot = 0;
+                for (int i = 0; i < head_dim; i++)
+                    dot += q[t*D + h*head_dim + i] * k[s*D + h*head_dim + i];
+                scores[s] = dot * scale;
+                if (scores[s] > mx) mx = scores[s];
+            }
+            float sm = 0;
+            for (int s = 0; s <= t; s++) { scores[s] = expf(scores[s] - mx); sm += scores[s]; }
+            for (int s = 0; s <= t; s++) scores[s] /= sm;
+            for (int i = 0; i < head_dim; i++) {
+                float val = 0;
+                for (int s = 0; s <= t; s++)
+                    val += scores[s] * v[s*D + h*head_dim + i];
+                out[t*D + h*head_dim + i] = val;
+            }
+        }
+    }
+    free(scores);
+}
+
+static inline float silu_f(float x) { return x / (1.0f + expf(-x)); }
+
+// Forward pass — returns loss. Saves activations for backward.
+static float model_forward(Model *m, const int *tokens, bool use_ane) {
+    int S = m->seq_len, d = m->cfg.dim, hd = m->cfg.hidden_dim;
+    int nh = m->cfg.n_heads, hdim = HEAD_DIM, vs = m->cfg.vocab_size;
+
+    float *x = (float*)malloc(S * d * sizeof(float));
+    for (int t = 0; t < S; t++)
+        memcpy(x + t*d, m->token_embedding + tokens[t]*d, d * sizeof(float));
+
+    for (int l = 0; l < N_LAYERS; l++) {
+        memcpy(m->act_x[l], x, S * d * sizeof(float));
+
+        cpu_rmsnorm(m->act_xnorm[l], x, m->rms_att_w[l], S, d);
+
+        if (use_ane) {
+            ane_conv_eval(m->kern_q[l], m->act_xnorm[l], m->act_q[l], S, d, d);
+            ane_conv_eval(m->kern_k[l], m->act_xnorm[l], m->act_k[l], S, d, d);
+            ane_conv_eval(m->kern_v[l], m->act_xnorm[l], m->act_v[l], S, d, d);
+        } else {
+            cpu_matmul(m->wq[l], m->act_xnorm[l], m->act_q[l], S, d, d);
+            cpu_matmul(m->wk[l], m->act_xnorm[l], m->act_k[l], S, d, d);
+            cpu_matmul(m->wv[l], m->act_xnorm[l], m->act_v[l], S, d, d);
+        }
+
+        cpu_rope(m->act_q[l], m->act_k[l], S, nh, hdim);
+        cpu_attention(m->act_attn_out[l], m->act_q[l], m->act_k[l], m->act_v[l], S, nh, hdim);
+
+        float *o_out = (float*)malloc(S * d * sizeof(float));
+        if (use_ane) {
+            ane_conv_eval(m->kern_o[l], m->act_attn_out[l], o_out, S, d, d);
+        } else {
+            cpu_matmul(m->wo[l], m->act_attn_out[l], o_out, S, d, d);
+        }
+        for (int i = 0; i < S * d; i++) x[i] += o_out[i];
+        free(o_out);
+
+        cpu_rmsnorm(m->act_ffn_in[l], x, m->rms_ffn_w[l], S, d);
+
+        if (use_ane) {
+            ane_conv_eval(m->kern_w1[l], m->act_ffn_in[l], m->act_h1[l], S, d, hd);
+            ane_conv_eval(m->kern_w3[l], m->act_ffn_in[l], m->act_h3[l], S, d, hd);
+        } else {
+            cpu_matmul(m->w1[l], m->act_ffn_in[l], m->act_h1[l], S, d, hd);
+            cpu_matmul(m->w3[l], m->act_ffn_in[l], m->act_h3[l], S, d, hd);
+        }
+
+        for (int t = 0; t < S; t++)
+            for (int i = 0; i < hd; i++)
+                m->act_silu[l][t*hd+i] = silu_f(m->act_h1[l][t*hd+i]) * m->act_h3[l][t*hd+i];
+
+        float *ffn_out = (float*)malloc(S * d * sizeof(float));
+        if (use_ane) {
+            ane_conv_eval(m->kern_w2[l], m->act_silu[l], ffn_out, S, hd, d);
+        } else {
+            cpu_matmul(m->w2[l], m->act_silu[l], ffn_out, S, hd, d);
+        }
+        for (int i = 0; i < S * d; i++) x[i] += ffn_out[i];
+        free(ffn_out);
+    }
+
+    memcpy(m->act_pre_final, x, S * d * sizeof(float));
+    cpu_rmsnorm(m->act_final, x, m->rms_final_w, S, d);
+
+    if (use_ane && m->kern_cls) {
+        ane_conv_eval(m->kern_cls, m->act_final, m->logits, S, d, vs);
+    } else {
+        cpu_matmul(m->wcls, m->act_final, m->logits, S, d, vs);
+    }
+
+    free(x);
+
+    float loss = 0;
+    for (int t = 0; t < S - 1; t++) {
+        float mx = -1e9f;
+        for (int i = 0; i < vs; i++) if (m->logits[t*vs+i] > mx) mx = m->logits[t*vs+i];
+        float sm = 0;
+        for (int i = 0; i < vs; i++) sm += expf(m->logits[t*vs+i] - mx);
+        float log_prob = m->logits[t*vs + tokens[t+1]] - mx - logf(sm);
+        loss -= log_prob;
+    }
+    return loss / (S - 1);
+}
diff --git a/training/model.h b/training/model.h
new file mode 100644
index 0000000..6cee52f
--- /dev/null
+++ b/training/model.h
@@ -0,0 +1,256 @@
+// model.h — Stories110M model struct + weight loading + ANE kernel compilation
+// Training version: baked-weight conv kernels, recompile when weights update
+#pragma once
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "ane_runtime.h"
+#include "ane_mil_gen.h"
+
+#define N_LAYERS 12
+#define DIM 768
+#define HIDDEN_DIM 2048
+#define N_HEADS 12
+#define HEAD_DIM 64
+#define VOCAB_SIZE 32000
+#define MAX_SEQ 1024
+
+typedef struct {
+    int dim, hidden_dim, n_layers, n_heads, n_kv_heads, vocab_size, seq_len;
+} Config;
+
+typedef struct {
+    Config cfg;
+    int seq_len; // training sequence length
+
+    // Raw weights (f32)
+    float *token_embedding;  // [vocab_size, dim]
+    float *rms_att_w[N_LAYERS]; // [dim]
+    float *wq[N_LAYERS];       // [dim, dim]
+    float *wk[N_LAYERS];       // [dim, dim]
+    float *wv[N_LAYERS];       // [dim, dim]
+    float *wo[N_LAYERS];       // [dim, dim]
+    float *rms_ffn_w[N_LAYERS]; // [dim]
+    float *w1[N_LAYERS];       // [hidden_dim, dim]
+    float *w2[N_LAYERS];       // [dim, hidden_dim]
+    float *w3[N_LAYERS];       // [hidden_dim, dim]
+    float *rms_final_w;        // [dim]
+    float *wcls;               // [vocab_size, dim]
+
+    // Per-layer ANE conv kernels (baked weights, recompiled on update)
+    ANEKernel *kern_q[N_LAYERS];   // Q projection: dim→dim
+    ANEKernel *kern_k[N_LAYERS];   // K projection: dim→dim
+    ANEKernel *kern_v[N_LAYERS];   // V projection: dim→dim
+    ANEKernel *kern_o[N_LAYERS];   // O projection: dim→dim
+    ANEKernel *kern_w1[N_LAYERS];  // FFN w1: dim→hidden
+    ANEKernel *kern_w2[N_LAYERS];  // FFN w2: hidden→dim
+    ANEKernel *kern_w3[N_LAYERS];  // FFN w3: dim→hidden
+    ANEKernel *kern_cls;           // Classifier: dim→vocab
+
+    // Gradient accumulators (f32)
+    float *grad_wq[N_LAYERS], *grad_wk[N_LAYERS], *grad_wv[N_LAYERS], *grad_wo[N_LAYERS];
+    float *grad_w1[N_LAYERS], *grad_w2[N_LAYERS], *grad_w3[N_LAYERS];
+    float *grad_wcls;
+    float *grad_emb;
+
+    // Adam optimizer state
+    float *adam_m, *adam_v;
+    int adam_step;
+    size_t total_params;
+
+    // Activation cache for backward
+    float *act_x[N_LAYERS];
+    float *act_xnorm[N_LAYERS];
+    float *act_q[N_LAYERS];
+    float *act_k[N_LAYERS];
+    float *act_v[N_LAYERS];
+    float *act_attn_out[N_LAYERS];
+    float *act_ffn_in[N_LAYERS];
+    float *act_h1[N_LAYERS];
+    float *act_h3[N_LAYERS];
+    float *act_silu[N_LAYERS];
+    float *act_final;
+    float *act_pre_final;
+    float *logits;
+} Model;
+
+static int model_load_weights(Model *m, const char *path) {
+    FILE *f = fopen(path, "rb");
+    if (!f) { fprintf(stderr, "Cannot open %s\n", path); return -1; }
+    fread(&m->cfg, sizeof(Config), 1, f);
+    bool shared = m->cfg.vocab_size > 0;
+    if (m->cfg.vocab_size < 0) m->cfg.vocab_size = -m->cfg.vocab_size;
+
+    printf("Model: dim=%d hidden=%d layers=%d heads=%d vocab=%d seq=%d\n",
+           m->cfg.dim, m->cfg.hidden_dim, m->cfg.n_layers, m->cfg.n_heads,
+           m->cfg.vocab_size, m->cfg.seq_len);
+
+    int d = m->cfg.dim, hd = m->cfg.hidden_dim, nl = m->cfg.n_layers, vs = m->cfg.vocab_size;
+
+    m->token_embedding = (float*)malloc(vs * d * sizeof(float));
+    fread(m->token_embedding, sizeof(float), vs * d, f);
+
+    float *rms_att_all = (float*)malloc(nl * d * sizeof(float));
+    float *wq_all = (float*)malloc(nl * d * d * sizeof(float));
+    float *wk_all = (float*)malloc(nl * d * d * sizeof(float));
+    float *wv_all = (float*)malloc(nl * d * d * sizeof(float));
+    float *wo_all = (float*)malloc(nl * d * d * sizeof(float));
+    float *rms_ffn_all = (float*)malloc(nl * d * sizeof(float));
+    float *w1_all = (float*)malloc(nl * hd * d * sizeof(float));
+    float *w2_all = (float*)malloc(nl * d * hd * sizeof(float));
+    float *w3_all = (float*)malloc(nl * hd * d * sizeof(float));
+
+    fread(rms_att_all, sizeof(float), nl * d, f);
+    fread(wq_all, sizeof(float), nl * d * d, f);
+    fread(wk_all, sizeof(float), nl * d * d, f);
+    fread(wv_all, sizeof(float), nl * d * d, f);
+    fread(wo_all, sizeof(float), nl * d * d, f);
+    fread(rms_ffn_all, sizeof(float), nl * d, f);
+    fread(w1_all, sizeof(float), nl * hd * d, f);
+    fread(w2_all, sizeof(float), nl * d * hd, f);
+    fread(w3_all, sizeof(float), nl * hd * d, f);
+
+    for (int l = 0; l < nl; l++) {
+        m->rms_att_w[l] = (float*)malloc(d * sizeof(float));
+        memcpy(m->rms_att_w[l], rms_att_all + l*d, d * sizeof(float));
+        m->wq[l] = (float*)malloc(d*d*sizeof(float));
+        memcpy(m->wq[l], wq_all + l*d*d, d*d*sizeof(float));
+        m->wk[l] = (float*)malloc(d*d*sizeof(float));
+        memcpy(m->wk[l], wk_all + l*d*d, d*d*sizeof(float));
+        m->wv[l] = (float*)malloc(d*d*sizeof(float));
+        memcpy(m->wv[l], wv_all + l*d*d, d*d*sizeof(float));
+        m->wo[l] = (float*)malloc(d*d*sizeof(float));
+        memcpy(m->wo[l], wo_all + l*d*d, d*d*sizeof(float));
+        m->rms_ffn_w[l] = (float*)malloc(d * sizeof(float));
+        memcpy(m->rms_ffn_w[l], rms_ffn_all + l*d, d * sizeof(float));
+        m->w1[l] = (float*)malloc(hd*d*sizeof(float));
+        memcpy(m->w1[l], w1_all + l*hd*d, hd*d*sizeof(float));
+        m->w2[l] = (float*)malloc(d*hd*sizeof(float));
+        memcpy(m->w2[l], w2_all + l*d*hd, d*hd*sizeof(float));
+        m->w3[l] = (float*)malloc(hd*d*sizeof(float));
+        memcpy(m->w3[l], w3_all + l*hd*d, hd*d*sizeof(float));
+    }
+    free(rms_att_all); free(wq_all); free(wk_all); free(wv_all); free(wo_all);
+    free(rms_ffn_all); free(w1_all); free(w2_all); free(w3_all);
+
+    m->rms_final_w = (float*)malloc(d * sizeof(float));
+    fread(m->rms_final_w, sizeof(float), d, f);
+
+    if (shared) {
+        m->wcls = m->token_embedding;
+    } else {
+        m->wcls = (float*)malloc(vs * d * sizeof(float));
+        fread(m->wcls, sizeof(float), vs * d, f);
+    }
+    fclose(f);
+    return 0;
+}
+
+// Compile a single baked-weight conv kernel
+static ANEKernel *compile_conv_kernel(const float *weights, int in_ch, int out_ch, int spatial) {
+    NSData *wb = mil_build_weight_blob(weights, out_ch, in_ch);
+    NSString *mil = mil_gen_conv(in_ch, out_ch, spatial);
+    size_t inBytes = (size_t)in_ch * spatial * 4;
+    size_t outBytes = (size_t)out_ch * spatial * 4;
+    return ane_compile([mil dataUsingEncoding:NSUTF8StringEncoding], wb, 1, &inBytes, 1, &outBytes);
+}
+
+// Compile all per-layer ANE kernels with current weights
+static int model_compile_kernels(Model *m, int seq_len) {
+    m->seq_len = seq_len;
+    int d = m->cfg.dim, hd = m->cfg.hidden_dim, vs = m->cfg.vocab_size;
+    int S = seq_len;
+    printf("Compiling %d ANE conv kernels (S=%d)...\n", N_LAYERS * 7 + 1, S);
+
+    for (int l = 0; l < N_LAYERS; l++) {
+        m->kern_q[l] = compile_conv_kernel(m->wq[l], d, d, S);
+        m->kern_k[l] = compile_conv_kernel(m->wk[l], d, d, S);
+        m->kern_v[l] = compile_conv_kernel(m->wv[l], d, d, S);
+        m->kern_o[l] = compile_conv_kernel(m->wo[l], d, d, S);
+        m->kern_w1[l] = compile_conv_kernel(m->w1[l], d, hd, S);
+        m->kern_w2[l] = compile_conv_kernel(m->w2[l], hd, d, S);
+        m->kern_w3[l] = compile_conv_kernel(m->w3[l], d, hd, S);
+        if (!m->kern_q[l]) { fprintf(stderr, "L%d kern_q fail\n",l); return -1; }
+        if (!m->kern_k[l]) { fprintf(stderr, "L%d kern_k fail\n",l); return -1; }
+        if (!m->kern_v[l]) { fprintf(stderr, "L%d kern_v fail\n",l); return -1; }
+        if (!m->kern_o[l]) { fprintf(stderr, "L%d kern_o fail\n",l); return -1; }
+        if (!m->kern_w1[l]) { fprintf(stderr, "L%d kern_w1 fail\n",l); return -1; }
+        if (!m->kern_w2[l]) { fprintf(stderr, "L%d kern_w2 fail\n",l); return -1; }
+        if (!m->kern_w3[l]) { fprintf(stderr, "L%d kern_w3 fail\n",l); return -1; }
+        printf("  Layer %d OK\n", l);
+    }
+    m->kern_cls = compile_conv_kernel(m->wcls, d, vs, S);
+    if (!m->kern_cls) {
+        fprintf(stderr, "Classifier kernel compile failed (dim=%d→vocab=%d too large?), using CPU for cls\n", d, vs);
+    }
+    printf("  All kernels compiled (%d conv + %s)\n", N_LAYERS * 7, m->kern_cls ? "cls" : "cls=CPU");
+    return 0;
+}
+
+// Recompile all kernels after weight update — unload all first to avoid ANE model limit
+static int model_recompile_kernels(Model *m) {
+    int d = m->cfg.dim, hd = m->cfg.hidden_dim, vs = m->cfg.vocab_size;
+    int S = m->seq_len;
+    // Phase 1: unload+free all
+    for (int l = 0; l < N_LAYERS; l++) {
+        ane_free(m->kern_q[l]); ane_free(m->kern_k[l]); ane_free(m->kern_v[l]); ane_free(m->kern_o[l]);
+        ane_free(m->kern_w1[l]); ane_free(m->kern_w2[l]); ane_free(m->kern_w3[l]);
+        m->kern_q[l]=m->kern_k[l]=m->kern_v[l]=m->kern_o[l]=NULL;
+        m->kern_w1[l]=m->kern_w2[l]=m->kern_w3[l]=NULL;
+    }
+    if (m->kern_cls) { ane_free(m->kern_cls); m->kern_cls=NULL; }
+    // Phase 2: recompile all
+    for (int l = 0; l < N_LAYERS; l++) {
+        m->kern_q[l] = compile_conv_kernel(m->wq[l], d, d, S);
+        m->kern_k[l] = compile_conv_kernel(m->wk[l], d, d, S);
+        m->kern_v[l] = compile_conv_kernel(m->wv[l], d, d, S);
+        m->kern_o[l] = compile_conv_kernel(m->wo[l], d, d, S);
+        m->kern_w1[l] = compile_conv_kernel(m->w1[l], d, hd, S);
+        m->kern_w2[l] = compile_conv_kernel(m->w2[l], hd, d, S);
+        m->kern_w3[l] = compile_conv_kernel(m->w3[l], d, hd, S);
+        if (!m->kern_q[l] || !m->kern_k[l] || !m->kern_v[l] || !m->kern_o[l] ||
+            !m->kern_w1[l] || !m->kern_w2[l] || !m->kern_w3[l]) return -1;
+    }
+    m->kern_cls = compile_conv_kernel(m->wcls, d, vs, S);
+    // cls may fail for large vocab — that's OK, forward uses CPU fallback
+    return 0;
+}
+
+static void model_alloc_training(Model *m) {
+    int d = m->cfg.dim, hd = m->cfg.hidden_dim, vs = m->cfg.vocab_size, S = m->seq_len;
+    for (int l = 0; l < N_LAYERS; l++) {
+        m->act_x[l] = (float*)calloc(S * d, sizeof(float));
+        m->act_xnorm[l] = (float*)calloc(S * d, sizeof(float));
+        m->act_q[l] = (float*)calloc(S * d, sizeof(float));
+        m->act_k[l] = (float*)calloc(S * d, sizeof(float));
+        m->act_v[l] = (float*)calloc(S * d, sizeof(float));
+        m->act_attn_out[l] = (float*)calloc(S * d, sizeof(float));
+        m->act_ffn_in[l] = (float*)calloc(S * d, sizeof(float));
+        m->act_h1[l] = (float*)calloc(S * hd, sizeof(float));
+        m->act_h3[l] = (float*)calloc(S * hd, sizeof(float));
+        m->act_silu[l] = (float*)calloc(S * hd, sizeof(float));
+
+        m->grad_wq[l] = (float*)calloc(d * d, sizeof(float));
+        m->grad_wk[l] = (float*)calloc(d * d, sizeof(float));
+        m->grad_wv[l] = (float*)calloc(d * d, sizeof(float));
+        m->grad_wo[l] = (float*)calloc(d * d, sizeof(float));
+        m->grad_w1[l] = (float*)calloc(hd * d, sizeof(float));
+        m->grad_w2[l] = (float*)calloc(d * hd, sizeof(float));
+        m->grad_w3[l] = (float*)calloc(hd * d, sizeof(float));
+    }
+    m->act_final = (float*)calloc(S * d, sizeof(float));
+    m->act_pre_final = (float*)calloc(S * d, sizeof(float));
+    m->logits = (float*)calloc(S * vs, sizeof(float));
+    m->grad_wcls = (float*)calloc(vs * d, sizeof(float));
+    m->grad_emb = (float*)calloc(vs * d, sizeof(float));
+
+    m->total_params = 0;
+    for (int l = 0; l < N_LAYERS; l++)
+        m->total_params += 4*(size_t)d*d + 2*(size_t)hd*d + (size_t)d*hd;
+    m->total_params += (size_t)vs * d * 2;
+    m->adam_m = (float*)calloc(m->total_params, sizeof(float));
+    m->adam_v = (float*)calloc(m->total_params, sizeof(float));
+    m->adam_step = 0;
+    printf("Total trainable params: %zu (%.1f M)\n", m->total_params, m->total_params/1e6);
+}
diff --git a/training/test_ane_causal_attn.m b/training/test_ane_causal_attn.m
new file mode 100644
index 0000000..cb9b761
--- /dev/null
+++ b/training/test_ane_causal_attn.m
@@ -0,0 +1,295 @@
+// Decomposed causal attention: Q@K^T on ANE, mask+softmax on CPU, scores@V on ANE
+// This gives us causal masking with ANE acceleration for the matmuls
+#import <Foundation/Foundation.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <IOSurface/IOSurface.h>
+#import <mach/mach_time.h>
+#include <math.h>
+
+#define HEADS 12
+#define HD 64
+#define SEQ 64
+
+static Class g_D, g_I, g_AR, g_AIO;
+static mach_timebase_info_data_t g_tb;
+static void ane_init(void) {
+    dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
+    g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
+    g_I  = NSClassFromString(@"_ANEInMemoryModel");
+    g_AR = NSClassFromString(@"_ANERequest");
+    g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
+}
+static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }
+static IOSurfaceRef make_surface(size_t bytes) {
+    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
+        (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,
+        (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes),
+        (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0});
+}
+
+typedef struct { id model; NSString *td; } Kern;
+
+static Kern compile_mil(NSString *mil) {
+    Kern k = {nil, nil};
+    NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
+    id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, @{}, nil);
+    if (!desc) { printf("desc=NULL\n"); return k; }
+    id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
+    id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
+    NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
+    [[NSFileManager defaultManager] createDirectoryAtPath:td withIntermediateDirectories:YES attributes:nil error:nil];
+    [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+    NSError *e = nil;
+    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
+        printf("compile FAIL: %s\n", e?[[e localizedDescription] UTF8String]:"");
+        [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; return k;
+    }
+    ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
+    k.model = mdl; k.td = td;
+    return k;
+}
+
+static BOOL ane_eval(Kern *k, IOSurfaceRef *ins, int nin, IOSurfaceRef out) {
+    NSMutableArray *inArr = [NSMutableArray array], *inIdx = [NSMutableArray array];
+    for (int i = 0; i < nin; i++) {
+        [inArr addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ins[i])];
+        [inIdx addObject:@(i)];
+    }
+    id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), out);
+    id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+        @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+        inArr, inIdx, @[wO], @[@0], nil, nil, @0);
+    NSError *e = nil;
+    return ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+        k->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
+}
+
+static void cleanup_kern(Kern *k) {
+    if (!k->model) return;
+    NSError *e = nil;
+    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(k->model, @selector(unloadWithQoS:error:), 21, &e);
+    [[NSFileManager defaultManager] removeItemAtPath:k->td error:nil];
+}
+
+int main() {
+    @autoreleasepool {
+        setbuf(stdout, NULL);
+        ane_init();
+        mach_timebase_info(&g_tb);
+
+        // === Approach 1: Non-causal SDPA (baseline) ===
+        printf("=== Non-causal SDPA (baseline) ===\n");
+        NSString *sdpa_mil = [NSString stringWithFormat:
+            @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
+            "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
+            "{\"coremltools-version\", \"9.0\"}})]\n{\n"
+            "    func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
+            "tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v) {\n"
+            "        tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
+            "query = q, key = k, value = v)[name = string(\"sdpa\")];\n"
+            "    } -> (att);\n}\n",
+            HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD];
+        Kern kSDPA = compile_mil(sdpa_mil);
+        printf("SDPA compile: %s\n", kSDPA.model ? "OK" : "FAIL");
+
+        // === Approach 2: Decomposed causal via matmul ops ===
+        // Step 1: Q @ K^T → scores [1, HEADS, SEQ, SEQ]
+        // MIL matmul: matmul(x=Q, y=K, transpose_y=true)
+        // Q shape: [1, HEADS, SEQ, HD], K shape: [1, HEADS, SEQ, HD]
+        // scores = Q @ K^T → [1, HEADS, SEQ, SEQ]
+        printf("\n=== Decomposed causal attention ===\n");
+        NSString *qkt_mil = [NSString stringWithFormat:
+            @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
+            "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
+            "{\"coremltools-version\", \"9.0\"}})]\n{\n"
+            "    func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
+            "tensor<fp16, [1, %d, %d, %d]> k) {\n"
+            "        tensor<fp16, [1, %d, %d, %d]> scores = matmul("
+            "x = q, y = k, transpose_y = true)[name = string(\"qkt\")];\n"
+            "    } -> (scores);\n}\n",
+            HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, SEQ];
+        Kern kQKT = compile_mil(qkt_mil);
+        printf("Q@K^T compile: %s\n", kQKT.model ? "OK" : "FAIL");
+
+        // Step 3: scores_softmax @ V → output [1, HEADS, SEQ, HD]
+        NSString *sv_mil = [NSString stringWithFormat:
+            @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
+            "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
+            "{\"coremltools-version\", \"9.0\"}})]\n{\n"
+            "    func main<ios18>(tensor<fp16, [1, %d, %d, %d]> s, "
+            "tensor<fp16, [1, %d, %d, %d]> v) {\n"
+            "        tensor<fp16, [1, %d, %d, %d]> out = matmul("
+            "x = s, y = v)[name = string(\"sv\")];\n"
+            "    } -> (out);\n}\n",
+            HEADS, SEQ, SEQ, HEADS, SEQ, HD, HEADS, SEQ, HD];
+        Kern kSV = compile_mil(sv_mil);
+        printf("scores@V compile: %s\n", kSV.model ? "OK" : "FAIL");
+
+        if (!kSDPA.model || !kQKT.model || !kSV.model) {
+            printf("Some kernels failed to compile, aborting\n");
+            goto done;
+        }
+
+        // Generate test data
+        srand48(42);
+        int total_qkv = HEADS * SEQ * HD;
+        _Float16 *Q = (_Float16*)malloc(total_qkv * 2);
+        _Float16 *K = (_Float16*)malloc(total_qkv * 2);
+        _Float16 *V = (_Float16*)malloc(total_qkv * 2);
+        for (int i = 0; i < total_qkv; i++) {
+            Q[i] = (_Float16)(0.5f * (2*drand48()-1));
+            K[i] = (_Float16)(0.5f * (2*drand48()-1));
+            V[i] = (_Float16)(0.5f * (2*drand48()-1));
+        }
+
+        // IOSurfaces for Q, K, V
+        size_t qkv_bytes = total_qkv * 2;
+        IOSurfaceRef ioQ = make_surface(qkv_bytes), ioK = make_surface(qkv_bytes), ioV = make_surface(qkv_bytes);
+        IOSurfaceLock(ioQ, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioQ), Q, qkv_bytes); IOSurfaceUnlock(ioQ, 0, NULL);
+        IOSurfaceLock(ioK, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioK), K, qkv_bytes); IOSurfaceUnlock(ioK, 0, NULL);
+        IOSurfaceLock(ioV, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioV), V, qkv_bytes); IOSurfaceUnlock(ioV, 0, NULL);
+
+        // Scores IOSurface: [1, HEADS, SEQ, SEQ]
+        int total_scores = HEADS * SEQ * SEQ;
+        size_t scores_bytes = total_scores * 2;
+        IOSurfaceRef ioScores = make_surface(scores_bytes);
+        IOSurfaceRef ioOut_sdpa = make_surface(qkv_bytes);
+        IOSurfaceRef ioOut_decomp = make_surface(qkv_bytes);
+
+        // === Run non-causal SDPA ===
+        {
+            IOSurfaceRef ins[] = {ioQ, ioK, ioV};
+            if (!ane_eval(&kSDPA, ins, 3, ioOut_sdpa)) { printf("SDPA eval FAIL\n"); goto done; }
+        }
+
+        // === Run decomposed causal ===
+        // Step 1: Q@K^T on ANE
+        {
+            IOSurfaceRef ins[] = {ioQ, ioK};
+            if (!ane_eval(&kQKT, ins, 2, ioScores)) { printf("Q@K^T eval FAIL\n"); goto done; }
+        }
+
+        // Step 2: Scale + causal mask + softmax on CPU
+        {
+            IOSurfaceLock(ioScores, 0, NULL);
+            _Float16 *scores = (_Float16*)IOSurfaceGetBaseAddress(ioScores);
+            float scale = 1.0f / sqrtf((float)HD);
+            for (int h = 0; h < HEADS; h++) {
+                for (int t = 0; t < SEQ; t++) {
+                    // Apply scale, causal mask, and softmax
+                    float row[SEQ], maxs = -1e30f;
+                    for (int t2 = 0; t2 < SEQ; t2++) {
+                        float s = (float)scores[h*SEQ*SEQ + t*SEQ + t2] * scale;
+                        if (t2 > t) s = -1e30f;  // causal mask
+                        row[t2] = s;
+                        if (s > maxs) maxs = s;
+                    }
+                    float sum = 0;
+                    for (int t2 = 0; t2 < SEQ; t2++) { row[t2] = expf(row[t2] - maxs); sum += row[t2]; }
+                    for (int t2 = 0; t2 < SEQ; t2++)
+                        scores[h*SEQ*SEQ + t*SEQ + t2] = (_Float16)(row[t2] / sum);
+                }
+            }
+            IOSurfaceUnlock(ioScores, 0, NULL);
+        }
+
+        // Step 3: softmax_scores @ V on ANE
+        {
+            IOSurfaceRef ins[] = {ioScores, ioV};
+            if (!ane_eval(&kSV, ins, 2, ioOut_decomp)) { printf("scores@V eval FAIL\n"); goto done; }
+        }
+
+        // === Verify decomposed causal ===
+        {
+            float scale = 1.0f / sqrtf((float)HD);
+            IOSurfaceLock(ioOut_decomp, kIOSurfaceLockReadOnly, NULL);
+            _Float16 *out = (_Float16*)IOSurfaceGetBaseAddress(ioOut_decomp);
+            float maxdiff = 0;
+            for (int h = 0; h < HEADS; h++)
+                for (int t = 0; t < SEQ; t++) {
+                    float scores[SEQ], maxs = -1e30f;
+                    for (int t2 = 0; t2 <= t; t2++) {
+                        float s = 0;
+                        for (int d = 0; d < HD; d++) s += (float)Q[h*SEQ*HD+t*HD+d]*(float)K[h*SEQ*HD+t2*HD+d];
+                        s *= scale; scores[t2] = s; if(s>maxs) maxs=s;
+                    }
+                    float sum = 0;
+                    for (int t2 = 0; t2 <= t; t2++) { scores[t2]=expf(scores[t2]-maxs); sum+=scores[t2]; }
+                    for (int t2 = 0; t2 <= t; t2++) scores[t2]/=sum;
+                    for (int d = 0; d < HD; d++) {
+                        float ref = 0;
+                        for (int t2 = 0; t2 <= t; t2++) ref += scores[t2]*(float)V[h*SEQ*HD+t2*HD+d];
+                        float diff = fabsf((float)out[h*SEQ*HD+t*HD+d] - ref);
+                        if(diff>maxdiff) maxdiff=diff;
+                    }
+                }
+            IOSurfaceUnlock(ioOut_decomp, kIOSurfaceLockReadOnly, NULL);
+            printf("\nDecomposed causal max diff vs CPU ref: %.6f\n", maxdiff);
+        }
+
+        // === Benchmark: SDPA vs decomposed ===
+        printf("\n=== Benchmarks ===\n");
+        int N = 500;
+        {
+            IOSurfaceRef ins[] = {ioQ, ioK, ioV};
+            // Warmup
+            for (int i = 0; i < 10; i++) ane_eval(&kSDPA, ins, 3, ioOut_sdpa);
+            uint64_t t0 = mach_absolute_time();
+            for (int i = 0; i < N; i++) ane_eval(&kSDPA, ins, 3, ioOut_sdpa);
+            double ms = tb_ms(mach_absolute_time() - t0);
+            double flops = 4.0 * HEADS * SEQ * SEQ * HD;
+            printf("SDPA (non-causal): %.3f ms/eval, %.1f GFLOPS\n", ms/N, N*flops/ms/1e6);
+        }
+        {
+            // Decomposed: QKT + CPU softmax + SV
+            // Warmup
+            for (int i = 0; i < 10; i++) {
+                IOSurfaceRef ins1[] = {ioQ, ioK};
+                ane_eval(&kQKT, ins1, 2, ioScores);
+                // Skip CPU softmax in benchmark for ANE-only timing
+                IOSurfaceRef ins2[] = {ioScores, ioV};
+                ane_eval(&kSV, ins2, 2, ioOut_decomp);
+            }
+            uint64_t t0 = mach_absolute_time();
+            for (int i = 0; i < N; i++) {
+                IOSurfaceRef ins1[] = {ioQ, ioK};
+                ane_eval(&kQKT, ins1, 2, ioScores);
+                // CPU softmax + causal mask
+                IOSurfaceLock(ioScores, 0, NULL);
+                _Float16 *sc = (_Float16*)IOSurfaceGetBaseAddress(ioScores);
+                float scale = 1.0f / sqrtf((float)HD);
+                for (int h = 0; h < HEADS; h++)
+                    for (int t = 0; t < SEQ; t++) {
+                        float row[SEQ], maxs = -1e30f;
+                        for (int t2 = 0; t2 < SEQ; t2++) {
+                            float s = (float)sc[h*SEQ*SEQ+t*SEQ+t2] * scale;
+                            if (t2 > t) s = -1e30f;
+                            row[t2] = s; if(s>maxs) maxs=s;
+                        }
+                        float sum = 0;
+                        for (int t2 = 0; t2 < SEQ; t2++) { row[t2]=expf(row[t2]-maxs); sum+=row[t2]; }
+                        for (int t2 = 0; t2 < SEQ; t2++)
+                            sc[h*SEQ*SEQ+t*SEQ+t2] = (_Float16)(row[t2]/sum);
+                    }
+                IOSurfaceUnlock(ioScores, 0, NULL);
+                IOSurfaceRef ins2[] = {ioScores, ioV};
+                ane_eval(&kSV, ins2, 2, ioOut_decomp);
+            }
+            double ms = tb_ms(mach_absolute_time() - t0);
+            double flops = 4.0 * HEADS * SEQ * SEQ * HD;
+            printf("Decomposed causal: %.3f ms/eval, %.1f GFLOPS\n", ms/N, N*flops/ms/1e6);
+        }
+
+        CFRelease(ioQ); CFRelease(ioK); CFRelease(ioV);
+        CFRelease(ioScores); CFRelease(ioOut_sdpa); CFRelease(ioOut_decomp);
+        free(Q); free(K); free(V);
+
+        done:
+        cleanup_kern(&kSDPA);
+        cleanup_kern(&kQKT);
+        cleanup_kern(&kSV);
+        printf("\nDONE\n");
+    }
+    return 0;
+}
diff --git a/training/test_ane_sdpa5.m b/training/test_ane_sdpa5.m
new file mode 100644
index 0000000..0ddce84
--- /dev/null
+++ b/training/test_ane_sdpa5.m
@@ -0,0 +1,297 @@
+// Debug: why causal mask doesn't apply. Try different approaches.
+#import <Foundation/Foundation.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <IOSurface/IOSurface.h>
+#include <math.h>
+
+#define HEADS 12
+#define HD 64
+#define SEQ 8  // small for readable output
+
+static Class g_D, g_I, g_AR, g_AIO;
+static void ane_init(void) {
+    dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
+    g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
+    g_I  = NSClassFromString(@"_ANEInMemoryModel");
+    g_AR = NSClassFromString(@"_ANERequest");
+    g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
+}
+static IOSurfaceRef make_surface(size_t bytes) {
+    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
+        (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,
+        (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes),
+        (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0});
+}
+
+// Build inline mask string for MIL: tensor<fp16, [1,1,S,S]>([v00, v01, ...])
+static NSString *build_inline_causal_mask(int s) {
+    NSMutableString *vals = [NSMutableString string];
+    for (int t = 0; t < s; t++) {
+        for (int t2 = 0; t2 < s; t2++) {
+            if (t > 0 || t2 > 0) [vals appendString:@", "];
+            [vals appendString:(t2 <= t) ? @"0" : @"-65504"];  // fp16 -inf
+        }
+    }
+    return [NSString stringWithFormat:
+        @"tensor<fp16, [1, 1, %d, %d]>([%@])", s, s, vals];
+}
+
+static NSData *build_mask_blob(int seq) {
+    int wsize = seq * seq * 2;
+    int total = 128 + wsize;
+    uint8_t *buf = (uint8_t*)calloc(total, 1);
+    buf[0]=1; buf[4]=2; buf[64]=0xEF; buf[65]=0xBE; buf[66]=0xAD; buf[67]=0xDE; buf[68]=1;
+    *(uint32_t*)(buf+72)=wsize; *(uint32_t*)(buf+80)=128;
+    _Float16 *fp16 = (_Float16*)(buf+128);
+    for (int t = 0; t < seq; t++)
+        for (int t2 = 0; t2 < seq; t2++)
+            fp16[t*seq + t2] = (t2 <= t) ? (_Float16)0.0f : (_Float16)(-65504.0f);
+    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
+}
+
+typedef struct { id model; NSString *td; } Model;
+
+static Model compile_model(NSString *mil, NSDictionary *wd) {
+    Model m = {nil, nil};
+    NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
+    id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, wd ?: @{}, nil);
+    if (!desc) { printf("  desc=NULL\n"); return m; }
+    id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
+    id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
+    NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
+    [[NSFileManager defaultManager] createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];
+    [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+    for (NSString *path in wd) {
+        [wd[path][@"data"] writeToFile:[td stringByAppendingPathComponent:[path stringByReplacingOccurrencesOfString:@"@model_path/" withString:@""]] atomically:YES];
+    }
+    NSError *e = nil;
+    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
+        printf("  compile FAIL: %s\n", e?[[[e localizedDescription] substringToIndex:MIN(300,(int)[[e localizedDescription] length])] UTF8String]:"");
+        [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; return m;
+    }
+    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) {
+        printf("  load FAIL\n"); [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; return m;
+    }
+    m.model = mdl; m.td = td;
+    return m;
+}
+
+static void cleanup_model(Model *m) {
+    if (!m->model) return;
+    NSError *e = nil;
+    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(m->model, @selector(unloadWithQoS:error:), 21, &e);
+    [[NSFileManager defaultManager] removeItemAtPath:m->td error:nil];
+}
+
+int main() {
+    @autoreleasepool {
+        setbuf(stdout, NULL);
+        ane_init();
+
+        srand48(42);
+        int total = HEADS * SEQ * HD;
+        _Float16 *Q = (_Float16*)malloc(total * 2);
+        _Float16 *K = (_Float16*)malloc(total * 2);
+        _Float16 *V = (_Float16*)malloc(total * 2);
+        for (int i = 0; i < total; i++) {
+            Q[i] = (_Float16)(0.5f * (2*drand48()-1));
+            K[i] = (_Float16)(0.5f * (2*drand48()-1));
+            V[i] = (_Float16)(0.5f * (2*drand48()-1));
+        }
+
+        size_t bytes = total * 2;
+        IOSurfaceRef ioQ = make_surface(bytes), ioK = make_surface(bytes);
+        IOSurfaceRef ioV = make_surface(bytes);
+        IOSurfaceLock(ioQ, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioQ), Q, bytes); IOSurfaceUnlock(ioQ, 0, NULL);
+        IOSurfaceLock(ioK, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioK), K, bytes); IOSurfaceUnlock(ioK, 0, NULL);
+        IOSurfaceLock(ioV, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioV), V, bytes); IOSurfaceUnlock(ioV, 0, NULL);
+        id wQ = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioQ);
+        id wK = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioK);
+        id wV = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioV);
+
+        // CPU references
+        float scale = 1.0f / sqrtf((float)HD);
+        float *cpu_causal = (float*)calloc(total, sizeof(float));
+        float *cpu_nocausal = (float*)calloc(total, sizeof(float));
+        for (int h = 0; h < HEADS; h++)
+            for (int t = 0; t < SEQ; t++) {
+                // Causal
+                float scores[SEQ], maxs = -1e30f;
+                for (int t2 = 0; t2 <= t; t2++) {
+                    float s = 0;
+                    for (int d = 0; d < HD; d++) s += (float)Q[h*SEQ*HD+t*HD+d]*(float)K[h*SEQ*HD+t2*HD+d];
+                    s *= scale; scores[t2] = s; if(s>maxs) maxs=s;
+                }
+                float sum = 0;
+                for (int t2 = 0; t2 <= t; t2++) { scores[t2]=expf(scores[t2]-maxs); sum+=scores[t2]; }
+                for (int t2 = 0; t2 <= t; t2++) scores[t2]/=sum;
+                for (int d = 0; d < HD; d++) {
+                    float r = 0;
+                    for (int t2 = 0; t2 <= t; t2++) r += scores[t2]*(float)V[h*SEQ*HD+t2*HD+d];
+                    cpu_causal[h*SEQ*HD+t*HD+d] = r;
+                }
+                // Non-causal
+                maxs = -1e30f;
+                for (int t2 = 0; t2 < SEQ; t2++) {
+                    float s = 0;
+                    for (int d = 0; d < HD; d++) s += (float)Q[h*SEQ*HD+t*HD+d]*(float)K[h*SEQ*HD+t2*HD+d];
+                    s *= scale; scores[t2] = s; if(s>maxs) maxs=s;
+                }
+                sum = 0;
+                for (int t2 = 0; t2 < SEQ; t2++) { scores[t2]=expf(scores[t2]-maxs); sum+=scores[t2]; }
+                for (int t2 = 0; t2 < SEQ; t2++) scores[t2]/=sum;
+                for (int d = 0; d < HD; d++) {
+                    float r = 0;
+                    for (int t2 = 0; t2 < SEQ; t2++) r += scores[t2]*(float)V[h*SEQ*HD+t2*HD+d];
+                    cpu_nocausal[h*SEQ*HD+t*HD+d] = r;
+                }
+            }
+
+        // Helper: eval and compare
+        void (^eval_and_compare)(const char*, Model*, int nInputs, IOSurfaceRef*) =
+            ^(const char *label, Model *m, int nInputs, IOSurfaceRef *inputs) {
+            IOSurfaceRef ioO = make_surface(bytes);
+            id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO);
+            NSMutableArray *inArr = [NSMutableArray array];
+            NSMutableArray *inIdx = [NSMutableArray array];
+            for (int i = 0; i < nInputs; i++) {
+                [inArr addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), inputs[i])];
+                [inIdx addObject:@(i)];
+            }
+            id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+                @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+                inArr, inIdx, @[wO], @[@0], nil, nil, @0);
+            NSError *e = nil;
+            BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+                m->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
+            if (!ok) {
+                printf("  %s: eval FAIL: %s\n", label, e?[[[e localizedDescription] substringToIndex:MIN(200,(int)[[e localizedDescription] length])] UTF8String]:"");
+                CFRelease(ioO); return;
+            }
+            IOSurfaceLock(ioO, kIOSurfaceLockReadOnly, NULL);
+            _Float16 *out = (_Float16*)IOSurfaceGetBaseAddress(ioO);
+            float dc=0, dnc=0;
+            for (int i = 0; i < total; i++) {
+                float v = (float)out[i];
+                float d1 = fabsf(v - cpu_causal[i]); if(d1>dc) dc=d1;
+                float d2 = fabsf(v - cpu_nocausal[i]); if(d2>dnc) dnc=d2;
+            }
+            IOSurfaceUnlock(ioO, kIOSurfaceLockReadOnly, NULL);
+            printf("  %s: diff_causal=%.6f diff_nocausal=%.6f → %s\n", label, dc, dnc,
+                   dc < dnc ? "CAUSAL" : (dc > dnc ? "NON-CAUSAL" : "SAME"));
+            CFRelease(ioO);
+        };
+
+        // === Test 1: No mask (should be non-causal) ===
+        printf("Test 1: no mask\n");
+        {
+            NSString *mil = [NSString stringWithFormat:
+                @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
+                "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
+                "{\"coremltools-version\", \"9.0\"}})]\n{\n"
+                "    func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
+                "tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v) {\n"
+                "        tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
+                "query = q, key = k, value = v)[name = string(\"sdpa\")];\n"
+                "    } -> (att);\n}\n",
+                HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD];
+            Model m = compile_model(mil, nil);
+            if (m.model) {
+                IOSurfaceRef ins[] = {ioQ, ioK, ioV};
+                eval_and_compare("no-mask", &m, 3, ins);
+                cleanup_model(&m);
+            }
+        }
+
+        // === Test 2: Inline causal mask ===
+        printf("\nTest 2: inline causal mask\n");
+        {
+            NSString *maskStr = build_inline_causal_mask(SEQ);
+            NSString *mil = [NSString stringWithFormat:
+                @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
+                "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
+                "{\"coremltools-version\", \"9.0\"}})]\n{\n"
+                "    func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
+                "tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v) {\n"
+                "        %@ mask = const()[name = string(\"mask\"), val = %@];\n"
+                "        tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
+                "query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n"
+                "    } -> (att);\n}\n",
+                HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD,
+                [NSString stringWithFormat:@"tensor<fp16, [1, 1, %d, %d]>", SEQ, SEQ], maskStr,
+                HEADS, SEQ, HD];
+            Model m = compile_model(mil, nil);
+            if (m.model) {
+                IOSurfaceRef ins[] = {ioQ, ioK, ioV};
+                eval_and_compare("inline-mask", &m, 3, ins);
+                cleanup_model(&m);
+            }
+        }
+
+        // === Test 3: BLOBFILE mask ===
+        printf("\nTest 3: BLOBFILE causal mask\n");
+        {
+            NSString *mil = [NSString stringWithFormat:
+                @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
+                "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
+                "{\"coremltools-version\", \"9.0\"}})]\n{\n"
+                "    func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
+                "tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v) {\n"
+                "        tensor<fp16, [1, 1, %d, %d]> mask = const()[name = string(\"mask\"), "
+                "val = tensor<fp16, [1, 1, %d, %d]>(BLOBFILE(path = string(\"@model_path/weights/mask.bin\"), offset = uint64(64)))];\n"
+                "        tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
+                "query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n"
+                "    } -> (att);\n}\n",
+                HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD,
+                SEQ, SEQ, SEQ, SEQ, HEADS, SEQ, HD];
+            NSDictionary *wd = @{@"@model_path/weights/mask.bin": @{@"offset":@0, @"data":build_mask_blob(SEQ)}};
+            Model m = compile_model(mil, wd);
+            if (m.model) {
+                IOSurfaceRef ins[] = {ioQ, ioK, ioV};
+                eval_and_compare("blob-mask", &m, 3, ins);
+                cleanup_model(&m);
+            }
+        }
+
+        // === Test 4: mask as runtime input ===
+        printf("\nTest 4: mask as runtime input\n");
+        {
+            NSString *mil = [NSString stringWithFormat:
+                @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
+                "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
+                "{\"coremltools-version\", \"9.0\"}})]\n{\n"
+                "    func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
+                "tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v, "
+                "tensor<fp16, [1, 1, %d, %d]> mask) {\n"
+                "        tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
+                "query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n"
+                "    } -> (att);\n}\n",
+                HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD,
+                SEQ, SEQ, HEADS, SEQ, HD];
+            Model m = compile_model(mil, nil);
+            if (m.model) {
+                // Create mask IOSurface
+                size_t mbytes = SEQ * SEQ * 2;
+                IOSurfaceRef ioM = make_surface(mbytes);
+                IOSurfaceLock(ioM, 0, NULL);
+                _Float16 *mp = (_Float16*)IOSurfaceGetBaseAddress(ioM);
+                for (int t = 0; t < SEQ; t++)
+                    for (int t2 = 0; t2 < SEQ; t2++)
+                        mp[t*SEQ+t2] = (t2 <= t) ? (_Float16)0.0f : (_Float16)(-65504.0f);
+                IOSurfaceUnlock(ioM, 0, NULL);
+
+                IOSurfaceRef ins[] = {ioQ, ioK, ioV, ioM};
+                eval_and_compare("runtime-mask", &m, 4, ins);
+                CFRelease(ioM);
+                cleanup_model(&m);
+            }
+        }
+
+        CFRelease(ioQ); CFRelease(ioK); CFRelease(ioV);
+        free(Q); free(K); free(V);
+        free(cpu_causal); free(cpu_nocausal);
+        printf("\nDONE\n");
+    }
+    return 0;
+}
diff --git a/training/test_conv_attn3.m b/training/test_conv_attn3.m
new file mode 100644
index 0000000..a396b4d
--- /dev/null
+++ b/training/test_conv_attn3.m
@@ -0,0 +1,276 @@
+// Grouped conv causal attention with CORRECT layout A: blob[oc*ICg + ic]
+#import <Foundation/Foundation.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <IOSurface/IOSurface.h>
+#import <mach/mach_time.h>
+#include <math.h>
+
+#define HEADS 12
+#define HD 64
+#define DIM (HEADS*HD)
+#define SEQ 64
+
+static Class g_D, g_I, g_AR, g_AIO;
+static mach_timebase_info_data_t g_tb;
+static void ane_init(void) {
+    dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
+    g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
+    g_I  = NSClassFromString(@"_ANEInMemoryModel");
+    g_AR = NSClassFromString(@"_ANERequest");
+    g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
+}
+static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }
+static IOSurfaceRef make_surface(size_t bytes) {
+    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
+        (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,
+        (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes),
+        (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0});
+}
+static NSData *build_blob_raw(_Float16 *data, int count) {
+    int wsize = count * 2, total = 128 + wsize;
+    uint8_t *buf = (uint8_t*)calloc(total, 1);
+    buf[0]=1; buf[4]=2; buf[64]=0xEF; buf[65]=0xBE; buf[66]=0xAD; buf[67]=0xDE; buf[68]=1;
+    *(uint32_t*)(buf+72)=wsize; *(uint32_t*)(buf+80)=128;
+    memcpy(buf+128, data, wsize);
+    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
+}
+typedef struct { id model; NSString *td; } Kern;
+static Kern compile_mil(NSString *mil, NSDictionary *wd) {
+    Kern k = {nil, nil};
+    NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
+    id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, wd ?: @{}, nil);
+    if (!desc) { printf("desc=NULL\n"); return k; }
+    id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
+    id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
+    NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
+    [[NSFileManager defaultManager] createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"]
+        withIntermediateDirectories:YES attributes:nil error:nil];
+    [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+    for (NSString *path in wd) {
+        [wd[path][@"data"] writeToFile:[td stringByAppendingPathComponent:
+            [path stringByReplacingOccurrencesOfString:@"@model_path/" withString:@""]] atomically:YES];
+    }
+    NSError *e = nil;
+    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
+        printf("compile FAIL: %s\n", e?[[e localizedDescription] UTF8String]:""); return k;
+    }
+    ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
+    k.model = mdl; k.td = td;
+    return k;
+}
+static BOOL ane_eval(Kern *k, IOSurfaceRef *ins, int nin, IOSurfaceRef out) {
+    NSMutableArray *inArr = [NSMutableArray array], *inIdx = [NSMutableArray array];
+    for (int i = 0; i < nin; i++) {
+        [inArr addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ins[i])];
+        [inIdx addObject:@(i)];
+    }
+    id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), out);
+    id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+        @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+        inArr, inIdx, @[wO], @[@0], nil, nil, @0);
+    NSError *e = nil;
+    return ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+        k->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
+}
+static void cleanup_kern(Kern *k) {
+    if (!k->model) return;
+    NSError *e = nil;
+    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(k->model, @selector(unloadWithQoS:error:), 21, &e);
+    [[NSFileManager defaultManager] removeItemAtPath:k->td error:nil];
+}
+
+static NSString *gen_conv_mil(int ic, int oc, int icg, int groups, int sp) {
+    return [NSString stringWithFormat:
+        @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
+        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
+        "{\"coremltools-version\", \"9.0\"}})]\n{\n"
+        "    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w.bin\"), offset = uint64(64)))];\n"
+        "        string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
+        "        tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+        "        tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        int32 gr = const()[name = string(\"gr\"), val = int32(%d)];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> y = conv(dilations = dl, groups = gr, pad = pd, "
+        "pad_type = pt, strides = st, weight = W, x = x)[name = string(\"cv\")];\n"
+        "    } -> (y);\n}\n", ic, sp, oc, icg, oc, icg, groups, oc, sp];
+}
+
+int main() {
+    @autoreleasepool {
+        setbuf(stdout, NULL);
+        ane_init();
+        mach_timebase_info(&g_tb);
+
+        printf("=== Grouped Conv Causal Attention (layout A) ===\n");
+        printf("HEADS=%d HD=%d SEQ=%d\n\n", HEADS, HD, SEQ);
+
+        srand48(42);
+        float *Q = (float*)malloc(SEQ*DIM*sizeof(float));
+        float *K = (float*)malloc(SEQ*DIM*sizeof(float));
+        float *V = (float*)malloc(SEQ*DIM*sizeof(float));
+        for (int i = 0; i < SEQ*DIM; i++) {
+            Q[i] = 0.5f*(2*drand48()-1);
+            K[i] = 0.5f*(2*drand48()-1);
+            V[i] = 0.5f*(2*drand48()-1);
+        }
+
+        // Q@K^T grouped conv weight: [HEADS*SEQ, HD, 1, 1] with groups=HEADS
+        // Layout A: blob[oc * ICg + ic] where ICg = HD
+        // For head h: oc = h*SEQ+t2, ic = d (within group)
+        // We want: output[h*SEQ+t2, t] = sum_d Q[h*HD+d, t] * K_weight[h*SEQ+t2, d]
+        // So K_weight[oc, ic] = K[t2, h*HD+d] where oc=h*SEQ+t2, ic=d
+        int kw_count = HEADS * SEQ * HD;
+        _Float16 *kw = (_Float16*)malloc(kw_count * sizeof(_Float16));
+        for (int h = 0; h < HEADS; h++)
+            for (int t2 = 0; t2 < SEQ; t2++)
+                for (int d = 0; d < HD; d++) {
+                    int oc = h*SEQ + t2;
+                    kw[oc*HD + d] = (_Float16)K[t2*DIM + h*HD + d];
+                }
+        NSDictionary *qkt_wd = @{@"@model_path/weights/w.bin": @{@"offset":@0, @"data":build_blob_raw(kw, kw_count)}};
+        free(kw);
+
+        // scores@V grouped conv weight: [HEADS*HD, SEQ, 1, 1] with groups=HEADS
+        // oc = h*HD+d, ic = t2 (within group, ICg=SEQ)
+        // V_weight[oc, ic] = V[t2, h*HD+d]
+        int vw_count = HEADS * HD * SEQ;
+        _Float16 *vw = (_Float16*)malloc(vw_count * sizeof(_Float16));
+        for (int h = 0; h < HEADS; h++)
+            for (int d = 0; d < HD; d++)
+                for (int t2 = 0; t2 < SEQ; t2++) {
+                    int oc = h*HD + d;
+                    vw[oc*SEQ + t2] = (_Float16)V[t2*DIM + h*HD + d];
+                }
+        NSDictionary *sv_wd = @{@"@model_path/weights/w.bin": @{@"offset":@0, @"data":build_blob_raw(vw, vw_count)}};
+        free(vw);
+
+        // Compile
+        printf("Compiling Q@K^T (grouped conv, groups=%d)...\n", HEADS);
+        NSString *qkt_mil = gen_conv_mil(HEADS*HD, HEADS*SEQ, HD, HEADS, SEQ);
+        Kern kQKT = compile_mil(qkt_mil, qkt_wd);
+        printf("  %s\n", kQKT.model ? "OK" : "FAIL");
+
+        printf("Compiling scores@V (grouped conv, groups=%d)...\n", HEADS);
+        NSString *sv_mil = gen_conv_mil(HEADS*SEQ, HEADS*HD, SEQ, HEADS, SEQ);
+        Kern kSV = compile_mil(sv_mil, sv_wd);
+        printf("  %s\n", kSV.model ? "OK" : "FAIL");
+
+        if (!kQKT.model || !kSV.model) { printf("FAIL\n"); goto done; }
+
+        // Prepare Q IOSurface [1, DIM, 1, SEQ] fp16
+        size_t q_bytes = DIM * SEQ * 2;
+        IOSurfaceRef ioQ = make_surface(q_bytes);
+        IOSurfaceLock(ioQ, 0, NULL);
+        _Float16 *qp = (_Float16*)IOSurfaceGetBaseAddress(ioQ);
+        for (int t = 0; t < SEQ; t++)
+            for (int c = 0; c < DIM; c++)
+                qp[c*SEQ + t] = (_Float16)Q[t*DIM + c];
+        IOSurfaceUnlock(ioQ, 0, NULL);
+
+        size_t sc_bytes = HEADS * SEQ * SEQ * 2;
+        IOSurfaceRef ioScores = make_surface(sc_bytes);
+        IOSurfaceRef ioOut = make_surface(q_bytes);
+
+        // Step 1: Q@K^T
+        IOSurfaceRef ins1[] = {ioQ};
+        if (!ane_eval(&kQKT, ins1, 1, ioScores)) { printf("Q@K^T eval FAIL\n"); goto done; }
+
+        // Step 2: Scale + causal mask + softmax (CPU)
+        float scale = 1.0f / sqrtf((float)HD);
+        IOSurfaceLock(ioScores, 0, NULL);
+        _Float16 *sc = (_Float16*)IOSurfaceGetBaseAddress(ioScores);
+        for (int h = 0; h < HEADS; h++)
+            for (int t = 0; t < SEQ; t++) {
+                float row[SEQ], maxs = -1e30f;
+                for (int t2 = 0; t2 < SEQ; t2++) {
+                    // scores channel = h*SEQ+t2, spatial = t
+                    float s = (float)sc[(h*SEQ+t2)*SEQ + t] * scale;
+                    if (t2 > t) s = -1e30f;
+                    row[t2] = s;
+                    if (s > maxs) maxs = s;
+                }
+                float sum = 0;
+                for (int t2 = 0; t2 < SEQ; t2++) { row[t2] = expf(row[t2]-maxs); sum += row[t2]; }
+                for (int t2 = 0; t2 < SEQ; t2++)
+                    sc[(h*SEQ+t2)*SEQ + t] = (_Float16)(row[t2] / sum);
+            }
+        IOSurfaceUnlock(ioScores, 0, NULL);
+
+        // Step 3: scores@V
+        IOSurfaceRef ins2[] = {ioScores};
+        if (!ane_eval(&kSV, ins2, 1, ioOut)) { printf("scores@V eval FAIL\n"); goto done; }
+
+        // Verify
+        IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL);
+        _Float16 *out = (_Float16*)IOSurfaceGetBaseAddress(ioOut);
+        float maxdiff = 0;
+        for (int h = 0; h < HEADS; h++)
+            for (int t = 0; t < SEQ; t++) {
+                float sc2[SEQ], maxs = -1e30f;
+                for (int t2 = 0; t2 <= t; t2++) {
+                    float s = 0;
+                    for (int d = 0; d < HD; d++) s += Q[t*DIM+h*HD+d]*K[t2*DIM+h*HD+d];
+                    s *= scale; sc2[t2] = s; if(s>maxs) maxs=s;
+                }
+                float sum = 0;
+                for (int t2 = 0; t2 <= t; t2++) { sc2[t2]=expf(sc2[t2]-maxs); sum+=sc2[t2]; }
+                for (int t2 = 0; t2 <= t; t2++) sc2[t2]/=sum;
+                for (int d = 0; d < HD; d++) {
+                    float ref = 0;
+                    for (int t2 = 0; t2 <= t; t2++) ref += sc2[t2]*V[t2*DIM+h*HD+d];
+                    float diff = fabsf((float)out[(h*HD+d)*SEQ+t] - ref);
+                    if (diff > maxdiff) maxdiff = diff;
+                }
+            }
+        IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL);
+        printf("\nMax diff vs CPU causal ref: %.6f → %s\n", maxdiff, maxdiff < 0.05f ? "PASS" : "FAIL");
+
+        // Benchmark
+        printf("\n=== Benchmark ===\n");
+        int N = 500;
+        for (int i = 0; i < 20; i++) { ane_eval(&kQKT, ins1, 1, ioScores); ane_eval(&kSV, ins2, 1, ioOut); }
+
+        uint64_t t0 = mach_absolute_time();
+        for (int i = 0; i < N; i++) {
+            ane_eval(&kQKT, ins1, 1, ioScores);
+            ane_eval(&kSV, ins2, 1, ioOut);
+        }
+        double ms_ane = tb_ms(mach_absolute_time() - t0);
+
+        t0 = mach_absolute_time();
+        for (int i = 0; i < N; i++) {
+            ane_eval(&kQKT, ins1, 1, ioScores);
+            IOSurfaceLock(ioScores, 0, NULL);
+            _Float16 *s = (_Float16*)IOSurfaceGetBaseAddress(ioScores);
+            for (int h = 0; h < HEADS; h++)
+                for (int t = 0; t < SEQ; t++) {
+                    float row[SEQ], maxs = -1e30f;
+                    for (int t2 = 0; t2 < SEQ; t2++) {
+                        float v = (float)s[(h*SEQ+t2)*SEQ+t]*scale;
+                        if(t2>t) v=-1e30f; row[t2]=v; if(v>maxs) maxs=v;
+                    }
+                    float sum=0;
+                    for (int t2=0;t2<SEQ;t2++){row[t2]=expf(row[t2]-maxs);sum+=row[t2];}
+                    for (int t2=0;t2<SEQ;t2++) s[(h*SEQ+t2)*SEQ+t]=(_Float16)(row[t2]/sum);
+                }
+            IOSurfaceUnlock(ioScores, 0, NULL);
+            ane_eval(&kSV, ins2, 1, ioOut);
+        }
+        double ms_full = tb_ms(mach_absolute_time() - t0);
+
+        double flops = 2.0 * HEADS * SEQ * SEQ * HD * 2;
+        printf("ANE-only (2 convs):  %.3f ms/iter  %.1f GFLOPS\n", ms_ane/N, N*flops/ms_ane/1e6);
+        printf("Full pipeline:       %.3f ms/iter  %.1f GFLOPS\n", ms_full/N, N*flops/ms_full/1e6);
+        printf("CPU softmax:         %.3f ms/iter\n", (ms_full-ms_ane)/N);
+
+        CFRelease(ioQ); CFRelease(ioScores); CFRelease(ioOut);
+        free(Q); free(K); free(V);
+        done:
+        cleanup_kern(&kQKT); cleanup_kern(&kSV);
+        printf("\nDONE\n");
+    }
+    return 0;
+}
diff --git a/training/test_full_fused.m b/training/test_full_fused.m
new file mode 100644
index 0000000..8449ddb
--- /dev/null
+++ b/training/test_full_fused.m
@@ -0,0 +1,379 @@
+// Full fused forward: QKV convs → reshape → matmul(Q,K^T) → scale+mask → softmax → matmul(scores,V) → Wo conv
+// If ANE compiler rejects matmul, we'll know and fall back
+// Also test: fused scores@V + Wo (2 convs in 1 dispatch)
+#import <Foundation/Foundation.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <IOSurface/IOSurface.h>
+#import <mach/mach_time.h>
+#include <math.h>
+
+#define DIM 768
+#define HEADS 12
+#define HD (DIM/HEADS)
+#define HIDDEN 2048
+#define SEQ 64
+
+static Class g_D, g_I, g_AR, g_AIO;
+static mach_timebase_info_data_t g_tb;
+static void ane_init(void) {
+    dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
+    g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
+    g_I  = NSClassFromString(@"_ANEInMemoryModel");
+    g_AR = NSClassFromString(@"_ANERequest");
+    g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
+}
+static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }
+static IOSurfaceRef make_surface(size_t bytes) {
+    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
+        (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,
+        (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes),
+        (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0});
+}
+static NSData *build_blob(const float *w, int oc, int ic) {
+    int wsize = oc*ic*2, total = 128+wsize;
+    uint8_t *buf = (uint8_t*)calloc(total,1);
+    buf[0]=1; buf[4]=2; buf[64]=0xEF; buf[65]=0xBE; buf[66]=0xAD; buf[67]=0xDE; buf[68]=1;
+    *(uint32_t*)(buf+72)=wsize; *(uint32_t*)(buf+80)=128;
+    _Float16 *fp16 = (_Float16*)(buf+128);
+    for (int i = 0; i < oc*ic; i++) fp16[i] = (_Float16)w[i];
+    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
+}
+static NSData *build_blob_fp16(_Float16 *data, int count) {
+    int wsize = count*2, total = 128+wsize;
+    uint8_t *buf = (uint8_t*)calloc(total,1);
+    buf[0]=1; buf[4]=2; buf[64]=0xEF; buf[65]=0xBE; buf[66]=0xAD; buf[67]=0xDE; buf[68]=1;
+    *(uint32_t*)(buf+72)=wsize; *(uint32_t*)(buf+80)=128;
+    memcpy(buf+128, data, wsize);
+    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
+}
+
+typedef struct { id model; NSString *td; } Kern;
+static Kern compile_mil(NSString *mil, NSDictionary *wd) {
+    Kern k = {nil, nil};
+    NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
+    id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, wd ?: @{}, nil);
+    if (!desc) { printf("  desc=NULL\n"); return k; }
+    id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
+    id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
+    NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
+    [[NSFileManager defaultManager] createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"]
+        withIntermediateDirectories:YES attributes:nil error:nil];
+    [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+    for (NSString *path in wd) {
+        [wd[path][@"data"] writeToFile:[td stringByAppendingPathComponent:
+            [path stringByReplacingOccurrencesOfString:@"@model_path/" withString:@""]] atomically:YES];
+    }
+    NSError *e = nil;
+    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
+        printf("  compile FAIL: %s\n", e?[[[e localizedDescription] substringToIndex:MIN(300,(int)[[e localizedDescription] length])] UTF8String]:"");
+        [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; return k;
+    }
+    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) {
+        printf("  load FAIL\n"); [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; return k;
+    }
+    k.model = mdl; k.td = td;
+    return k;
+}
+static BOOL ane_eval_io(Kern *k, IOSurfaceRef *ins, int nin, IOSurfaceRef *outs, int nout) {
+    NSMutableArray *inArr = [NSMutableArray array], *inIdx = [NSMutableArray array];
+    NSMutableArray *outArr = [NSMutableArray array], *outIdx = [NSMutableArray array];
+    for (int i = 0; i < nin; i++) {
+        [inArr addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ins[i])];
+        [inIdx addObject:@(i)];
+    }
+    for (int i = 0; i < nout; i++) {
+        [outArr addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), outs[i])];
+        [outIdx addObject:@(i)];
+    }
+    id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+        @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+        inArr, inIdx, outArr, outIdx, nil, nil, @0);
+    NSError *e = nil;
+    return ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+        k->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
+}
+static void cleanup_kern(Kern *k) {
+    if (!k->model) return;
+    NSError *e = nil;
+    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(k->model, @selector(unloadWithQoS:error:), 21, &e);
+    [[NSFileManager defaultManager] removeItemAtPath:k->td error:nil];
+}
+
+int main() {
+    @autoreleasepool {
+        setbuf(stdout, NULL);
+        ane_init();
+        mach_timebase_info(&g_tb);
+
+        srand48(42);
+        float sc_d = 1.0f/sqrtf(DIM), sc_h = 1.0f/sqrtf(HIDDEN);
+        float *Wq = (float*)malloc(DIM*DIM*4); for(int i=0;i<DIM*DIM;i++) Wq[i]=sc_d*(2*drand48()-1);
+        float *Wk = (float*)malloc(DIM*DIM*4); for(int i=0;i<DIM*DIM;i++) Wk[i]=sc_d*(2*drand48()-1);
+        float *Wv = (float*)malloc(DIM*DIM*4); for(int i=0;i<DIM*DIM;i++) Wv[i]=sc_d*(2*drand48()-1);
+        float *Wo = (float*)malloc(DIM*DIM*4); for(int i=0;i<DIM*DIM;i++) Wo[i]=sc_d*(2*drand48()-1);
+        float *W1 = (float*)malloc(HIDDEN*DIM*4); for(int i=0;i<HIDDEN*DIM;i++) W1[i]=sc_h*(2*drand48()-1);
+        float *W2 = (float*)malloc(DIM*HIDDEN*4); for(int i=0;i<DIM*HIDDEN;i++) W2[i]=sc_d*(2*drand48()-1);
+        float *W3 = (float*)malloc(HIDDEN*DIM*4); for(int i=0;i<HIDDEN*DIM;i++) W3[i]=sc_h*(2*drand48()-1);
+
+        // === Test 1: Full attention in one MIL graph ===
+        // QKV convs → reshape → matmul(Q,K^T) → scale → add causal mask → softmax → matmul(scores,V) → reshape → Wo conv
+        printf("=== Test 1: Full fused attention (QKV + matmul + softmax + Wo) ===\n");
+        {
+            // Build causal mask blob [1, 1, SEQ, SEQ]
+            _Float16 *mask = (_Float16*)calloc(SEQ*SEQ, sizeof(_Float16));
+            for (int t = 0; t < SEQ; t++)
+                for (int t2 = 0; t2 < SEQ; t2++)
+                    mask[t*SEQ+t2] = (t2 <= t) ? (_Float16)0.0f : (_Float16)(-65504.0f);
+
+            // scale constant
+            float scale_val = 1.0f / sqrtf((float)HD);
+
+            NSString *mil = [NSString stringWithFormat:
+                @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
+                "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
+                "{\"coremltools-version\", \"9.0\"}})]\n{\n"
+                "    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
+                // Conv boilerplate
+                "        string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
+                "        tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
+                "        tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+                "        tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
+                "        int32 gr1 = const()[name = string(\"g1\"), val = int32(1)];\n"
+                // QKV weights
+                "        tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = string(\"Wq\"), "
+                "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wq.bin\"), offset = uint64(64)))];\n"
+                "        tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = string(\"Wk\"), "
+                "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wk.bin\"), offset = uint64(64)))];\n"
+                "        tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = string(\"Wv\"), "
+                "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wv.bin\"), offset = uint64(64)))];\n"
+                "        tensor<fp16, [%d, %d, 1, 1]> Wout = const()[name = string(\"Wo\"), "
+                "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wo.bin\"), offset = uint64(64)))];\n"
+                // QKV projections
+                "        tensor<fp16, [1, %d, 1, %d]> q_flat = conv(dilations = dl, groups = gr1, pad = pd, "
+                "pad_type = pt, strides = st, weight = Wq, x = x)[name = string(\"cq\")];\n"
+                "        tensor<fp16, [1, %d, 1, %d]> k_flat = conv(dilations = dl, groups = gr1, pad = pd, "
+                "pad_type = pt, strides = st, weight = Wk, x = x)[name = string(\"ck\")];\n"
+                "        tensor<fp16, [1, %d, 1, %d]> v_flat = conv(dilations = dl, groups = gr1, pad = pd, "
+                "pad_type = pt, strides = st, weight = Wv, x = x)[name = string(\"cv\")];\n"
+                // Reshape: [1, DIM, 1, SEQ] → [1, HEADS, HD, SEQ] → transpose → [1, HEADS, SEQ, HD]
+                "        tensor<int32, [4]> qsh = const()[name = string(\"qsh\"), val = tensor<int32, [4]>([1, %d, %d, %d])];\n"
+                "        tensor<fp16, [1, %d, %d, %d]> q_4d = reshape(shape = qsh, x = q_flat)[name = string(\"rq\")];\n"
+                "        tensor<int32, [4]> perm = const()[name = string(\"pm\"), val = tensor<int32, [4]>([0, 1, 3, 2])];\n"
+                "        tensor<fp16, [1, %d, %d, %d]> q = transpose(perm = perm, x = q_4d)[name = string(\"tq\")];\n"
+                "        tensor<fp16, [1, %d, %d, %d]> k_4d = reshape(shape = qsh, x = k_flat)[name = string(\"rk\")];\n"
+                "        tensor<fp16, [1, %d, %d, %d]> k = transpose(perm = perm, x = k_4d)[name = string(\"tk\")];\n"
+                "        tensor<fp16, [1, %d, %d, %d]> v_4d = reshape(shape = qsh, x = v_flat)[name = string(\"rv\")];\n"
+                "        tensor<fp16, [1, %d, %d, %d]> v = transpose(perm = perm, x = v_4d)[name = string(\"tv\")];\n"
+                // Q @ K^T
+                "        bool ty = const()[name = string(\"ty\"), val = bool(true)];\n"
+                "        bool tx = const()[name = string(\"tx\"), val = bool(false)];\n"
+                "        tensor<fp16, [1, %d, %d, %d]> scores = matmul(transpose_x = tx, transpose_y = ty, x = q, y = k)[name = string(\"mm1\")];\n"
+                // Scale
+                "        fp16 sc = const()[name = string(\"sc\"), val = fp16(%f)];\n"
+                "        tensor<fp16, [1, %d, %d, %d]> scaled = mul(x = scores, y = sc)[name = string(\"scl\")];\n"
+                // Causal mask
+                "        tensor<fp16, [1, 1, %d, %d]> cmask = const()[name = string(\"cm\"), "
+                "val = tensor<fp16, [1, 1, %d, %d]>(BLOBFILE(path = string(\"@model_path/weights/mask.bin\"), offset = uint64(64)))];\n"
+                "        tensor<fp16, [1, %d, %d, %d]> masked = add(x = scaled, y = cmask)[name = string(\"msk\")];\n"
+                // Softmax
+                "        int32 sax = const()[name = string(\"sax\"), val = int32(-1)];\n"
+                "        tensor<fp16, [1, %d, %d, %d]> attn_w = softmax(axis = sax, x = masked)[name = string(\"sm\")];\n"
+                // scores @ V
+                "        tensor<fp16, [1, %d, %d, %d]> attn_4d = matmul(transpose_x = tx, transpose_y = tx, x = attn_w, y = v)[name = string(\"mm2\")];\n"
+                // Reshape back: [1, HEADS, SEQ, HD] → transpose → [1, HEADS, HD, SEQ] → reshape → [1, DIM, 1, SEQ]
+                "        tensor<fp16, [1, %d, %d, %d]> attn_t = transpose(perm = perm, x = attn_4d)[name = string(\"ta\")];\n"
+                "        tensor<int32, [4]> osh = const()[name = string(\"osh\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
+                "        tensor<fp16, [1, %d, 1, %d]> attn_flat = reshape(shape = osh, x = attn_t)[name = string(\"ra\")];\n"
+                // Wo projection
+                "        tensor<fp16, [1, %d, 1, %d]> out = conv(dilations = dl, groups = gr1, pad = pd, "
+                "pad_type = pt, strides = st, weight = Wout, x = attn_flat)[name = string(\"co\")];\n"
+                "    } -> (out);\n}\n",
+                DIM, SEQ,                              // input
+                DIM,DIM,DIM,DIM, DIM,DIM,DIM,DIM,      // Wq, Wk
+                DIM,DIM,DIM,DIM, DIM,DIM,DIM,DIM,      // Wv, Wo
+                DIM, SEQ, DIM, SEQ, DIM, SEQ,           // q_flat, k_flat, v_flat
+                HEADS, HD, SEQ,                         // reshape shape
+                HEADS, HD, SEQ,                         // q_4d
+                HEADS, SEQ, HD,                         // q (after transpose)
+                HEADS, HD, SEQ,                         // k_4d
+                HEADS, SEQ, HD,                         // k
+                HEADS, HD, SEQ,                         // v_4d
+                HEADS, SEQ, HD,                         // v
+                HEADS, SEQ, SEQ,                        // scores
+                scale_val,
+                HEADS, SEQ, SEQ,                        // scaled
+                SEQ, SEQ, SEQ, SEQ,                     // mask
+                HEADS, SEQ, SEQ,                        // masked
+                HEADS, SEQ, SEQ,                        // attn_w (softmax)
+                HEADS, SEQ, HD,                         // attn_4d
+                HEADS, HD, SEQ,                         // attn_t
+                DIM, SEQ,                               // reshape back
+                DIM, SEQ,                               // attn_flat
+                DIM, SEQ];                              // out
+
+            NSDictionary *wd = @{
+                @"@model_path/weights/wq.bin": @{@"offset":@0, @"data":build_blob(Wq,DIM,DIM)},
+                @"@model_path/weights/wk.bin": @{@"offset":@0, @"data":build_blob(Wk,DIM,DIM)},
+                @"@model_path/weights/wv.bin": @{@"offset":@0, @"data":build_blob(Wv,DIM,DIM)},
+                @"@model_path/weights/wo.bin": @{@"offset":@0, @"data":build_blob(Wo,DIM,DIM)},
+                @"@model_path/weights/mask.bin": @{@"offset":@0, @"data":build_blob_fp16(mask,SEQ*SEQ)},
+            };
+            free(mask);
+            Kern k = compile_mil(mil, wd);
+            if (k.model) {
+                printf("  COMPILED! Full fused attention works on ANE!\n");
+
+                // Verify vs CPU
+                float *x = (float*)malloc(SEQ*DIM*4);
+                for (int i = 0; i < SEQ*DIM; i++) x[i] = 0.1f*(2*drand48()-1);
+
+                IOSurfaceRef ioIn = make_surface(DIM*SEQ*2);
+                IOSurfaceRef ioOut = make_surface(DIM*SEQ*2);
+                IOSurfaceLock(ioIn, 0, NULL);
+                _Float16 *p = (_Float16*)IOSurfaceGetBaseAddress(ioIn);
+                for (int t = 0; t < SEQ; t++)
+                    for (int c = 0; c < DIM; c++)
+                        p[c*SEQ+t] = (_Float16)x[t*DIM+c];
+                IOSurfaceUnlock(ioIn, 0, NULL);
+
+                IOSurfaceRef ins[] = {ioIn}, outs[] = {ioOut};
+                BOOL ok = ane_eval_io(&k, ins, 1, outs, 1);
+                printf("  Eval: %s\n", ok?"OK":"FAIL");
+
+                if (ok) {
+                    // CPU reference
+                    float *q_cpu = (float*)calloc(SEQ*DIM, 4);
+                    float *k_cpu = (float*)calloc(SEQ*DIM, 4);
+                    float *v_cpu = (float*)calloc(SEQ*DIM, 4);
+                    for (int t=0;t<SEQ;t++) for (int oc=0;oc<DIM;oc++) {
+                        float sq=0,sk=0,sv=0;
+                        for (int ic=0;ic<DIM;ic++) {
+                            sq += Wq[oc*DIM+ic]*x[t*DIM+ic];
+                            sk += Wk[oc*DIM+ic]*x[t*DIM+ic];
+                            sv += Wv[oc*DIM+ic]*x[t*DIM+ic];
+                        }
+                        q_cpu[t*DIM+oc]=sq; k_cpu[t*DIM+oc]=sk; v_cpu[t*DIM+oc]=sv;
+                    }
+                    // Attention
+                    float *attn = (float*)calloc(SEQ*DIM, 4);
+                    float asc = 1.0f/sqrtf((float)HD);
+                    float *sc2 = (float*)malloc(SEQ*4);
+                    for (int h=0;h<HEADS;h++) for (int t=0;t<SEQ;t++) {
+                        float maxs=-1e30f;
+                        for (int t2=0;t2<=t;t2++) {
+                            float s=0;
+                            for (int d=0;d<HD;d++) s+=q_cpu[t*DIM+h*HD+d]*k_cpu[t2*DIM+h*HD+d];
+                            s*=asc; sc2[t2]=s; if(s>maxs) maxs=s;
+                        }
+                        float sum=0;
+                        for (int t2=0;t2<=t;t2++){sc2[t2]=expf(sc2[t2]-maxs);sum+=sc2[t2];}
+                        for (int t2=0;t2<=t;t2++) sc2[t2]/=sum;
+                        for (int d=0;d<HD;d++){
+                            float r=0;
+                            for (int t2=0;t2<=t;t2++) r+=sc2[t2]*v_cpu[t2*DIM+h*HD+d];
+                            attn[t*DIM+h*HD+d]=r;
+                        }
+                    }
+                    free(sc2);
+                    // Wo
+                    float *ref = (float*)calloc(SEQ*DIM, 4);
+                    for (int t=0;t<SEQ;t++) for (int oc=0;oc<DIM;oc++){
+                        float s=0;
+                        for (int ic=0;ic<DIM;ic++) s+=Wo[oc*DIM+ic]*attn[t*DIM+ic];
+                        ref[t*DIM+oc]=s;
+                    }
+                    IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL);
+                    _Float16 *o = (_Float16*)IOSurfaceGetBaseAddress(ioOut);
+                    float maxdiff=0;
+                    for (int t=0;t<SEQ;t++) for (int c=0;c<DIM;c++){
+                        float diff=fabsf((float)o[c*SEQ+t]-ref[t*DIM+c]);
+                        if(diff>maxdiff) maxdiff=diff;
+                    }
+                    IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL);
+                    printf("  Max diff vs CPU: %.6f → %s\n", maxdiff, maxdiff<0.1f?"PASS":"FAIL");
+
+                    // Benchmark
+                    for (int i=0;i<20;i++) ane_eval_io(&k, ins, 1, outs, 1);
+                    int N=500;
+                    uint64_t t0 = mach_absolute_time();
+                    for (int i=0;i<N;i++) ane_eval_io(&k, ins, 1, outs, 1);
+                    double ms = tb_ms(mach_absolute_time()-t0);
+                    // FLOPs: QKV=3*2*D*D*S + QKT=2*H*S*S*HD + SV=2*H*S*S*HD + Wo=2*D*D*S
+                    double flops = 4.0*2*DIM*DIM*SEQ + 4.0*HEADS*SEQ*SEQ*HD;
+                    printf("  %.3f ms/iter  %.1f GFLOPS (%.1f TFLOPS)\n", ms/N, N*flops/ms/1e6, N*flops/ms/1e9);
+
+                    free(q_cpu); free(k_cpu); free(v_cpu); free(attn); free(ref);
+                }
+                CFRelease(ioIn); CFRelease(ioOut);
+                free(x);
+                cleanup_kern(&k);
+            } else {
+                printf("  Full fused attention FAILED to compile on ANE\n");
+            }
+        }
+
+        // === Test 2: Fused FFN (already proven, just benchmark for comparison) ===
+        printf("\n=== Test 2: Fused FFN benchmark ===\n");
+        {
+            NSString *mil = [NSString stringWithFormat:
+                @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
+                "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
+                "{\"coremltools-version\", \"9.0\"}})]\n{\n"
+                "    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
+                "        string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
+                "        tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
+                "        tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+                "        tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
+                "        int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
+                "        tensor<fp16, [%d, %d, 1, 1]> W1 = const()[name = string(\"W1\"), "
+                "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w1.bin\"), offset = uint64(64)))];\n"
+                "        tensor<fp16, [%d, %d, 1, 1]> W3 = const()[name = string(\"W3\"), "
+                "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w3.bin\"), offset = uint64(64)))];\n"
+                "        tensor<fp16, [%d, %d, 1, 1]> W2 = const()[name = string(\"W2\"), "
+                "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w2.bin\"), offset = uint64(64)))];\n"
+                "        tensor<fp16, [1, %d, 1, %d]> h1 = conv(dilations = dl, groups = gr, pad = pd, "
+                "pad_type = pt, strides = st, weight = W1, x = x)[name = string(\"c1\")];\n"
+                "        tensor<fp16, [1, %d, 1, %d]> h3 = conv(dilations = dl, groups = gr, pad = pd, "
+                "pad_type = pt, strides = st, weight = W3, x = x)[name = string(\"c3\")];\n"
+                "        tensor<fp16, [1, %d, 1, %d]> sig = sigmoid(x = h1)[name = string(\"sg\")];\n"
+                "        tensor<fp16, [1, %d, 1, %d]> silu = mul(x = h1, y = sig)[name = string(\"si\")];\n"
+                "        tensor<fp16, [1, %d, 1, %d]> gate = mul(x = silu, y = h3)[name = string(\"gt\")];\n"
+                "        tensor<fp16, [1, %d, 1, %d]> out = conv(dilations = dl, groups = gr, pad = pd, "
+                "pad_type = pt, strides = st, weight = W2, x = gate)[name = string(\"c2\")];\n"
+                "    } -> (out);\n}\n",
+                DIM, SEQ,
+                HIDDEN,DIM,HIDDEN,DIM, HIDDEN,DIM,HIDDEN,DIM, DIM,HIDDEN,DIM,HIDDEN,
+                HIDDEN,SEQ, HIDDEN,SEQ, HIDDEN,SEQ, HIDDEN,SEQ, HIDDEN,SEQ, DIM,SEQ];
+
+            NSDictionary *wd = @{
+                @"@model_path/weights/w1.bin": @{@"offset":@0, @"data":build_blob(W1,HIDDEN,DIM)},
+                @"@model_path/weights/w3.bin": @{@"offset":@0, @"data":build_blob(W3,HIDDEN,DIM)},
+                @"@model_path/weights/w2.bin": @{@"offset":@0, @"data":build_blob(W2,DIM,HIDDEN)},
+            };
+            Kern k = compile_mil(mil, wd);
+            printf("  FFN: %s\n", k.model?"OK":"FAIL");
+            if (k.model) {
+                IOSurfaceRef ioIn = make_surface(DIM*SEQ*2), ioOut = make_surface(DIM*SEQ*2);
+                IOSurfaceRef ins[]={ioIn}, outs[]={ioOut};
+                for (int i=0;i<20;i++) ane_eval_io(&k,ins,1,outs,1);
+                int N=500;
+                uint64_t t0 = mach_absolute_time();
+                for (int i=0;i<N;i++) ane_eval_io(&k,ins,1,outs,1);
+                double ms = tb_ms(mach_absolute_time()-t0);
+                double flops = 2.0*(2*HIDDEN*DIM + DIM*HIDDEN)*SEQ;
+                printf("  %.3f ms/iter  %.1f GFLOPS (%.1f TFLOPS)\n", ms/N, N*flops/ms/1e6, N*flops/ms/1e9);
+                CFRelease(ioIn); CFRelease(ioOut);
+                cleanup_kern(&k);
+            }
+        }
+
+        printf("\n=== Summary ===\n");
+        printf("Full transformer layer = Attention + FFN\n");
+        printf("2 ANE dispatches (+ CPU RMSNorm/residual) for entire forward pass\n");
+
+        free(Wq); free(Wk); free(Wv); free(Wo); free(W1); free(W2); free(W3);
+        printf("\nDONE\n");
+    }
+    return 0;
+}
diff --git a/training/test_fused_bwd.m b/training/test_fused_bwd.m
new file mode 100644
index 0000000..b91d7b6
--- /dev/null
+++ b/training/test_fused_bwd.m
@@ -0,0 +1,184 @@
+// Test: fused backward dx kernels
+// 1. Fused QKV backward: concat(Wq^T@dq, Wk^T@dk, Wv^T@dv) — 3 inputs, 1 output
+//    Problem: 3 separate gradient inputs. Can we concat them as input?
+//    Input: [1, DIM*3, 1, SEQ] = concat(dq, dk, dv)
+//    Use 3 separate convs on slices? MIL has slice_by_size.
+// 2. Fused W1b+W3b: input concat(dh1, dh3) [1, HIDDEN*2, 1, SEQ]
+//    Two convs on slices, add results → [1, DIM, 1, SEQ]
+#import <Foundation/Foundation.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <IOSurface/IOSurface.h>
+#include <math.h>
+
+#define DIM 768
+#define HIDDEN 2048
+#define SEQ 64
+
+static Class g_D, g_I, g_AR, g_AIO;
+static void ane_init(void) {
+    dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
+    g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
+    g_I  = NSClassFromString(@"_ANEInMemoryModel");
+    g_AR = NSClassFromString(@"_ANERequest");
+    g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
+}
+static IOSurfaceRef make_surface(size_t bytes) {
+    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
+        (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,
+        (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes),
+        (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0});
+}
+static NSData *build_blob_t(const float *w, int rows, int cols) {
+    int wsize = cols * rows * 2, total = 128 + wsize;
+    uint8_t *buf = (uint8_t*)calloc(total, 1);
+    buf[0]=1; buf[4]=2; buf[64]=0xEF; buf[65]=0xBE; buf[66]=0xAD; buf[67]=0xDE; buf[68]=1;
+    *(uint32_t*)(buf+72)=wsize; *(uint32_t*)(buf+80)=128;
+    _Float16 *fp16 = (_Float16*)(buf+128);
+    for (int i = 0; i < rows; i++)
+        for (int j = 0; j < cols; j++)
+            fp16[j*rows+i] = (_Float16)w[i*cols+j];
+    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
+}
+
+int main() {
+    @autoreleasepool {
+        setbuf(stdout, NULL);
+        ane_init();
+
+        srand48(42);
+        float *W1 = (float*)malloc(HIDDEN*DIM*sizeof(float));
+        float *W3 = (float*)malloc(HIDDEN*DIM*sizeof(float));
+        float sc = 1.0f/sqrtf(HIDDEN);
+        for (int i = 0; i < HIDDEN*DIM; i++) { W1[i]=sc*(2*drand48()-1); W3[i]=sc*(2*drand48()-1); }
+
+        // Test: fused W1b+W3b backward
+        // Input: concat(dh1, dh3) [1, HIDDEN*2, 1, SEQ]
+        // Output: W1^T@dh1 + W3^T@dh3 [1, DIM, 1, SEQ]
+        // MIL: slice input → 2 convs → add
+        printf("=== Fused W1b+W3b backward (slice+conv+add) ===\n");
+
+        NSString *mil = [NSString stringWithFormat:
+            @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
+            "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
+            "{\"coremltools-version\", \"9.0\"}})]\n{\n"
+            "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"  // [1, HIDDEN*2, 1, SEQ]
+            "        string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
+            "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
+            // Slice: dh1 = x16[:, 0:HIDDEN, :, :], dh3 = x16[:, HIDDEN:2*HIDDEN, :, :]
+            "        tensor<int32, [4]> b1 = const()[name = string(\"b1\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+            "        tensor<int32, [4]> s1 = const()[name = string(\"s1\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
+            "        tensor<fp16, [1, %d, 1, %d]> dh1 = slice_by_size(x = x16, begin = b1, size = s1)[name = string(\"sl1\")];\n"
+            "        tensor<int32, [4]> b3 = const()[name = string(\"b3\"), val = tensor<int32, [4]>([0, %d, 0, 0])];\n"
+            "        tensor<int32, [4]> s3 = const()[name = string(\"s3\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
+            "        tensor<fp16, [1, %d, 1, %d]> dh3 = slice_by_size(x = x16, begin = b3, size = s3)[name = string(\"sl3\")];\n"
+            // Conv: W1^T @ dh1, W3^T @ dh3
+            "        string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
+            "        tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
+            "        tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+            "        tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
+            "        int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
+            // W1^T: [DIM, HIDDEN, 1, 1]  (transposed from [HIDDEN, DIM])
+            "        tensor<fp16, [%d, %d, 1, 1]> W1t = const()[name = string(\"W1t\"), "
+            "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w1t.bin\"), offset = uint64(64)))];\n"
+            "        tensor<fp16, [%d, %d, 1, 1]> W3t = const()[name = string(\"W3t\"), "
+            "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w3t.bin\"), offset = uint64(64)))];\n"
+            "        tensor<fp16, [1, %d, 1, %d]> dx1 = conv(dilations = dl, groups = gr, pad = pd, "
+            "pad_type = pt, strides = st, weight = W1t, x = dh1)[name = string(\"cv1\")];\n"
+            "        tensor<fp16, [1, %d, 1, %d]> dx3 = conv(dilations = dl, groups = gr, pad = pd, "
+            "pad_type = pt, strides = st, weight = W3t, x = dh3)[name = string(\"cv3\")];\n"
+            // Add
+            "        tensor<fp16, [1, %d, 1, %d]> sum = add(x = dx1, y = dx3)[name = string(\"ad\")];\n"
+            "        string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
+            "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = sum)[name = string(\"co\")];\n"
+            "    } -> (y);\n}\n",
+            HIDDEN*2, SEQ, HIDDEN*2, SEQ,
+            HIDDEN, SEQ, HIDDEN, SEQ,  // slice1
+            HIDDEN, HIDDEN, SEQ, HIDDEN, SEQ,  // slice3
+            DIM, HIDDEN, DIM, HIDDEN,   // W1t
+            DIM, HIDDEN, DIM, HIDDEN,   // W3t
+            DIM, SEQ, DIM, SEQ,         // dx1, dx3
+            DIM, SEQ, DIM, SEQ];        // sum, y
+
+        NSDictionary *wd = @{
+            @"@model_path/weights/w1t.bin": @{@"offset":@0, @"data":build_blob_t(W1, HIDDEN, DIM)},
+            @"@model_path/weights/w3t.bin": @{@"offset":@0, @"data":build_blob_t(W3, HIDDEN, DIM)}
+        };
+
+        NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
+        id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, wd, nil);
+        if (!desc) { printf("desc=NULL\n"); return 1; }
+        id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
+        id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
+        NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
+        [[NSFileManager defaultManager] createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];
+        [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+        for (NSString *path in wd) {
+            [wd[path][@"data"] writeToFile:[td stringByAppendingPathComponent:[path stringByReplacingOccurrencesOfString:@"@model_path/" withString:@""]] atomically:YES];
+        }
+
+        NSError *e = nil;
+        BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
+        printf("Compile: %s\n", ok?"OK":"FAIL");
+        if (!ok) { printf("  %s\n", e?[[e description] UTF8String]:""); return 1; }
+        ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
+        printf("Load: %s\n", ok?"OK":"FAIL");
+        if (!ok) return 1;
+
+        // Prepare input: concat(dh1, dh3) in channel-first layout
+        float *dh1 = (float*)malloc(SEQ*HIDDEN*sizeof(float));
+        float *dh3 = (float*)malloc(SEQ*HIDDEN*sizeof(float));
+        for (int i = 0; i < SEQ*HIDDEN; i++) { dh1[i]=0.01f*sinf(i*0.007f); dh3[i]=0.01f*cosf(i*0.011f); }
+
+        IOSurfaceRef ioI = make_surface(HIDDEN*2*SEQ*4), ioO = make_surface(DIM*SEQ*4);
+        IOSurfaceLock(ioI, 0, NULL);
+        float *dst = (float*)IOSurfaceGetBaseAddress(ioI);
+        // Channel-first: channels 0..HIDDEN-1 = dh1, channels HIDDEN..2*HIDDEN-1 = dh3
+        for (int t = 0; t < SEQ; t++) {
+            for (int c = 0; c < HIDDEN; c++) dst[c*SEQ+t] = dh1[t*HIDDEN+c];
+            for (int c = 0; c < HIDDEN; c++) dst[(HIDDEN+c)*SEQ+t] = dh3[t*HIDDEN+c];
+        }
+        IOSurfaceUnlock(ioI, 0, NULL);
+
+        id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI);
+        id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO);
+        id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+            @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+            @[wI], @[@0], @[wO], @[@0], nil, nil, @0);
+
+        ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+            mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
+        printf("Eval: %s\n", ok?"OK":"FAIL");
+        if (!ok) { printf("  %s\n", e?[[e description] UTF8String]:""); return 1; }
+
+        // CPU reference: dx = W1^T @ dh1 + W3^T @ dh3
+        float *ref = (float*)calloc(SEQ*DIM, sizeof(float));
+        for (int t = 0; t < SEQ; t++)
+            for (int i = 0; i < DIM; i++) {
+                float s = 0;
+                for (int j = 0; j < HIDDEN; j++) {
+                    s += W1[j*DIM+i] * dh1[t*HIDDEN+j]; // W1^T[i,j] = W1[j,i]
+                    s += W3[j*DIM+i] * dh3[t*HIDDEN+j];
+                }
+                ref[t*DIM+i] = s;
+            }
+
+        IOSurfaceLock(ioO, kIOSurfaceLockReadOnly, NULL);
+        float *src = (float*)IOSurfaceGetBaseAddress(ioO);
+        float maxd = 0;
+        for (int t = 0; t < SEQ; t++)
+            for (int c = 0; c < DIM; c++) {
+                float d = fabsf(src[c*SEQ+t] - ref[t*DIM+c]);
+                if (d > maxd) maxd = d;
+            }
+        IOSurfaceUnlock(ioO, kIOSurfaceLockReadOnly, NULL);
+        printf("dx max diff: %.6f\n", maxd);
+
+        ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
+        [[NSFileManager defaultManager] removeItemAtPath:td error:nil];
+        CFRelease(ioI); CFRelease(ioO);
+        free(W1); free(W3); free(dh1); free(dh3); free(ref);
+        printf("\nDONE\n");
+    }
+    return 0;
+}
diff --git a/training/test_fused_qkv.m b/training/test_fused_qkv.m
new file mode 100644
index 0000000..69f41d6
--- /dev/null
+++ b/training/test_fused_qkv.m
@@ -0,0 +1,265 @@
+// Test: Fused QKV projections in single MIL graph (3 convs → concat output)
+// Input: x [1, DIM, 1, SEQ]
+// Output: concat(Q, K, V) [1, DIM*3, 1, SEQ]
+// 3 convs with separate weights, 1 ANE dispatch
+#import <Foundation/Foundation.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <IOSurface/IOSurface.h>
+#import <mach/mach_time.h>
+#include <math.h>
+
+#define DIM 768
+#define SEQ 64
+
+static Class g_D, g_I, g_AR, g_AIO;
+static mach_timebase_info_data_t g_tb;
+static void ane_init(void) {
+    dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
+    g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
+    g_I  = NSClassFromString(@"_ANEInMemoryModel");
+    g_AR = NSClassFromString(@"_ANERequest");
+    g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
+}
+static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }
+static IOSurfaceRef make_surface(size_t bytes) {
+    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
+        (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,
+        (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes),
+        (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0});
+}
+static NSData *build_blob(const float *w, int oc, int ic) {
+    int wsize = oc * ic * 2, total = 128 + wsize;
+    uint8_t *buf = (uint8_t*)calloc(total, 1);
+    buf[0]=1; buf[4]=2; buf[64]=0xEF; buf[65]=0xBE; buf[66]=0xAD; buf[67]=0xDE; buf[68]=1;
+    *(uint32_t*)(buf+72)=wsize; *(uint32_t*)(buf+80)=128;
+    _Float16 *fp16 = (_Float16*)(buf+128);
+    for (int i = 0; i < oc*ic; i++) fp16[i] = (_Float16)w[i]; // layout A: row-major [oc, ic]
+    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
+}
+
+typedef struct { id model; NSString *td; } Kern;
+static Kern compile_mil(NSString *mil, NSDictionary *wd) {
+    Kern k = {nil, nil};
+    NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
+    id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, wd ?: @{}, nil);
+    if (!desc) { printf("desc=NULL\n"); return k; }
+    id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
+    id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
+    NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
+    [[NSFileManager defaultManager] createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"]
+        withIntermediateDirectories:YES attributes:nil error:nil];
+    [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+    for (NSString *path in wd) {
+        [wd[path][@"data"] writeToFile:[td stringByAppendingPathComponent:
+            [path stringByReplacingOccurrencesOfString:@"@model_path/" withString:@""]] atomically:YES];
+    }
+    NSError *e = nil;
+    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
+        printf("compile FAIL: %s\n", e?[[e localizedDescription] UTF8String]:""); return k;
+    }
+    ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
+    k.model = mdl; k.td = td;
+    return k;
+}
+static BOOL ane_eval(Kern *k, IOSurfaceRef *ins, int nin, IOSurfaceRef out) {
+    NSMutableArray *inArr = [NSMutableArray array], *inIdx = [NSMutableArray array];
+    for (int i = 0; i < nin; i++) {
+        [inArr addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ins[i])];
+        [inIdx addObject:@(i)];
+    }
+    id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), out);
+    id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+        @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+        inArr, inIdx, @[wO], @[@0], nil, nil, @0);
+    NSError *e = nil;
+    return ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+        k->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
+}
+static void cleanup_kern(Kern *k) {
+    if (!k->model) return;
+    NSError *e = nil;
+    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(k->model, @selector(unloadWithQoS:error:), 21, &e);
+    [[NSFileManager defaultManager] removeItemAtPath:k->td error:nil];
+}
+
+// Fused QKV: 3 convs + concat in one MIL
+static NSString *gen_fused_qkv_mil(void) {
+    return [NSString stringWithFormat:
+        @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
+        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
+        "{\"coremltools-version\", \"9.0\"}})]\n{\n"
+        "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+        "        string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
+        "        string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
+        "        tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+        "        tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = string(\"Wq\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wq.bin\"), offset = uint64(64)))];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = string(\"Wk\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wk.bin\"), offset = uint64(64)))];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = string(\"Wv\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wv.bin\"), offset = uint64(64)))];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> q = conv(dilations = dl, groups = gr, pad = pd, "
+        "pad_type = pt, strides = st, weight = Wq, x = x16)[name = string(\"cq\")];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> k = conv(dilations = dl, groups = gr, pad = pd, "
+        "pad_type = pt, strides = st, weight = Wk, x = x16)[name = string(\"ck\")];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> v = conv(dilations = dl, groups = gr, pad = pd, "
+        "pad_type = pt, strides = st, weight = Wv, x = x16)[name = string(\"cv\")];\n"
+        "        int32 ax = const()[name = string(\"ax\"), val = int32(1)];\n"
+        "        bool inter = const()[name = string(\"il\"), val = bool(false)];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> qkv = concat(axis = ax, interleave = inter, values = (q, k, v))[name = string(\"cat\")];\n"
+        "        string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = qkv)[name = string(\"co\")];\n"
+        "    } -> (y);\n}\n",
+        DIM, SEQ, DIM, SEQ,
+        DIM, DIM, DIM, DIM,  // Wq
+        DIM, DIM, DIM, DIM,  // Wk
+        DIM, DIM, DIM, DIM,  // Wv
+        DIM, SEQ,  // q
+        DIM, SEQ,  // k
+        DIM, SEQ,  // v
+        DIM*3, SEQ,  // concat
+        DIM*3, SEQ]; // output
+}
+
+// Single conv MIL for comparison
+static NSString *gen_single_mil(void) {
+    return [NSString stringWithFormat:
+        @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
+        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
+        "{\"coremltools-version\", \"9.0\"}})]\n{\n"
+        "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+        "        string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w.bin\"), offset = uint64(64)))];\n"
+        "        string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
+        "        tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+        "        tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> y16 = conv(dilations = dl, groups = gr, pad = pd, "
+        "pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n"
+        "        string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n"
+        "    } -> (y);\n}\n",
+        DIM, SEQ, DIM, SEQ, DIM, DIM, DIM, DIM, DIM, SEQ, DIM, SEQ];
+}
+
+int main() {
+    @autoreleasepool {
+        setbuf(stdout, NULL);
+        ane_init();
+        mach_timebase_info(&g_tb);
+
+        printf("=== Fused QKV vs 3x Separate Convs ===\n");
+        printf("DIM=%d SEQ=%d\n\n", DIM, SEQ);
+
+        srand48(42);
+        float *Wq = (float*)malloc(DIM*DIM*sizeof(float));
+        float *Wk = (float*)malloc(DIM*DIM*sizeof(float));
+        float *Wv = (float*)malloc(DIM*DIM*sizeof(float));
+        float sc = 1.0f/sqrtf(DIM);
+        for (int i = 0; i < DIM*DIM; i++) { Wq[i]=sc*(2*drand48()-1); Wk[i]=sc*(2*drand48()-1); Wv[i]=sc*(2*drand48()-1); }
+
+        float *x = (float*)malloc(SEQ*DIM*sizeof(float));
+        for (int i = 0; i < SEQ*DIM; i++) x[i] = 0.1f*(2*drand48()-1);
+
+        // === Compile fused QKV ===
+        NSDictionary *fused_wd = @{
+            @"@model_path/weights/wq.bin": @{@"offset":@0, @"data":build_blob(Wq, DIM, DIM)},
+            @"@model_path/weights/wk.bin": @{@"offset":@0, @"data":build_blob(Wk, DIM, DIM)},
+            @"@model_path/weights/wv.bin": @{@"offset":@0, @"data":build_blob(Wv, DIM, DIM)},
+        };
+        Kern kFused = compile_mil(gen_fused_qkv_mil(), fused_wd);
+        printf("Fused QKV: %s\n", kFused.model ? "OK" : "FAIL");
+
+        // === Compile 3 separate ===
+        Kern kQ = compile_mil(gen_single_mil(), @{@"@model_path/weights/w.bin": @{@"offset":@0, @"data":build_blob(Wq, DIM, DIM)}});
+        Kern kK = compile_mil(gen_single_mil(), @{@"@model_path/weights/w.bin": @{@"offset":@0, @"data":build_blob(Wk, DIM, DIM)}});
+        Kern kV = compile_mil(gen_single_mil(), @{@"@model_path/weights/w.bin": @{@"offset":@0, @"data":build_blob(Wv, DIM, DIM)}});
+        printf("Separate Q,K,V: %s %s %s\n", kQ.model?"OK":"FAIL", kK.model?"OK":"FAIL", kV.model?"OK":"FAIL");
+
+        if (!kFused.model || !kQ.model) goto done;
+
+        // IOSurfaces
+        size_t in_bytes = DIM*SEQ*4, out1_bytes = DIM*SEQ*4, out3_bytes = DIM*3*SEQ*4;
+        IOSurfaceRef ioIn = make_surface(in_bytes);
+        IOSurfaceRef ioFused = make_surface(out3_bytes);
+        IOSurfaceRef ioQ = make_surface(out1_bytes), ioK = make_surface(out1_bytes), ioV = make_surface(out1_bytes);
+
+        IOSurfaceLock(ioIn, 0, NULL);
+        float *dst = (float*)IOSurfaceGetBaseAddress(ioIn);
+        for (int t = 0; t < SEQ; t++)
+            for (int c = 0; c < DIM; c++)
+                dst[c*SEQ+t] = x[t*DIM+c];
+        IOSurfaceUnlock(ioIn, 0, NULL);
+
+        // Eval fused
+        IOSurfaceRef ins[] = {ioIn};
+        ane_eval(&kFused, ins, 1, ioFused);
+        // Eval separate
+        ane_eval(&kQ, ins, 1, ioQ);
+        ane_eval(&kK, ins, 1, ioK);
+        ane_eval(&kV, ins, 1, ioV);
+
+        // Compare fused output (concat Q,K,V) vs separate
+        IOSurfaceLock(ioFused, kIOSurfaceLockReadOnly, NULL);
+        IOSurfaceLock(ioQ, kIOSurfaceLockReadOnly, NULL);
+        IOSurfaceLock(ioK, kIOSurfaceLockReadOnly, NULL);
+        IOSurfaceLock(ioV, kIOSurfaceLockReadOnly, NULL);
+        float *fo = (float*)IOSurfaceGetBaseAddress(ioFused);
+        float *qo = (float*)IOSurfaceGetBaseAddress(ioQ);
+        float *ko = (float*)IOSurfaceGetBaseAddress(ioK);
+        float *vo = (float*)IOSurfaceGetBaseAddress(ioV);
+        float dq=0, dk=0, dv=0;
+        for (int c = 0; c < DIM; c++)
+            for (int t = 0; t < SEQ; t++) {
+                float d1 = fabsf(fo[c*SEQ+t] - qo[c*SEQ+t]); if(d1>dq) dq=d1;
+                float d2 = fabsf(fo[(DIM+c)*SEQ+t] - ko[c*SEQ+t]); if(d2>dk) dk=d2;
+                float d3 = fabsf(fo[(DIM*2+c)*SEQ+t] - vo[c*SEQ+t]); if(d3>dv) dv=d3;
+            }
+        IOSurfaceUnlock(ioFused, kIOSurfaceLockReadOnly, NULL);
+        IOSurfaceUnlock(ioQ, kIOSurfaceLockReadOnly, NULL);
+        IOSurfaceUnlock(ioK, kIOSurfaceLockReadOnly, NULL);
+        IOSurfaceUnlock(ioV, kIOSurfaceLockReadOnly, NULL);
+        printf("\nFused vs Separate: dQ=%.6f dK=%.6f dV=%.6f → %s\n",
+               dq, dk, dv, (dq<0.001f && dk<0.001f && dv<0.001f) ? "PASS" : "FAIL");
+
+        // === Benchmark ===
+        printf("\n=== Benchmark ===\n");
+        int N = 500;
+        // Warmup
+        for (int i = 0; i < 20; i++) { ane_eval(&kFused, ins, 1, ioFused); ane_eval(&kQ, ins, 1, ioQ); }
+
+        uint64_t t0 = mach_absolute_time();
+        for (int i = 0; i < N; i++) ane_eval(&kFused, ins, 1, ioFused);
+        double ms_fused = tb_ms(mach_absolute_time() - t0);
+
+        t0 = mach_absolute_time();
+        for (int i = 0; i < N; i++) {
+            ane_eval(&kQ, ins, 1, ioQ);
+            ane_eval(&kK, ins, 1, ioK);
+            ane_eval(&kV, ins, 1, ioV);
+        }
+        double ms_sep = tb_ms(mach_absolute_time() - t0);
+
+        double flops_one = 2.0 * DIM * DIM * SEQ;
+        printf("Fused QKV (1 dispatch, 3 convs):  %.3f ms/iter  %.1f GFLOPS\n",
+               ms_fused/N, N*3*flops_one/ms_fused/1e6);
+        printf("Separate Q+K+V (3 dispatches):    %.3f ms/iter  %.1f GFLOPS\n",
+               ms_sep/N, N*3*flops_one/ms_sep/1e6);
+        printf("Speedup: %.2fx\n", ms_sep/ms_fused);
+
+        CFRelease(ioIn); CFRelease(ioFused); CFRelease(ioQ); CFRelease(ioK); CFRelease(ioV);
+        free(Wq); free(Wk); free(Wv); free(x);
+        done:
+        cleanup_kern(&kFused); cleanup_kern(&kQ); cleanup_kern(&kK); cleanup_kern(&kV);
+        printf("\nDONE\n");
+    }
+    return 0;
+}
diff --git a/training/tiny_train.m b/training/tiny_train.m
new file mode 100644
index 0000000..e1e9d7d
--- /dev/null
+++ b/training/tiny_train.m
@@ -0,0 +1,593 @@
+// tiny_train.m — Train a 2-layer linear model on ANE (forward AND backward)
+// y = W2 @ relu(W1 @ x), MSE loss, SGD update
+// Pipeline: compile next kernels on background thread while ANE runs current batch
+// Bypasses ANE 119-compile limit via exec() self-restart
+#import <Foundation/Foundation.h>
+#import <objc/runtime.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <IOSurface/IOSurface.h>
+#import <mach/mach_time.h>
+#include <math.h>
+#include <unistd.h>
+#include <dispatch/dispatch.h>
+
+static Class g_D, g_I, g_AR, g_AIO;
+
+static void ane_init(void) {
+    dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
+    g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
+    g_I  = NSClassFromString(@"_ANEInMemoryModel");
+    g_AR = NSClassFromString(@"_ANERequest");
+    g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
+}
+
+static IOSurfaceRef make_surface(size_t bytes) {
+    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
+        (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,
+        (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes),
+        (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0});
+}
+
+static NSData *build_blob(const float *w, int rows, int cols) {
+    int wsize = rows * cols * 2;
+    int total = 128 + wsize;
+    uint8_t *buf = (uint8_t*)calloc(total, 1);
+    buf[0] = 0x01; buf[4] = 0x02;
+    buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE;
+    buf[68] = 0x01;
+    *(uint32_t*)(buf+72) = wsize;
+    *(uint32_t*)(buf+80) = 128;
+    _Float16 *fp16 = (_Float16*)(buf + 128);
+    for (int i = 0; i < rows * cols; i++) fp16[i] = (_Float16)w[i];
+    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
+}
+
+static NSData *build_blob_transposed(const float *w, int rows, int cols) {
+    int wsize = cols * rows * 2;
+    int total = 128 + wsize;
+    uint8_t *buf = (uint8_t*)calloc(total, 1);
+    buf[0] = 0x01; buf[4] = 0x02;
+    buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE;
+    buf[68] = 0x01;
+    *(uint32_t*)(buf+72) = wsize;
+    *(uint32_t*)(buf+80) = 128;
+    _Float16 *fp16 = (_Float16*)(buf + 128);
+    for (int i = 0; i < rows; i++)
+        for (int j = 0; j < cols; j++)
+            fp16[j * rows + i] = (_Float16)w[i * cols + j];
+    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
+}
+
+static NSString *gen_conv_mil(int in_ch, int out_ch, int sp) {
+    return [NSString stringWithFormat:
+        @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
+        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
+        "{\"coremltools-version\", \"9.0\"}})]\n{\n"
+        "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+        "        string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
+        "        string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
+        "        tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+        "        tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> y16 = conv(dilations = dl, groups = gr, pad = pd, "
+        "pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n"
+        "        string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n"
+        "    } -> (y);\n}\n",
+        in_ch, sp, in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp, out_ch, sp];
+}
+
+typedef struct {
+    void *model;    // CFBridgingRetain'd _ANEInMemoryModel
+    IOSurfaceRef ioIn, ioOut;
+    void *request;  // CFBridgingRetain'd _ANERequest
+    void *tmpDir;   // CFBridgingRetain'd NSString
+} Kern;
+
+static int g_compile_count = 0;
+
+static Kern *compile_kern_with_blob(NSData *blob, int in_ch, int out_ch, int sp) {
+    @autoreleasepool {
+    NSString *mil = gen_conv_mil(in_ch, out_ch, sp);
+    NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding];
+    NSDictionary *wd = @{@"@model_path/weights/weight.bin":@{@"offset":@0,@"data":blob}};
+    id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), milData, wd, nil);
+    if (!desc) return NULL;
+    id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
+    id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
+    NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
+    NSFileManager *fm = [NSFileManager defaultManager];
+    [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];
+    [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+    [blob writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
+    NSError *e = nil;
+    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) return NULL;
+    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) return NULL;
+    __sync_fetch_and_add(&g_compile_count, 1);
+    size_t inB = in_ch * sp * 4, outB = out_ch * sp * 4;
+    IOSurfaceRef ioI = make_surface(inB), ioO = make_surface(outB);
+    id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI);
+    id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO);
+    id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+        @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+        @[wI], @[@0], @[wO], @[@0], nil, nil, @0);
+    Kern *k = calloc(1, sizeof(Kern));
+    k->model = CFBridgingRetain(mdl);
+    k->ioIn = ioI; k->ioOut = ioO;
+    k->request = CFBridgingRetain(req);
+    k->tmpDir = CFBridgingRetain(td);
+    return k;
+    }
+}
+
+static void free_kern(Kern *k) {
+    if (!k) return;
+    id mdl = (__bridge id)k->model;
+    NSError *e = nil;
+    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
+    CFRelease(k->ioIn); CFRelease(k->ioOut);
+    NSString *td = (__bridge id)k->tmpDir;
+    [[NSFileManager defaultManager] removeItemAtPath:td error:nil];
+    CFRelease(k->model);
+    CFRelease(k->request);
+    CFRelease(k->tmpDir);
+    free(k);
+}
+
+static void ane_eval_k(Kern *k, const float *in, float *out, int in_ch, int out_ch, int sp) {
+    float *tmp = (float*)malloc(in_ch * sp * sizeof(float));
+    for (int t = 0; t < sp; t++)
+        for (int c = 0; c < in_ch; c++)
+            tmp[c*sp + t] = in[t*in_ch + c];
+    IOSurfaceLock(k->ioIn, 0, NULL);
+    memcpy(IOSurfaceGetBaseAddress(k->ioIn), tmp, in_ch * sp * sizeof(float));
+    IOSurfaceUnlock(k->ioIn, 0, NULL);
+    free(tmp);
+    NSError *e = nil;
+    id mdl = (__bridge id)k->model;
+    id req = (__bridge id)k->request;
+    ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+        mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
+    float *tmp2 = (float*)malloc(out_ch * sp * sizeof(float));
+    IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
+    memcpy(tmp2, IOSurfaceGetBaseAddress(k->ioOut), out_ch * sp * sizeof(float));
+    IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
+    for (int t = 0; t < sp; t++)
+        for (int c = 0; c < out_ch; c++)
+            out[t*out_ch + c] = tmp2[c*sp + t];
+    free(tmp2);
+}
+
+// === Checkpoint: save/restore training state for exec() restart ===
+#define CKPT_PATH "/tmp/ane_train_ckpt.bin"
+
+typedef struct {
+    int step;
+    float loss;
+    int D, H, S, total_steps;
+    float lr;
+    double cum_compile_ms, cum_train_ms, cum_wall_ms;
+    int cum_steps, cum_batches;
+} CkptHeader;
+
+static void save_checkpoint(const char *path, int step, float loss,
+                            int D, int H, int S, int total_steps, float lr,
+                            const float *W1, const float *W2,
+                            double cc, double ct, double cw, int cs, int cb) {
+    FILE *f = fopen(path, "wb");
+    CkptHeader hdr = {step, loss, D, H, S, total_steps, lr, cc, ct, cw, cs, cb};
+    fwrite(&hdr, sizeof(hdr), 1, f);
+    fwrite(W1, sizeof(float), H * D, f);
+    fwrite(W2, sizeof(float), D * H, f);
+    fclose(f);
+}
+
+static bool load_checkpoint(const char *path, CkptHeader *hdr,
+                            float *W1, float *W2, int H, int D) {
+    FILE *f = fopen(path, "rb");
+    if (!f) return false;
+    fread(hdr, sizeof(CkptHeader), 1, f);
+    fread(W1, sizeof(float), H * D, f);
+    fread(W2, sizeof(float), D * H, f);
+    fclose(f);
+    return true;
+}
+
+#define MAX_COMPILES 100
+#define KERNELS_PER_STEP 4
+#define ACCUM_STEPS 10
+
+// === Pipeline: background compile via GCD ===
+typedef struct {
+    Kern *k1_fwd, *k2_fwd, *k1_bwd, *k2_bwd;
+    float *W1, *W2;
+    int D, H, S;
+    bool ok;
+    double compile_ms;
+} PipelineCompile;
+
+static double tb_to_ms(uint64_t elapsed, mach_timebase_info_data_t tb) {
+    return (double)elapsed * tb.numer / tb.denom / 1e6;
+}
+
+static mach_timebase_info_data_t g_tb;
+// Serial queue ensures ANE compiles don't overlap with each other
+static dispatch_queue_t g_compile_queue;
+
+int main(int argc, char *argv[]) {
+    @autoreleasepool {
+        setbuf(stdout, NULL);
+        ane_init();
+        mach_timebase_info(&g_tb);
+        g_compile_queue = dispatch_queue_create("ane.compile", DISPATCH_QUEUE_SERIAL);
+
+        int D = 64, H = 128, S = 16;
+        int total_steps = 2000;
+        float lr = 1.0f;
+        int start_step = 0;
+        bool resuming = false;
+
+        float *W1 = (float*)malloc(H * D * sizeof(float));
+        float *W2 = (float*)malloc(D * H * sizeof(float));
+
+        if (argc > 1 && strcmp(argv[1], "--resume") == 0) {
+            CkptHeader hdr;
+            if (load_checkpoint(CKPT_PATH, &hdr, W1, W2, H, D)) {
+                start_step = hdr.step;
+                total_steps = hdr.total_steps;
+                lr = hdr.lr;
+                resuming = true;
+                printf("[RESUMED at step %d, loss=%.6f, compiles reset]\n", start_step, hdr.loss);
+            }
+        }
+
+        // Cumulative stats (restored from checkpoint if resuming)
+        double cum_compile_ms = 0, cum_train_ms = 0, cum_wall_ms = 0;
+        int cum_steps = 0, cum_batches = 0;
+        if (resuming) {
+            CkptHeader hdr2;
+            FILE *f = fopen(CKPT_PATH, "rb");
+            if (f) { fread(&hdr2, sizeof(hdr2), 1, f); fclose(f);
+                cum_compile_ms = hdr2.cum_compile_ms;
+                cum_train_ms = hdr2.cum_train_ms;
+                cum_wall_ms = hdr2.cum_wall_ms;
+                cum_steps = hdr2.cum_steps;
+                cum_batches = hdr2.cum_batches;
+            }
+        }
+
+        // FLOPs calculation
+        // Forward: W1[H,D] @ x[D,S] = 2*H*D*S, W2[D,H] @ h[H,S] = 2*D*H*S → total fwd = 4*D*H*S
+        // Backward dx: W2^T[H,D] @ dy[D,S] = 2*H*D*S, W1^T[D,H] @ dh[H,S] = 2*D*H*S → total bwd = 4*D*H*S
+        // dW (CPU): dW2[D,H] = dy[D,S] @ h^T[S,H] = 2*D*S*H, dW1 same → total dW = 4*D*H*S
+        // ANE FLOPs per step = 8*D*H*S (fwd + bwd on ANE)
+        // CPU FLOPs per step = 4*D*H*S (dW accumulation)
+        // Total FLOPs per step = 12*D*H*S
+        double ane_flops_per_step = 8.0 * D * H * S;
+        double cpu_flops_per_step = 4.0 * D * H * S;
+        double total_flops_per_step = ane_flops_per_step + cpu_flops_per_step;
+        double weight_bytes = (H*D + D*H) * 2.0; // FP16 weights on ANE
+
+        if (!resuming) {
+            for (int i = 0; i < H*D; i++) W1[i] = 0.01f * sinf(i * 1.3f + 0.7f);
+            for (int i = 0; i < D*H; i++) W2[i] = 0.01f * cosf(i * 0.9f + 1.1f);
+            printf("=== ANE Training: Pipeline Parallel + Grad Accumulation ===\n");
+            printf("x:[%d,%d] -> W1:[%d,%d] -> ReLU -> W2:[%d,%d] -> y:[%d,%d]\n", S,D, H,D, D,H, S,D);
+            printf("Accum %d steps per recompile | Pipeline: compile overlaps ANE eval\n", ACCUM_STEPS);
+            printf("ANE FP16 peak: 15.8 TFLOPS (M4) | Weights: %.1f KB\n\n", weight_bytes/1024.0);
+            printf("FLOPs/step: ANE=%.0f (fwd+bwd)  CPU=%.0f (dW)  Total=%.0f\n",
+                   ane_flops_per_step, cpu_flops_per_step, total_flops_per_step);
+            printf("Steps: %d, LR: %.4f, exec() budget: %d compiles\n\n",
+                   total_steps, lr, MAX_COMPILES);
+        }
+
+        float *x = (float*)calloc(S * D, sizeof(float));
+        float *y_target = (float*)calloc(S * D, sizeof(float));
+        for (int t = 0; t < S; t++)
+            for (int i = 0; i < D; i++) {
+                float v = sinf((t * D + i) * 0.1f);
+                x[t*D + i] = v;
+                y_target[t*D + i] = v;
+            }
+
+        float *h = (float*)malloc(S * H * sizeof(float));
+        float *h_relu = (float*)malloc(S * H * sizeof(float));
+        float *y = (float*)malloc(S * D * sizeof(float));
+        float *dy = (float*)malloc(S * D * sizeof(float));
+        float *dh_relu = (float*)malloc(S * H * sizeof(float));
+        float *dh = (float*)malloc(S * H * sizeof(float));
+        float *dx_layer = (float*)malloc(S * D * sizeof(float));
+
+        Kern *k1_fwd = NULL, *k2_fwd = NULL;
+        Kern *k1_bwd = NULL, *k2_bwd = NULL;
+        float last_loss = 999.0f;
+
+        // Stats
+        double total_compile_ms = 0, total_train_ms = 0, total_wall_ms = 0;
+        double total_hidden_compile_ms = 0; // compile time hidden by pipeline
+        int total_batches = 0;
+        int total_steps_done = 0;
+        uint64_t t_wall_start = mach_absolute_time();
+
+        // First compile is synchronous (no pipeline yet)
+        {
+            uint64_t t0 = mach_absolute_time();
+            k1_fwd = compile_kern_with_blob(build_blob(W1, H, D), D, H, S);
+            k2_fwd = compile_kern_with_blob(build_blob(W2, D, H), H, D, S);
+            k2_bwd = compile_kern_with_blob(build_blob_transposed(W2, D, H), D, H, S);
+            k1_bwd = compile_kern_with_blob(build_blob_transposed(W1, H, D), H, D, S);
+            double cms = tb_to_ms(mach_absolute_time() - t0, g_tb);
+            total_compile_ms += cms;
+            if (!k1_fwd || !k2_fwd || !k1_bwd || !k2_bwd) {
+                printf("Initial compile failed!\n"); return 1;
+            }
+            printf("Initial compile: %.0fms\n", cms);
+        }
+
+        int step = start_step;
+        while (step < total_steps) {
+            // Check compile budget
+            if (g_compile_count + KERNELS_PER_STEP > MAX_COMPILES) {
+                free_kern(k1_fwd); free_kern(k2_fwd);
+                free_kern(k1_bwd); free_kern(k2_bwd);
+                save_checkpoint(CKPT_PATH, step, last_loss, D, H, S, total_steps, lr, W1, W2,
+                                    cum_compile_ms + total_compile_ms, cum_train_ms + total_train_ms,
+                                    cum_wall_ms + tb_to_ms(mach_absolute_time() - t_wall_start, g_tb),
+                                    cum_steps + total_steps_done, cum_batches + total_batches);
+                double wall = tb_to_ms(mach_absolute_time() - t_wall_start, g_tb);
+                printf("[exec() restart at step %d, %d compiles, loss=%.6f, wall=%.0fms]\n",
+                       step, g_compile_count, last_loss, wall);
+                fflush(stdout);
+                execl(argv[0], argv[0], "--resume", NULL);
+                perror("execl failed"); return 1;
+            }
+
+            // === Run ACCUM_STEPS with current kernels ===
+            float *aW1 = (float*)calloc(H * D, sizeof(float));
+            float *aW2 = (float*)calloc(D * H, sizeof(float));
+            int steps_this_batch = 0;
+
+            // Pipeline: start compiling NEXT batch's kernels in background
+            // We'll apply gradients first, then launch compile with updated W
+            // But for pipeline, we compile AHEAD: while running batch N, compile for N+1
+            // So we need to update weights BEFORE launching background compile
+
+            uint64_t t_batch = mach_absolute_time();
+            for (int a = 0; a < ACCUM_STEPS && step < total_steps; a++, step++) {
+                ane_eval_k(k1_fwd, x, h, D, H, S);
+                for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0;
+                ane_eval_k(k2_fwd, h_relu, y, H, D, S);
+
+                float loss = 0;
+                for (int i = 0; i < S*D; i++) {
+                    float diff = y[i] - y_target[i];
+                    loss += diff * diff;
+                    dy[i] = 2.0f * diff / (S * D);
+                }
+                loss /= (S * D);
+                last_loss = loss;
+
+                ane_eval_k(k2_bwd, dy, dh_relu, D, H, S);
+                for (int i = 0; i < S*H; i++) dh[i] = h[i] > 0 ? dh_relu[i] : 0;
+                ane_eval_k(k1_bwd, dh, dx_layer, H, D, S);
+
+                for (int t = 0; t < S; t++)
+                    for (int i = 0; i < D; i++)
+                        for (int j = 0; j < H; j++)
+                            aW2[i*H + j] += dy[t*D + i] * h_relu[t*H + j];
+                for (int t = 0; t < S; t++)
+                    for (int i = 0; i < H; i++)
+                        for (int j = 0; j < D; j++)
+                            aW1[i*D + j] += dh[t*H + i] * x[t*D + j];
+
+                steps_this_batch++;
+            }
+            double batch_ms = tb_to_ms(mach_absolute_time() - t_batch, g_tb);
+            total_train_ms += batch_ms;
+
+            // Apply accumulated gradients
+            float scale = 1.0f / steps_this_batch;
+            for (int i = 0; i < H*D; i++) W1[i] -= lr * aW1[i] * scale;
+            for (int i = 0; i < D*H; i++) W2[i] -= lr * aW2[i] * scale;
+            free(aW1); free(aW2);
+
+            total_steps_done += steps_this_batch;
+            total_batches++;
+
+            // Print progress
+            double step_ms = batch_ms / steps_this_batch;
+            double ane_gflops = (ane_flops_per_step * steps_this_batch) / (batch_ms * 1e6);
+            double total_gflops = (total_flops_per_step * steps_this_batch) / (batch_ms * 1e6);
+
+            if (total_batches % 5 == 1 || total_batches <= 2 || step >= total_steps) {
+                printf("step %-5d loss=%-10.6f  %5.1fms/step  ANE=%.2f GFLOPS  total=%.2f GFLOPS  compiles=%d\n",
+                       step - steps_this_batch, last_loss, step_ms, ane_gflops, total_gflops, g_compile_count);
+            }
+
+            // Pipeline: launch background compile with updated weights,
+            // then immediately start NEXT batch's ANE evals with OLD kernels
+            // while compile runs concurrently on GCD queue
+            bool can_pipeline = (step < total_steps) && (g_compile_count + KERNELS_PER_STEP <= MAX_COMPILES);
+
+            if (can_pipeline) {
+                // Snapshot weights for background compile
+                PipelineCompile *pc = calloc(1, sizeof(PipelineCompile));
+                pc->W1 = (float*)malloc(H * D * sizeof(float));
+                pc->W2 = (float*)malloc(D * H * sizeof(float));
+                memcpy(pc->W1, W1, H * D * sizeof(float));
+                memcpy(pc->W2, W2, D * H * sizeof(float));
+                pc->D = D; pc->H = H; pc->S = S;
+
+                dispatch_semaphore_t sem = dispatch_semaphore_create(0);
+
+                dispatch_async(g_compile_queue, ^{
+                    @autoreleasepool {
+                        uint64_t t0 = mach_absolute_time();
+                        pc->k1_fwd = compile_kern_with_blob(build_blob(pc->W1, pc->H, pc->D), pc->D, pc->H, pc->S);
+                        pc->k2_fwd = compile_kern_with_blob(build_blob(pc->W2, pc->D, pc->H), pc->H, pc->D, pc->S);
+                        pc->k2_bwd = compile_kern_with_blob(build_blob_transposed(pc->W2, pc->D, pc->H), pc->D, pc->H, pc->S);
+                        pc->k1_bwd = compile_kern_with_blob(build_blob_transposed(pc->W1, pc->H, pc->D), pc->H, pc->D, pc->S);
+                        pc->compile_ms = tb_to_ms(mach_absolute_time() - t0, g_tb);
+                        pc->ok = pc->k1_fwd && pc->k2_fwd && pc->k1_bwd && pc->k2_bwd;
+                        dispatch_semaphore_signal(sem);
+                    }
+                });
+
+                // === While compile runs in background, do ANOTHER batch with OLD kernels ===
+                if (step < total_steps && k1_fwd && k2_fwd && k1_bwd && k2_bwd) {
+                    float *aW1b = (float*)calloc(H * D, sizeof(float));
+                    float *aW2b = (float*)calloc(D * H, sizeof(float));
+                    int steps_overlap = 0;
+                    uint64_t t_overlap = mach_absolute_time();
+
+                    for (int a = 0; a < ACCUM_STEPS && step < total_steps; a++, step++) {
+                        ane_eval_k(k1_fwd, x, h, D, H, S);
+                        for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0;
+                        ane_eval_k(k2_fwd, h_relu, y, H, D, S);
+
+                        float loss = 0;
+                        for (int i = 0; i < S*D; i++) {
+                            float diff = y[i] - y_target[i];
+                            loss += diff * diff;
+                            dy[i] = 2.0f * diff / (S * D);
+                        }
+                        loss /= (S * D);
+                        last_loss = loss;
+
+                        ane_eval_k(k2_bwd, dy, dh_relu, D, H, S);
+                        for (int i = 0; i < S*H; i++) dh[i] = h[i] > 0 ? dh_relu[i] : 0;
+                        ane_eval_k(k1_bwd, dh, dx_layer, H, D, S);
+
+                        for (int t = 0; t < S; t++)
+                            for (int i = 0; i < D; i++)
+                                for (int j = 0; j < H; j++)
+                                    aW2b[i*H + j] += dy[t*D + i] * h_relu[t*H + j];
+                        for (int t = 0; t < S; t++)
+                            for (int i = 0; i < H; i++)
+                                for (int j = 0; j < D; j++)
+                                    aW1b[i*D + j] += dh[t*H + i] * x[t*D + j];
+                        steps_overlap++;
+                    }
+                    double overlap_ms = tb_to_ms(mach_absolute_time() - t_overlap, g_tb);
+                    total_train_ms += overlap_ms;
+                    total_steps_done += steps_overlap;
+                    total_batches++;
+
+                    // Apply these gradients with reduced LR (stale weights — 1 batch behind)
+                    float sc = 0.5f / steps_overlap; // half LR for stale batch
+                    for (int i = 0; i < H*D; i++) W1[i] -= lr * aW1b[i] * sc;
+                    for (int i = 0; i < D*H; i++) W2[i] -= lr * aW2b[i] * sc;
+                    free(aW1b); free(aW2b);
+
+                    if (total_batches % 5 == 1) {
+                        double sm = overlap_ms / steps_overlap;
+                        printf("step %-5d loss=%-10.6f  %5.1fms/step  (overlapped with compile)  compiles=%d\n",
+                               step - steps_overlap, last_loss, sm, g_compile_count);
+                    }
+                }
+
+                // Wait for compile to finish
+                dispatch_semaphore_wait(sem, DISPATCH_TIME_FOREVER);
+                total_compile_ms += pc->compile_ms;
+                total_hidden_compile_ms += pc->compile_ms; // all hidden behind train
+
+                free_kern(k1_fwd); free_kern(k2_fwd);
+                free_kern(k1_bwd); free_kern(k2_bwd);
+
+                if (pc->ok) {
+                    k1_fwd = pc->k1_fwd; k2_fwd = pc->k2_fwd;
+                    k1_bwd = pc->k1_bwd; k2_bwd = pc->k2_bwd;
+                } else {
+                    k1_fwd = k2_fwd = k1_bwd = k2_bwd = NULL;
+                }
+                free(pc->W1); free(pc->W2); free(pc);
+            } else if (step < total_steps) {
+                // Synchronous compile (no budget for pipeline)
+                uint64_t t0 = mach_absolute_time();
+                free_kern(k1_fwd); free_kern(k2_fwd);
+                free_kern(k1_bwd); free_kern(k2_bwd);
+                k1_fwd = compile_kern_with_blob(build_blob(W1, H, D), D, H, S);
+                k2_fwd = compile_kern_with_blob(build_blob(W2, D, H), H, D, S);
+                k2_bwd = compile_kern_with_blob(build_blob_transposed(W2, D, H), D, H, S);
+                k1_bwd = compile_kern_with_blob(build_blob_transposed(W1, H, D), H, D, S);
+                double cms = tb_to_ms(mach_absolute_time() - t0, g_tb);
+                total_compile_ms += cms;
+                if (!k1_fwd || !k2_fwd || !k1_bwd || !k2_bwd) {
+                    save_checkpoint(CKPT_PATH, step, last_loss, D, H, S, total_steps, lr, W1, W2,
+                                    cum_compile_ms + total_compile_ms, cum_train_ms + total_train_ms,
+                                    cum_wall_ms + tb_to_ms(mach_absolute_time() - t_wall_start, g_tb),
+                                    cum_steps + total_steps_done, cum_batches + total_batches);
+                    fflush(stdout);
+                    execl(argv[0], argv[0], "--resume", NULL);
+                    perror("execl failed"); return 1;
+                }
+            }
+
+            if (last_loss < 1e-6f) { printf("\nConverged at step %d!\n", step); break; }
+        }
+
+        total_wall_ms = tb_to_ms(mach_absolute_time() - t_wall_start, g_tb);
+        // Add cumulative from previous exec() runs
+        total_compile_ms += cum_compile_ms;
+        total_train_ms += cum_train_ms;
+        total_wall_ms += cum_wall_ms;
+        total_steps_done += cum_steps;
+        total_batches += cum_batches;
+
+        // === Final output ===
+        printf("\nFinal output vs target (first 8):\n");
+        if (k1_fwd && k2_fwd) {
+            ane_eval_k(k1_fwd, x, h, D, H, S);
+            for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0;
+            ane_eval_k(k2_fwd, h_relu, y, H, D, S);
+        }
+        printf("  y:      "); for (int i = 0; i < 8; i++) printf("%.4f ", y[i]); printf("\n");
+        printf("  target: "); for (int i = 0; i < 8; i++) printf("%.4f ", y_target[i]); printf("\n");
+
+        // === Efficiency Report ===
+        printf("\n=== Efficiency Report ===\n");
+        printf("Total steps:     %d\n", total_steps_done);
+        printf("Total batches:   %d (accum %d steps each)\n", total_batches, ACCUM_STEPS);
+        printf("Wall time:       %.0f ms\n", total_wall_ms);
+        printf("Compile time:    %.0f ms (%.1f%%)\n", total_compile_ms, 100.0*total_compile_ms/total_wall_ms);
+        printf("Train time:      %.0f ms (%.1f%%)\n", total_train_ms, 100.0*total_train_ms/total_wall_ms);
+        printf("Overhead:        %.0f ms (%.1f%%)\n",
+               total_wall_ms - total_compile_ms - total_train_ms,
+               100.0*(total_wall_ms - total_compile_ms - total_train_ms)/total_wall_ms);
+        printf("\n");
+        printf("Avg compile:     %.1f ms per batch (4 kernels)\n", total_compile_ms / total_batches);
+        printf("Avg train:       %.2f ms per step (ANE fwd+bwd + CPU dW)\n", total_train_ms / total_steps_done);
+        printf("Avg wall/step:   %.2f ms\n", total_wall_ms / total_steps_done);
+        printf("\n");
+        double ane_total_flops = ane_flops_per_step * total_steps_done;
+        double cpu_total_flops = cpu_flops_per_step * total_steps_done;
+        printf("ANE FLOPs total: %.3f MFLOP  (%.2f GFLOPS sustained)\n",
+               ane_total_flops / 1e6, ane_total_flops / (total_train_ms * 1e6));
+        printf("CPU FLOPs total: %.3f MFLOP  (%.2f GFLOPS sustained)\n",
+               cpu_total_flops / 1e6, cpu_total_flops / (total_train_ms * 1e6));
+        printf("Total FLOPs:     %.3f MFLOP  (%.2f GFLOPS sustained)\n",
+               (ane_total_flops + cpu_total_flops) / 1e6,
+               (ane_total_flops + cpu_total_flops) / (total_train_ms * 1e6));
+        printf("\n");
+        printf("ANE utilization: %.4f%% of 15.8 TFLOPS peak\n",
+               100.0 * ane_total_flops / (total_train_ms * 1e6) / 15800.0);
+        printf("Weight params:   %d (%.1f KB FP16)\n",
+               H*D + D*H, weight_bytes / 1024.0);
+        printf("Compile amortization: %.1f ms compile / %d steps = %.2f ms/step overhead\n",
+               total_compile_ms / total_batches, ACCUM_STEPS,
+               total_compile_ms / total_batches / ACCUM_STEPS);
+        printf("Compile fraction: %.1f%% of wall time\n", 100.0 * total_compile_ms / total_wall_ms);
+        printf("Train fraction:   %.1f%% of wall time (useful work)\n", 100.0 * total_train_ms / total_wall_ms);
+
+        free_kern(k1_fwd); free_kern(k2_fwd); free_kern(k1_bwd); free_kern(k2_bwd);
+        free(W1); free(W2); free(x); free(y_target);
+        free(h); free(h_relu); free(y); free(dy); free(dh_relu); free(dh); free(dx_layer);
+        unlink(CKPT_PATH);
+    }
+    return 0;
+}
diff --git a/training/tiny_train_old.m b/training/tiny_train_old.m
new file mode 100644
index 0000000..c22a90c
--- /dev/null
+++ b/training/tiny_train_old.m
@@ -0,0 +1,309 @@
+// tiny_train.m — Train a 2-layer linear model on ANE (forward AND backward)
+// y = W2 @ relu(W1 @ x), MSE loss, SGD update
+// Forward: ANE conv with baked weights
+// Backward dx: ANE conv with transposed baked weights
+// Backward dW: CPU (outer product, memory-bound)
+#import <Foundation/Foundation.h>
+#import <objc/runtime.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <IOSurface/IOSurface.h>
+#import <mach/mach_time.h>
+#include <math.h>
+
+static Class g_D, g_I, g_AR, g_AIO;
+
+static void ane_init(void) {
+    dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
+    g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
+    g_I  = NSClassFromString(@"_ANEInMemoryModel");
+    g_AR = NSClassFromString(@"_ANERequest");
+    g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
+}
+
+static IOSurfaceRef make_surface(size_t bytes) {
+    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
+        (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,
+        (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes),
+        (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0});
+}
+
+static NSData *build_blob(const float *w, int rows, int cols) {
+    int wsize = rows * cols * 2;
+    int total = 128 + wsize;
+    uint8_t *buf = (uint8_t*)calloc(total, 1);
+    buf[0] = 0x01; buf[4] = 0x02;
+    buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE;
+    buf[68] = 0x01;
+    *(uint32_t*)(buf+72) = wsize;
+    *(uint32_t*)(buf+80) = 128;
+    _Float16 *fp16 = (_Float16*)(buf + 128);
+    for (int i = 0; i < rows * cols; i++) fp16[i] = (_Float16)w[i];
+    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
+}
+
+// Build blob with TRANSPOSED weights: W[rows,cols] → W^T[cols,rows]
+static NSData *build_blob_transposed(const float *w, int rows, int cols) {
+    int wsize = cols * rows * 2;
+    int total = 128 + wsize;
+    uint8_t *buf = (uint8_t*)calloc(total, 1);
+    buf[0] = 0x01; buf[4] = 0x02;
+    buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE;
+    buf[68] = 0x01;
+    *(uint32_t*)(buf+72) = wsize;
+    *(uint32_t*)(buf+80) = 128;
+    _Float16 *fp16 = (_Float16*)(buf + 128);
+    for (int i = 0; i < rows; i++)
+        for (int j = 0; j < cols; j++)
+            fp16[j * rows + i] = (_Float16)w[i * cols + j]; // transpose
+    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
+}
+
+static NSString *gen_conv_mil(int in_ch, int out_ch, int sp) {
+    return [NSString stringWithFormat:
+        @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
+        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
+        "{\"coremltools-version\", \"9.0\"}})]\n{\n"
+        "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+        "        string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
+        "        string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
+        "        tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+        "        tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> y16 = conv(dilations = dl, groups = gr, pad = pd, "
+        "pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n"
+        "        string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n"
+        "    } -> (y);\n}\n",
+        in_ch, sp, in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp, out_ch, sp];
+}
+
+typedef struct {
+    id model;
+    IOSurfaceRef ioIn, ioOut;
+    id request;
+    NSString *tmpDir;
+} Kern;
+
+static Kern *compile_kern_with_blob(NSData *blob, int in_ch, int out_ch, int sp) {
+    NSString *mil = gen_conv_mil(in_ch, out_ch, sp);
+    NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding];
+    NSDictionary *wd = @{@"@model_path/weights/weight.bin":@{@"offset":@0,@"data":blob}};
+    id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), milData, wd, nil);
+    if (!desc) return NULL;
+    id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
+    id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
+    NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
+    NSFileManager *fm = [NSFileManager defaultManager];
+    [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];
+    [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+    [blob writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
+    NSError *e = nil;
+    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) return NULL;
+    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) return NULL;
+    size_t inB = in_ch * sp * 4, outB = out_ch * sp * 4;
+    IOSurfaceRef ioI = make_surface(inB), ioO = make_surface(outB);
+    id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI);
+    id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO);
+    id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+        @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+        @[wI], @[@0], @[wO], @[@0], nil, nil, @0);
+    Kern *k = calloc(1, sizeof(Kern));
+    k->model = mdl; k->ioIn = ioI; k->ioOut = ioO; k->request = req; k->tmpDir = td;
+    return k;
+}
+
+static void free_kern(Kern *k) {
+    if (!k) return;
+    NSError *e = nil;
+    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(k->model, @selector(unloadWithQoS:error:), 21, &e);
+    CFRelease(k->ioIn); CFRelease(k->ioOut);
+    [[NSFileManager defaultManager] removeItemAtPath:k->tmpDir error:nil];
+    free(k);
+}
+
+// ANE eval: input [S, in_ch] row-major ↔ [in_ch, S] channels-first
+static void ane_eval(Kern *k, const float *in, float *out, int in_ch, int out_ch, int sp) {
+    float *tmp = (float*)malloc(in_ch * sp * sizeof(float));
+    for (int t = 0; t < sp; t++)
+        for (int c = 0; c < in_ch; c++)
+            tmp[c*sp + t] = in[t*in_ch + c];
+    IOSurfaceLock(k->ioIn, 0, NULL);
+    memcpy(IOSurfaceGetBaseAddress(k->ioIn), tmp, in_ch * sp * sizeof(float));
+    IOSurfaceUnlock(k->ioIn, 0, NULL);
+    free(tmp);
+    NSError *e = nil;
+    ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+        k->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, k->request, &e);
+    float *tmp2 = (float*)malloc(out_ch * sp * sizeof(float));
+    IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
+    memcpy(tmp2, IOSurfaceGetBaseAddress(k->ioOut), out_ch * sp * sizeof(float));
+    IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
+    for (int t = 0; t < sp; t++)
+        for (int c = 0; c < out_ch; c++)
+            out[t*out_ch + c] = tmp2[c*sp + t];
+    free(tmp2);
+}
+
+int main(int argc, char *argv[]) {
+    @autoreleasepool {
+        ane_init();
+        mach_timebase_info_data_t tb;
+        mach_timebase_info(&tb);
+
+        int D = 64, H = 128, S = 16;
+        int steps = 25; // 4 kernels × 25 = 100 compiles, under 119 limit
+        float lr = 0.5f;
+        int recompile_every = 1; // recompile every step for correct gradients
+
+        float *W1 = (float*)malloc(H * D * sizeof(float));
+        float *W2 = (float*)malloc(D * H * sizeof(float));
+        for (int i = 0; i < H*D; i++) W1[i] = 0.01f * sinf(i * 1.3f + 0.7f);
+        for (int i = 0; i < D*H; i++) W2[i] = 0.01f * cosf(i * 0.9f + 1.1f);
+
+        float *x = (float*)calloc(S * D, sizeof(float));
+        float *y_target = (float*)calloc(S * D, sizeof(float));
+        for (int t = 0; t < S; t++)
+            for (int i = 0; i < D; i++) {
+                float v = sinf((t * D + i) * 0.1f);
+                x[t*D + i] = v;
+                y_target[t*D + i] = v;
+            }
+
+        printf("=== Tiny 2-Layer ANE Training (Forward + Backward on ANE) ===\n");
+        printf("x:[%d,%d] → W1:[%d,%d] → ReLU → W2:[%d,%d] → y:[%d,%d]\n", S,D, H,D, D,H, S,D);
+        printf("Forward: ANE conv | Backward dx: ANE conv(W^T) | Backward dW: CPU\n");
+        printf("Steps: %d, LR: %.4f, Recompile every %d steps\n\n", steps, lr, recompile_every);
+
+        float *h = (float*)malloc(S * H * sizeof(float));
+        float *h_relu = (float*)malloc(S * H * sizeof(float));
+        float *y = (float*)malloc(S * D * sizeof(float));
+        float *dy = (float*)malloc(S * D * sizeof(float));
+        float *dh_relu = (float*)malloc(S * H * sizeof(float));
+        float *dh = (float*)malloc(S * H * sizeof(float));
+        float *dx_layer = (float*)malloc(S * D * sizeof(float)); // not used for update but proves backward works
+        float *dW1 = (float*)calloc(H * D, sizeof(float));
+        float *dW2 = (float*)calloc(D * H, sizeof(float));
+
+        // 4 ANE kernels: 2 forward + 2 backward (transposed weights)
+        Kern *k1_fwd = NULL, *k2_fwd = NULL;  // W1: [H,D]→conv(D→H), W2: [D,H]→conv(H→D)
+        Kern *k1_bwd = NULL, *k2_bwd = NULL;  // W1^T: [D,H]→conv(H→D), W2^T: [H,D]→conv(D→H)
+        bool on_ane = true;
+
+        printf("%-6s %-12s %-10s %-6s\n", "Step", "MSE Loss", "ms/step", "Backend");
+        printf("--------------------------------------\n");
+
+        for (int step = 0; step < steps; step++) {
+            uint64_t t0 = mach_absolute_time();
+
+            if (on_ane && step % recompile_every == 0) {
+                free_kern(k1_fwd); free_kern(k2_fwd);
+                free_kern(k1_bwd); free_kern(k2_bwd);
+                k1_fwd = k2_fwd = k1_bwd = k2_bwd = NULL;
+                @autoreleasepool {
+                    k1_fwd = compile_kern_with_blob(build_blob(W1, H, D), D, H, S);
+                    k2_fwd = compile_kern_with_blob(build_blob(W2, D, H), H, D, S);
+                    // Backward: dx = W^T @ dy → conv with transposed weight
+                    // W2^T: [H,D] as conv weight, input dy [1,D,1,S] → output dh [1,H,1,S]
+                    k2_bwd = compile_kern_with_blob(build_blob_transposed(W2, D, H), D, H, S);
+                    // W1^T: [D,H] as conv weight, input dh [1,H,1,S] → output dx [1,D,1,S]
+                    k1_bwd = compile_kern_with_blob(build_blob_transposed(W1, H, D), H, D, S);
+                }
+                if (!k1_fwd || !k2_fwd || !k1_bwd || !k2_bwd) {
+                    printf("ANE limit at step %d, continuing on CPU\n", step);
+                    free_kern(k1_fwd); free_kern(k2_fwd);
+                    free_kern(k1_bwd); free_kern(k2_bwd);
+                    k1_fwd = k2_fwd = k1_bwd = k2_bwd = NULL;
+                    on_ane = false;
+                }
+            }
+
+            if (on_ane) {
+                // === Forward on ANE ===
+                ane_eval(k1_fwd, x, h, D, H, S);
+                for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0;
+                ane_eval(k2_fwd, h_relu, y, H, D, S);
+            } else {
+                for (int t = 0; t < S; t++)
+                    for (int i = 0; i < H; i++) {
+                        float s = 0; for (int j = 0; j < D; j++) s += W1[i*D+j] * x[t*D+j];
+                        h[t*H+i] = s;
+                    }
+                for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0;
+                for (int t = 0; t < S; t++)
+                    for (int i = 0; i < D; i++) {
+                        float s = 0; for (int j = 0; j < H; j++) s += W2[i*H+j] * h_relu[t*H+j];
+                        y[t*D+i] = s;
+                    }
+            }
+
+            // MSE loss + dL/dy
+            float loss = 0;
+            for (int i = 0; i < S*D; i++) {
+                float diff = y[i] - y_target[i];
+                loss += diff * diff;
+                dy[i] = 2.0f * diff / (S * D);
+            }
+            loss /= (S * D);
+
+            if (on_ane) {
+                // === Backward dx on ANE ===
+                // dh_relu = W2^T @ dy (ANE conv with transposed W2)
+                ane_eval(k2_bwd, dy, dh_relu, D, H, S);
+                // ReLU backward (CPU, element-wise)
+                for (int i = 0; i < S*H; i++) dh[i] = h[i] > 0 ? dh_relu[i] : 0;
+                // dx = W1^T @ dh (ANE conv with transposed W1)
+                ane_eval(k1_bwd, dh, dx_layer, H, D, S);
+            } else {
+                memset(dh_relu, 0, S * H * sizeof(float));
+                for (int t = 0; t < S; t++)
+                    for (int j = 0; j < H; j++)
+                        for (int i = 0; i < D; i++)
+                            dh_relu[t*H + j] += W2[i*H + j] * dy[t*D + i];
+                for (int i = 0; i < S*H; i++) dh[i] = h[i] > 0 ? dh_relu[i] : 0;
+            }
+
+            // dW on CPU (outer products — memory-bound, not worth ANE)
+            memset(dW2, 0, D * H * sizeof(float));
+            for (int t = 0; t < S; t++)
+                for (int i = 0; i < D; i++)
+                    for (int j = 0; j < H; j++)
+                        dW2[i*H + j] += dy[t*D + i] * h_relu[t*H + j];
+            memset(dW1, 0, H * D * sizeof(float));
+            for (int t = 0; t < S; t++)
+                for (int i = 0; i < H; i++)
+                    for (int j = 0; j < D; j++)
+                        dW1[i*D + j] += dh[t*H + i] * x[t*D + j];
+
+            // SGD
+            for (int i = 0; i < H*D; i++) W1[i] -= lr * dW1[i];
+            for (int i = 0; i < D*H; i++) W2[i] -= lr * dW2[i];
+
+            double ms = (double)(mach_absolute_time() - t0) * tb.numer / tb.denom / 1e6;
+
+            if (step % 1 == 0 || step == steps - 1)
+                printf("%-6d %-12.6f %-10.1f %-6s\n", step, loss, ms, on_ane ? "ANE" : "CPU");
+
+            if (loss < 1e-6f) { printf("\nConverged at step %d!\n", step); break; }
+        }
+
+        printf("\nFinal output vs target (first 8):\n");
+        if (on_ane && k1_fwd && k2_fwd) {
+            ane_eval(k1_fwd, x, h, D, H, S);
+            for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0;
+            ane_eval(k2_fwd, h_relu, y, H, D, S);
+        }
+        printf("  y:      "); for (int i = 0; i < 8; i++) printf("%.4f ", y[i]); printf("\n");
+        printf("  target: "); for (int i = 0; i < 8; i++) printf("%.4f ", y_target[i]); printf("\n");
+
+        free_kern(k1_fwd); free_kern(k2_fwd); free_kern(k1_bwd); free_kern(k2_bwd);
+        free(W1); free(W2); free(x); free(y_target);
+        free(h); free(h_relu); free(y); free(dy); free(dh_relu); free(dh); free(dx_layer); free(dW1); free(dW2);
+        printf("\nDone.\n");
+    }
+    return 0;
+}
diff --git a/training/train.m b/training/train.m
new file mode 100644
index 0000000..6fd4a86
--- /dev/null
+++ b/training/train.m
@@ -0,0 +1,103 @@
+// train.m — Stories110M training loop on ANE
+// Usage: ./train <model.bin> [seq_len] [steps] [lr] [--cpu]
+#import <Foundation/Foundation.h>
+#import <mach/mach_time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "backward.h"
+
+static mach_timebase_info_data_t g_tb;
+static double ticksToMs(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }
+
+int main(int argc, char *argv[]) {
+    @autoreleasepool {
+        mach_timebase_info(&g_tb);
+
+        if (argc < 2) {
+            fprintf(stderr, "Usage: %s <model.bin> [seq_len=16] [steps=100] [lr=1e-4] [--cpu]\n", argv[0]);
+            return 1;
+        }
+
+        int seq_len = argc > 2 ? atoi(argv[2]) : 16;
+        int steps = argc > 3 ? atoi(argv[3]) : 100;
+        float lr = argc > 4 ? atof(argv[4]) : 1e-4f;
+        bool use_ane = true;
+        for (int i = 1; i < argc; i++)
+            if (strcmp(argv[i], "--cpu") == 0) use_ane = false;
+
+        printf("=== Stories110M ANE Training ===\n");
+        printf("Seq len: %d, Steps: %d, LR: %.2e, Backend: %s\n\n",
+               seq_len, steps, lr, use_ane ? "ANE" : "CPU");
+
+        Model m = {0};
+        printf("Loading weights...\n");
+        if (model_load_weights(&m, argv[1]) != 0) return 1;
+
+        if (use_ane) {
+            if (model_compile_kernels(&m, seq_len) != 0) {
+                fprintf(stderr, "ANE kernel compilation failed, falling back to CPU\n");
+                use_ane = false;
+            }
+        }
+        if (!use_ane) m.seq_len = seq_len;
+
+        model_alloc_training(&m);
+
+        // Training tokens: simple repeating pattern to overfit on
+        int *train_tokens = (int*)malloc(seq_len * sizeof(int));
+        for (int i = 0; i < seq_len; i++)
+            train_tokens[i] = (i * 7 + 13) % 256 + 1;
+
+        printf("\nTraining tokens (first 16): ");
+        for (int i = 0; i < 16 && i < seq_len; i++) printf("%d ", train_tokens[i]);
+        printf("...\n\n");
+
+        printf("%-6s %-10s %-12s %-10s %-10s\n", "Step", "Loss", "GradNorm", "ms/step", "tok/s");
+        printf("------------------------------------------------------\n");
+
+        int recompile_interval = 1; // Recompile ANE kernels every N steps
+        for (int step = 0; step < steps; step++) {
+            uint64_t t0 = mach_absolute_time();
+
+            float loss = model_forward(&m, train_tokens, use_ane);
+            if (isnan(loss) || isinf(loss)) {
+                printf("NaN/Inf loss at step %d, stopping.\n", step);
+                break;
+            }
+
+            model_backward(&m, train_tokens);
+            model_clip_gradients(&m, 1.0f);
+            model_adam_step(&m, lr, 0.9f, 0.999f, 1e-8f);
+
+            // Recompile ANE kernels with updated weights
+            if (use_ane && (step + 1) % recompile_interval == 0) {
+                if (model_recompile_kernels(&m) != 0) {
+                    printf("Recompile failed at step %d, switching to CPU\n", step);
+                    use_ane = false;
+                }
+            }
+
+            double ms = ticksToMs(mach_absolute_time() - t0);
+            double tps = (seq_len - 1) / (ms / 1000.0);
+
+            if (step % 10 == 0 || step == steps - 1) {
+                double gnorm = 0;
+                int d2 = m.cfg.dim;
+                for (int i = 0; i < d2*d2; i++) gnorm += (double)m.grad_wq[0][i]*m.grad_wq[0][i];
+                gnorm = sqrt(gnorm);
+                printf("%-6d %-10.4f %-12.4f %-10.1f %-10.1f\n", step, loss, gnorm, ms, tps);
+            }
+
+            if (loss < 0.01f) {
+                printf("\nConverged at step %d! Loss: %.6f\n", step, loss);
+                break;
+            }
+        }
+
+        free(train_tokens);
+        printf("\nDone.\n");
+    }
+    return 0;
+}
diff --git a/training/train_large.m b/training/train_large.m
new file mode 100644
index 0000000..55c1cf8
--- /dev/null
+++ b/training/train_large.m
@@ -0,0 +1,1005 @@
+// train_large.m — Train a single transformer layer FULLY on ANE
+// 7 ANE kernels per step:
+//   Forward:  kFwdAttn (QKV+SDPA+Wo, taps Q,K,V,attn_out) + kFwdFFN (W1+W3+SiLU+W2, taps h1,h3,silu_out)
+//   Backward: kFFNBwd (W2^T+SiLU_bwd+W1^T+W3^T) + kSdpaBwd1 (Wo^T+SDPA) + kSdpaBwd2 + kQKVb (Wq^T+Wk^T+Wv^T)
+// CPU: RMSNorm (fwd+bwd), residuals, loss, dW accumulation (cblas), SGD update
+// NO CPU recompute of Q,K,V,h1,h3 — all exposed via forward taps
+#import <Foundation/Foundation.h>
+#import <objc/runtime.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <IOSurface/IOSurface.h>
+#import <mach/mach_time.h>
+#import <Accelerate/Accelerate.h>
+#include <math.h>
+#include <unistd.h>
+#include <dispatch/dispatch.h>
+
+#define DIM 768
+#define HIDDEN 2048
+#define HEADS 12
+#define HD (DIM/HEADS)
+#define SEQ 512
+#define ACCUM_STEPS 100
+#define MAX_COMPILES 100
+#define NUM_KERNELS 6
+#define CKPT_PATH "/tmp/ane_large_ckpt.bin"
+
+static Class g_D, g_I, g_AR, g_AIO;
+static mach_timebase_info_data_t g_tb;
+static int g_compile_count = 0;
+
+static void ane_init(void) {
+    dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
+    g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
+    g_I  = NSClassFromString(@"_ANEInMemoryModel");
+    g_AR = NSClassFromString(@"_ANERequest");
+    g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
+}
+static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }
+static IOSurfaceRef make_surface(size_t bytes) {
+    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
+        (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,
+        (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes),
+        (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0});
+}
+static NSData *build_blob(const float *w, int rows, int cols) {
+    int ws=rows*cols*2, tot=128+ws;
+    uint8_t *b=(uint8_t*)calloc(tot,1);
+    b[0]=1;b[4]=2;b[64]=0xEF;b[65]=0xBE;b[66]=0xAD;b[67]=0xDE;b[68]=1;
+    *(uint32_t*)(b+72)=ws;*(uint32_t*)(b+80)=128;
+    _Float16 *fp16=(_Float16*)(b+128);
+    for(int i=0;i<rows*cols;i++) fp16[i]=(_Float16)w[i];
+    return [NSData dataWithBytesNoCopy:b length:tot freeWhenDone:YES];
+}
+static NSData *build_blob_t(const float *w, int rows, int cols) {
+    int ws=cols*rows*2, tot=128+ws;
+    uint8_t *b=(uint8_t*)calloc(tot,1);
+    b[0]=1;b[4]=2;b[64]=0xEF;b[65]=0xBE;b[66]=0xAD;b[67]=0xDE;b[68]=1;
+    *(uint32_t*)(b+72)=ws;*(uint32_t*)(b+80)=128;
+    _Float16 *fp16=(_Float16*)(b+128);
+    for(int i=0;i<rows;i++) for(int j=0;j<cols;j++) fp16[j*rows+i]=(_Float16)w[i*cols+j];
+    return [NSData dataWithBytesNoCopy:b length:tot freeWhenDone:YES];
+}
+static NSData *build_blob_fp16(_Float16 *d, int cnt) {
+    int ws=cnt*2, tot=128+ws;
+    uint8_t *b=(uint8_t*)calloc(tot,1);
+    b[0]=1;b[4]=2;b[64]=0xEF;b[65]=0xBE;b[66]=0xAD;b[67]=0xDE;b[68]=1;
+    *(uint32_t*)(b+72)=ws;*(uint32_t*)(b+80)=128;
+    memcpy(b+128,d,ws);
+    return [NSData dataWithBytesNoCopy:b length:tot freeWhenDone:YES];
+}
+
+// ===== MIL generators =====
+
+#define MIL_HDR \
+    @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, " \
+    "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " \
+    "{\"coremltools-version\", \"9.0\"}})]\n{\n"
+#define CONV_CONST \
+    "        string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" \
+    "        tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n" \
+    "        tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n" \
+    "        tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n" \
+    "        int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
+
+// SDPA forward + taps: x_in → rmsnorm → QKV+SDPA+Wo → concat(o_out, Q, K, V, attn_out, xnorm) fp16
+static NSString *gen_sdpa_fwd_taps(void) {
+    float sc = 1.0f/sqrtf((float)HD);
+    float invd = 1.0f/(float)DIM;
+    NSMutableString *m = [NSMutableString string];
+    [m appendString:MIL_HDR];
+    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM, SEQ];
+    // --- RMSNorm: x → xn ---
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<int32, [1]> rax = const()[name=string(\"rax\"), val=tensor<int32, [1]>([1])];\n"];
+    [m appendFormat:@"        bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ];
+    [m appendFormat:@"        fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", invd];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss2 = mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ];
+    [m appendFormat:@"        fp16 eps = const()[name=string(\"eps\"), val=fp16(0.00001)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ];
+    [m appendFormat:@"        fp16 nhalf = const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xr = mul(x=x,y=rrms)[name=string(\"xr\")];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,1]> rw = const()[name=string(\"rw\"), val=tensor<fp16, [1,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/rms1.bin\"), offset=uint64(64)))];\n", DIM, DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xn = mul(x=xr,y=rw)[name=string(\"xn\")];\n", DIM, SEQ];
+    // --- QKV + SDPA + Wo (operates on xn) ---
+    [m appendString:@CONV_CONST];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wq = const()[name=string(\"Wq\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wq.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wk = const()[name=string(\"Wk\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wk.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wv = const()[name=string(\"Wv\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wv.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wo = const()[name=string(\"Wo\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wo.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> qf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wq,x=xn)[name=string(\"cq\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> kf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wk,x=xn)[name=string(\"ck\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> vf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wv,x=xn)[name=string(\"cv\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> qsh = const()[name=string(\"qsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
+    [m appendString:@"        tensor<int32, [4]> pm = const()[name=string(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q4 = reshape(shape=qsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=q4)[name=string(\"tq\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k4 = reshape(shape=qsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=k4)[name=string(\"tk\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> v4 = reshape(shape=qsh,x=vf)[name=string(\"rv\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> v = transpose(perm=pm,x=v4)[name=string(\"tv\")];\n", HEADS,SEQ,HD];
+    [m appendString:@"        bool tx = const()[name=string(\"tx\"), val=bool(false)];\n"];
+    [m appendString:@"        bool ty = const()[name=string(\"ty\"), val=bool(true)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc1 = matmul(transpose_x=tx,transpose_y=ty,x=q,y=k)[name=string(\"mm1\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc2 = mul(x=sc1,y=scv)[name=string(\"scl\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,1,%d,%d]> cm = const()[name=string(\"cm\"), val=tensor<fp16, [1,1,%d,%d]>(BLOBFILE(path=string(\"@model_path/weights/mask.bin\"), offset=uint64(64)))];\n", SEQ,SEQ,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ms = add(x=sc2,y=cm)[name=string(\"msk\")];\n", HEADS,SEQ,SEQ];
+    [m appendString:@"        int32 sax = const()[name=string(\"sax\"), val=int32(-1)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> aw = softmax(axis=sax,x=ms)[name=string(\"sm\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> a4 = matmul(transpose_x=tx,transpose_y=tx,x=aw,y=v)[name=string(\"mm2\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> at = transpose(perm=pm,x=a4)[name=string(\"ta\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> os = const()[name=string(\"os\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> af = reshape(shape=os,x=at)[name=string(\"ra\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> oo = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wo,x=af)[name=string(\"co\")];\n", DIM,SEQ];
+    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
+    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(oo,qf,kf,vf,af,xn))[name=string(\"cat\")];\n", 6*DIM,SEQ];
+    [m appendString:@"    } -> (out);\n}\n"];
+    return m;
+}
+
+// FFN forward + taps: x2 → rmsnorm → FFN → concat(ffn_out, h1, h3, silu_out, x2norm) fp16
+static NSString *gen_ffn_fwd_taps(void) {
+    float invd = 1.0f/(float)DIM;
+    NSMutableString *m = [NSMutableString string];
+    [m appendString:MIL_HDR];
+    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM, SEQ];
+    // --- RMSNorm: x → xn ---
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<int32, [1]> rax = const()[name=string(\"rax\"), val=tensor<int32, [1]>([1])];\n"];
+    [m appendFormat:@"        bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ];
+    [m appendFormat:@"        fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", invd];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss2 = mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ];
+    [m appendFormat:@"        fp16 eps = const()[name=string(\"eps\"), val=fp16(0.00001)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ];
+    [m appendFormat:@"        fp16 nhalf = const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xr = mul(x=x,y=rrms)[name=string(\"xr\")];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,1]> rw = const()[name=string(\"rw\"), val=tensor<fp16, [1,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/rms2.bin\"), offset=uint64(64)))];\n", DIM, DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xn = mul(x=xr,y=rw)[name=string(\"xn\")];\n", DIM, SEQ];
+    // --- FFN (operates on xn) ---
+    [m appendString:@CONV_CONST];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W1 = const()[name=string(\"W1\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w1.bin\"), offset=uint64(64)))];\n", HIDDEN,DIM,HIDDEN,DIM];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W3 = const()[name=string(\"W3\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w3.bin\"), offset=uint64(64)))];\n", HIDDEN,DIM,HIDDEN,DIM];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W2 = const()[name=string(\"W2\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w2.bin\"), offset=uint64(64)))];\n", DIM,HIDDEN,DIM,HIDDEN];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1,x=xn)[name=string(\"c1\")];\n", HIDDEN,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3,x=xn)[name=string(\"c3\")];\n", HIDDEN,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sig = sigmoid(x=h1)[name=string(\"sg\")];\n", HIDDEN,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> silu = mul(x=h1,y=sig)[name=string(\"si\")];\n", HIDDEN,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> gate = mul(x=silu,y=h3)[name=string(\"gt\")];\n", HIDDEN,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2,x=gate)[name=string(\"c2\")];\n", DIM,SEQ];
+    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
+    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(y,h1,h3,gate,xn))[name=string(\"cat\")];\n", 2*DIM+3*HIDDEN,SEQ];
+    [m appendString:@"    } -> (out);\n}\n"];
+    return m;
+}
+
+// Fused FFN backward: concat(dffn,h1,h3) → concat(dx,dh1,dh3) fp16
+static NSString *gen_ffn_bwd(void) {
+    NSMutableString *m = [NSMutableString string];
+    [m appendString:MIL_HDR];
+    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM+2*HIDDEN, SEQ];
+    [m appendString:@CONV_CONST];
+    [m appendString:@"        tensor<int32, [4]> bd = const()[name=string(\"bd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
+    [m appendFormat:@"        tensor<int32, [4]> sd = const()[name=string(\"sd\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dffn = slice_by_size(x=x,begin=bd,size=sd)[name=string(\"s0\")];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
+    [m appendFormat:@"        tensor<int32, [4]> s1 = const()[name=string(\"s1\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", HIDDEN, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h1 = slice_by_size(x=x,begin=b1,size=s1)[name=string(\"s1x\")];\n", HIDDEN, SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b3 = const()[name=string(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM+HIDDEN];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h3 = slice_by_size(x=x,begin=b3,size=s1)[name=string(\"s3x\")];\n", HIDDEN, SEQ];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W2t = const()[name=string(\"W2t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w2t.bin\"), offset=uint64(64)))];\n", HIDDEN, DIM, HIDDEN, DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dsilu = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2t,x=dffn)[name=string(\"cw2\")];\n", HIDDEN, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sig = sigmoid(x=h1)[name=string(\"sg\")];\n", HIDDEN, SEQ];
+    [m appendString:@"        fp16 one = const()[name=string(\"one\"), val=fp16(1.0)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> oms = sub(x=one,y=sig)[name=string(\"oms\")];\n", HIDDEN, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> homs = mul(x=h1,y=oms)[name=string(\"homs\")];\n", HIDDEN, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> brk = add(x=one,y=homs)[name=string(\"brk\")];\n", HIDDEN, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dsd = mul(x=sig,y=brk)[name=string(\"dsd\")];\n", HIDDEN, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> t1 = mul(x=dsilu,y=h3)[name=string(\"t1\")];\n", HIDDEN, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dh1 = mul(x=t1,y=dsd)[name=string(\"dh1\")];\n", HIDDEN, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> slh = mul(x=h1,y=sig)[name=string(\"slh\")];\n", HIDDEN, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dh3 = mul(x=dsilu,y=slh)[name=string(\"dh3\")];\n", HIDDEN, SEQ];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W1t = const()[name=string(\"W1t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w1t.bin\"), offset=uint64(64)))];\n", DIM, HIDDEN, DIM, HIDDEN];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W3t = const()[name=string(\"W3t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w3t.bin\"), offset=uint64(64)))];\n", DIM, HIDDEN, DIM, HIDDEN];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1t,x=dh1)[name=string(\"cw1\")];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3t,x=dh3)[name=string(\"cw3\")];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx = add(x=dx1,y=dx3)[name=string(\"adx\")];\n", DIM, SEQ];
+    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
+    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dx,dh1,dh3))[name=string(\"cat\")];\n", DIM+2*HIDDEN, SEQ];
+    [m appendString:@"    } -> (out);\n}\n"];
+    return m;
+}
+
+// Fused QKV backward: concat(dq,dk,dv) → dx fp16
+static NSString *gen_qkvb(void) {
+    NSMutableString *m = [NSMutableString string];
+    [m appendString:MIL_HDR];
+    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", 3*DIM, SEQ];
+    [m appendString:@CONV_CONST];
+    [m appendFormat:@"        tensor<int32, [4]> sz = const()[name=string(\"sz\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
+    [m appendString:@"        tensor<int32, [4]> b0 = const()[name=string(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dq = slice_by_size(x=x,begin=b0,size=sz)[name=string(\"s0\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dk = slice_by_size(x=x,begin=b1,size=sz)[name=string(\"s1\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b2 = const()[name=string(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dv = slice_by_size(x=x,begin=b2,size=sz)[name=string(\"s2\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wqt = const()[name=string(\"Wqt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wqt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wkt = const()[name=string(\"Wkt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wkt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wvt = const()[name=string(\"Wvt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wvt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxq = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wqt,x=dq)[name=string(\"cq\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxk = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wkt,x=dk)[name=string(\"ck\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxv = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wvt,x=dv)[name=string(\"cv\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxqk = add(x=dxq,y=dxk)[name=string(\"aqk\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = add(x=dxqk,y=dxv)[name=string(\"out\")];\n", DIM,SEQ];
+    [m appendString:@"    } -> (out);\n}\n"];
+    return m;
+}
+
+// SDPA backward part 1 + Wo^T: concat(Q,K,V,dx2) → Wo^T(dx2) → concat(dV, probs_flat, dp_flat) fp16
+// SCORE_CH: channels needed for flattened attention scores [HEADS,SEQ,SEQ] → [HEADS*SEQ, SEQ]
+#define SCORE_CH (HEADS*SEQ)
+
+static NSString *gen_sdpa_bwd1(void) {
+    float sc = 1.0f/sqrtf((float)HD);
+    NSMutableString *m = [NSMutableString string];
+    [m appendString:MIL_HDR];
+    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", 4*DIM, SEQ];
+    [m appendString:@CONV_CONST];
+    [m appendFormat:@"        tensor<int32, [4]> sz = const()[name=string(\"sz\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
+    [m appendString:@"        tensor<int32, [4]> b0 = const()[name=string(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> qf = slice_by_size(x=x,begin=b0,size=sz)[name=string(\"s0\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> kf = slice_by_size(x=x,begin=b1,size=sz)[name=string(\"s1\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b2 = const()[name=string(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> vf = slice_by_size(x=x,begin=b2,size=sz)[name=string(\"s2\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b3 = const()[name=string(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 3*DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx2f = slice_by_size(x=x,begin=b3,size=sz)[name=string(\"s3\")];\n", DIM,SEQ];
+    // Wo^T backward: dx2 → dattn
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wot = const()[name=string(\"Wot\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wot.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> df = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wot,x=dx2f)[name=string(\"cwo\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> rsh = const()[name=string(\"rsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
+    [m appendString:@"        tensor<int32, [4]> pm = const()[name=string(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> qr = reshape(shape=rsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=qr)[name=string(\"tq\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> kr = reshape(shape=rsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=kr)[name=string(\"tk\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> vr = reshape(shape=rsh,x=vf)[name=string(\"rv\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> v = transpose(perm=pm,x=vr)[name=string(\"tv\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dr = reshape(shape=rsh,x=df)[name=string(\"rd\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> da = transpose(perm=pm,x=dr)[name=string(\"td\")];\n", HEADS,SEQ,HD];
+    [m appendString:@"        bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"];
+    [m appendString:@"        bool bT = const()[name=string(\"bT\"), val=bool(true)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc1 = matmul(transpose_x=bF,transpose_y=bT,x=q,y=k)[name=string(\"mm1\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc2 = mul(x=sc1,y=scv)[name=string(\"scl\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,1,%d,%d]> cm = const()[name=string(\"cm\"), val=tensor<fp16, [1,1,%d,%d]>(BLOBFILE(path=string(\"@model_path/weights/mask.bin\"), offset=uint64(64)))];\n", SEQ,SEQ,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ms = add(x=sc2,y=cm)[name=string(\"msk\")];\n", HEADS,SEQ,SEQ];
+    [m appendString:@"        int32 sax = const()[name=string(\"sax\"), val=int32(-1)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> probs = softmax(axis=sax,x=ms)[name=string(\"sm\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dv4 = matmul(transpose_x=bT,transpose_y=bF,x=probs,y=da)[name=string(\"dv\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dp4 = matmul(transpose_x=bF,transpose_y=bT,x=da,y=v)[name=string(\"dp\")];\n", HEADS,SEQ,SEQ];
+    // Flatten dv back to [1,DIM,1,SEQ]
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dvt = transpose(perm=pm,x=dv4)[name=string(\"dvt\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> dvs = const()[name=string(\"dvs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dvf = reshape(shape=dvs,x=dvt)[name=string(\"dvf\")];\n", DIM,SEQ];
+    // Flatten probs [1,H,S,S] → [1,H*S,1,S] and dp [1,H,S,S] → [1,H*S,1,S]
+    [m appendFormat:@"        tensor<int32, [4]> scs = const()[name=string(\"scs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", SCORE_CH,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> pf = reshape(shape=scs,x=probs)[name=string(\"pf\")];\n", SCORE_CH,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dpf = reshape(shape=scs,x=dp4)[name=string(\"dpf\")];\n", SCORE_CH,SEQ];
+    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
+    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dvf,pf,dpf))[name=string(\"cat\")];\n", DIM+2*SCORE_CH,SEQ];
+    [m appendString:@"    } -> (out);\n}\n"];
+    return m;
+}
+
+// SDPA backward part 2: concat(probs[SCORE_CH],dp[SCORE_CH],Q[DIM],K[DIM]) → concat(dQ,dK) fp16
+static NSString *gen_sdpa_bwd2(void) {
+    float sc = 1.0f/sqrtf((float)HD);
+    int bwd2_in = 2*SCORE_CH + 2*DIM;
+    NSMutableString *m = [NSMutableString string];
+    [m appendString:MIL_HDR];
+    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", bwd2_in, SEQ];
+    // Slice probs
+    [m appendFormat:@"        tensor<int32, [4]> sz_sc = const()[name=string(\"szsc\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", SCORE_CH, SEQ];
+    [m appendString:@"        tensor<int32, [4]> b0 = const()[name=string(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> pf = slice_by_size(x=x,begin=b0,size=sz_sc)[name=string(\"s0\")];\n", SCORE_CH,SEQ];
+    // Slice dp
+    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", SCORE_CH];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dpf = slice_by_size(x=x,begin=b1,size=sz_sc)[name=string(\"s1\")];\n", SCORE_CH,SEQ];
+    // Slice Q
+    [m appendFormat:@"        tensor<int32, [4]> sz_d = const()[name=string(\"szd\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b2 = const()[name=string(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*SCORE_CH];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> qf = slice_by_size(x=x,begin=b2,size=sz_d)[name=string(\"s2\")];\n", DIM,SEQ];
+    // Slice K
+    [m appendFormat:@"        tensor<int32, [4]> b3 = const()[name=string(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*SCORE_CH+DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> kf = slice_by_size(x=x,begin=b3,size=sz_d)[name=string(\"s3\")];\n", DIM,SEQ];
+    // Reshape to multi-head
+    [m appendFormat:@"        tensor<int32, [4]> ssh = const()[name=string(\"ssh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> probs = reshape(shape=ssh,x=pf)[name=string(\"rp\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dp = reshape(shape=ssh,x=dpf)[name=string(\"rdp\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> rsh = const()[name=string(\"rsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
+    [m appendString:@"        tensor<int32, [4]> pm = const()[name=string(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> qr = reshape(shape=rsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=qr)[name=string(\"tq\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> kr = reshape(shape=rsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=kr)[name=string(\"tk\")];\n", HEADS,SEQ,HD];
+    // Softmax grad: ds = probs * (dp - sum(probs*dp)) * scale
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> pdp = mul(x=probs,y=dp)[name=string(\"pdp\")];\n", HEADS,SEQ,SEQ];
+    [m appendString:@"        tensor<int32, [1]> rax = const()[name=string(\"rax\"), val=tensor<int32, [1]>([-1])];\n"];
+    [m appendString:@"        bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,1]> spdp = reduce_sum(x=pdp,axes=rax,keep_dims=kd)[name=string(\"rs\")];\n", HEADS,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dps = sub(x=dp,y=spdp)[name=string(\"dps\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ds0 = mul(x=probs,y=dps)[name=string(\"ds0\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ds = mul(x=ds0,y=scv)[name=string(\"ds\")];\n", HEADS,SEQ,SEQ];
+    [m appendString:@"        bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"];
+    [m appendString:@"        bool bT = const()[name=string(\"bT\"), val=bool(true)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dq4 = matmul(transpose_x=bF,transpose_y=bF,x=ds,y=k)[name=string(\"dq\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dk4 = matmul(transpose_x=bT,transpose_y=bF,x=ds,y=q)[name=string(\"dk\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dqt = transpose(perm=pm,x=dq4)[name=string(\"dqt\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dkt = transpose(perm=pm,x=dk4)[name=string(\"dkt\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> fs = const()[name=string(\"fs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dqf = reshape(shape=fs,x=dqt)[name=string(\"dqf\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dkf = reshape(shape=fs,x=dkt)[name=string(\"dkf\")];\n", DIM,SEQ];
+    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
+    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dqf,dkf))[name=string(\"cat\")];\n", 2*DIM,SEQ];
+    [m appendString:@"    } -> (out);\n}\n"];
+    return m;
+}
+
+// ===== Weight builders =====
+static NSData *g_mask_blob = nil;
+static NSData *get_mask_blob(void) {
+    if (!g_mask_blob) {
+        _Float16 *mask = (_Float16*)calloc(SEQ*SEQ, sizeof(_Float16));
+        for(int t=0;t<SEQ;t++) for(int t2=0;t2<SEQ;t2++)
+            mask[t*SEQ+t2] = (t2<=t) ? (_Float16)0.0f : (_Float16)(-65504.0f);
+        g_mask_blob = build_blob_fp16(mask, SEQ*SEQ);
+        free(mask);
+    }
+    return g_mask_blob;
+}
+
+// ===== Kernel compilation and evaluation =====
+typedef struct { void *model; IOSurfaceRef ioIn, ioOut; void *request; void *tmpDir; } Kern;
+
+static Kern *compile_kern_mil_w(NSString *mil, NSDictionary *weights, int ic_bytes, int oc_bytes) {
+    @autoreleasepool {
+    NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
+    id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, weights, nil);
+    if (!desc) { printf("  [compile] desc=NULL\n"); return NULL; }
+    id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
+    id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
+    NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
+    [[NSFileManager defaultManager] createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];
+    [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+    for (NSString *path in weights) {
+        NSString *rel = [path stringByReplacingOccurrencesOfString:@"@model_path/" withString:@""];
+        [weights[path][@"data"] writeToFile:[td stringByAppendingPathComponent:rel] atomically:YES];
+    }
+    NSError *e = nil;
+    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
+        printf("  [compile] FAIL: %s\n", e ? [[e description] UTF8String] : "no error"); return NULL;
+    }
+    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) {
+        printf("  [compile] load FAIL\n"); return NULL;
+    }
+    __sync_fetch_and_add(&g_compile_count, 1);
+    Kern *k = calloc(1, sizeof(Kern));
+    k->model = CFBridgingRetain(mdl);
+    k->ioIn = make_surface(ic_bytes);
+    k->ioOut = make_surface(oc_bytes);
+    id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), k->ioIn);
+    id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), k->ioOut);
+    k->request = CFBridgingRetain(((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+        @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+        @[wI], @[@0], @[wO], @[@0], nil, nil, @0));
+    k->tmpDir = CFBridgingRetain(td);
+    return k;
+    }
+}
+static void free_kern(Kern *k) {
+    if (!k) return;
+    id mdl = (__bridge id)k->model; NSError *e = nil;
+    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
+    CFRelease(k->ioIn); CFRelease(k->ioOut);
+    [[NSFileManager defaultManager] removeItemAtPath:(__bridge id)k->tmpDir error:nil];
+    CFRelease(k->model); CFRelease(k->request); CFRelease(k->tmpDir);
+    free(k);
+}
+static void ane_eval(Kern *k) {
+    id mdl = (__bridge id)k->model; id req = (__bridge id)k->request; NSError *e = nil;
+    ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
+}
+
+// ===== Vectorized conversion helpers (NEON) =====
+#include <arm_neon.h>
+static void cvt_f16_f32(float *dst, const _Float16 *src, int n) {
+    int i = 0;
+    for (; i+7 < n; i += 8) {
+        float16x8_t h = vld1q_f16((const __fp16*)(src+i));
+        vst1q_f32(dst+i,   vcvt_f32_f16(vget_low_f16(h)));
+        vst1q_f32(dst+i+4, vcvt_f32_f16(vget_high_f16(h)));
+    }
+    for (; i < n; i++) dst[i] = (float)src[i];
+}
+static void cvt_f32_f16(_Float16 *dst, const float *src, int n) {
+    int i = 0;
+    for (; i+7 < n; i += 8) {
+        float16x8_t h = vcombine_f16(vcvt_f16_f32(vld1q_f32(src+i)),
+                                      vcvt_f16_f32(vld1q_f32(src+i+4)));
+        vst1q_f16((__fp16*)(dst+i), h);
+    }
+    for (; i < n; i++) dst[i] = (_Float16)src[i];
+}
+
+// ===== IOSurface I/O helpers (channel-first, no transpose) =====
+// All CPU buffers are [C,S] channel-first matching IOSurface [1,C,1,S]
+// Write fp32 [C,S] → fp16 [1,C,1,S] (just type conversion, no transpose)
+static void io_write_fp16(IOSurfaceRef s, const float *data, int channels, int sp) {
+    IOSurfaceLock(s, 0, NULL);
+    _Float16 *dst = (_Float16*)IOSurfaceGetBaseAddress(s);
+    cvt_f32_f16(dst, data, channels * sp);
+    IOSurfaceUnlock(s, 0, NULL);
+}
+// Write fp32 [C,S] → fp32 [1,C,1,S] (just memcpy)
+static void io_write_fp32(IOSurfaceRef s, const float *data, int channels, int sp) {
+    IOSurfaceLock(s, 0, NULL);
+    memcpy(IOSurfaceGetBaseAddress(s), data, channels * sp * sizeof(float));
+    IOSurfaceUnlock(s, 0, NULL);
+}
+// Read fp16 [1,C,1,S] → fp32 [C,S] at channel offset (just type conversion)
+static void io_read_fp16(IOSurfaceRef s, float *data, int ch_off, int channels, int sp) {
+    IOSurfaceLock(s, kIOSurfaceLockReadOnly, NULL);
+    _Float16 *src = (_Float16*)IOSurfaceGetBaseAddress(s) + ch_off * sp;
+    cvt_f16_f32(data, src, channels * sp);
+    IOSurfaceUnlock(s, kIOSurfaceLockReadOnly, NULL);
+}
+// Read fp32 [1,C,1,S] → fp32 [C,S] (just memcpy)
+static void io_read_fp32(IOSurfaceRef s, float *data, int channels, int sp) {
+    IOSurfaceLock(s, kIOSurfaceLockReadOnly, NULL);
+    memcpy(data, IOSurfaceGetBaseAddress(s), channels * sp * sizeof(float));
+    IOSurfaceUnlock(s, kIOSurfaceLockReadOnly, NULL);
+}
+// Write multiple fp32 [C,S] arrays concatenated along channel dim as fp16
+static void io_write_multi_fp16(IOSurfaceRef s, int sp, int n, ...) {
+    IOSurfaceLock(s, 0, NULL);
+    _Float16 *dst = (_Float16*)IOSurfaceGetBaseAddress(s);
+    va_list ap; va_start(ap, n);
+    int ch_off = 0;
+    for (int i=0; i<n; i++) {
+        const float *data = va_arg(ap, const float*);
+        int channels = va_arg(ap, int);
+        cvt_f32_f16(dst + ch_off*sp, data, channels * sp);
+        ch_off += channels;
+    }
+    va_end(ap);
+    IOSurfaceUnlock(s, 0, NULL);
+}
+// Direct copy between IOSurfaces (no format conversion — both fp16 channel-first)
+static void io_copy(IOSurfaceRef dst, int dst_ch, IOSurfaceRef src, int src_ch, int channels, int sp) {
+    IOSurfaceLock(dst, 0, NULL);
+    IOSurfaceLock(src, kIOSurfaceLockReadOnly, NULL);
+    memcpy((_Float16*)IOSurfaceGetBaseAddress(dst) + dst_ch*sp,
+           (_Float16*)IOSurfaceGetBaseAddress(src) + src_ch*sp,
+           channels * sp * sizeof(_Float16));
+    IOSurfaceUnlock(src, kIOSurfaceLockReadOnly, NULL);
+    IOSurfaceUnlock(dst, 0, NULL);
+}
+// Write one fp32 [C,S] array at specific channel offset in IOSurface as fp16
+static void io_write_fp16_at(IOSurfaceRef s, int ch_off, const float *data, int channels, int sp) {
+    IOSurfaceLock(s, 0, NULL);
+    _Float16 *dst = (_Float16*)IOSurfaceGetBaseAddress(s) + ch_off * sp;
+    cvt_f32_f16(dst, data, channels * sp);
+    IOSurfaceUnlock(s, 0, NULL);
+}
+
+// ===== CPU ops (channel-first [C,S] layout) =====
+// x[i*S+t] = channel i, position t
+// Process all positions in parallel using vectorized column ops
+static float *g_rms_tmp = NULL;
+static void rmsnorm(float *out, const float *x, const float *w, int d, int S) {
+    if (!g_rms_tmp) g_rms_tmp = malloc(S*4);
+    float *ss = calloc(S, sizeof(float));
+    for (int i=0; i<d; i++) {
+        vDSP_vmul(x+i*S, 1, x+i*S, 1, g_rms_tmp, 1, (vDSP_Length)S);
+        vDSP_vadd(g_rms_tmp, 1, ss, 1, ss, 1, (vDSP_Length)S);
+    }
+    float invd = 1.0f/d, eps=1e-5f;
+    vDSP_vsmsa(ss, 1, &invd, &eps, ss, 1, (vDSP_Length)S);
+    int n = S; vvrsqrtf(ss, ss, &n);
+    for (int i=0; i<d; i++) {
+        vDSP_vmul(x+i*S, 1, ss, 1, out+i*S, 1, (vDSP_Length)S);
+        vDSP_vsmul(out+i*S, 1, &w[i], out+i*S, 1, (vDSP_Length)S);
+    }
+    free(ss);
+}
+static void rmsnorm_bwd(float *dx, float *dw, const float *dy, const float *x, const float *w, int d, int S) {
+    if (!g_rms_tmp) g_rms_tmp = malloc(S*4);
+    float *ss = calloc(S, sizeof(float));
+    for (int i=0; i<d; i++) {
+        vDSP_vmul(x+i*S, 1, x+i*S, 1, g_rms_tmp, 1, (vDSP_Length)S);
+        vDSP_vadd(g_rms_tmp, 1, ss, 1, ss, 1, (vDSP_Length)S);
+    }
+    float invd = 1.0f/d, eps=1e-5f;
+    vDSP_vsmsa(ss, 1, &invd, &eps, ss, 1, (vDSP_Length)S);
+    float *rrms = malloc(S*4);
+    int n = S; vvrsqrtf(rrms, ss, &n);
+    // dot[t] = sum_i dy[i,t]*x[i,t]*w[i]
+    float *dot = calloc(S, sizeof(float));
+    for (int i=0; i<d; i++) {
+        vDSP_vmul(dy+i*S, 1, x+i*S, 1, g_rms_tmp, 1, (vDSP_Length)S);
+        // dot += tmp * w[i]  (vDSP_vsma: A*scalar+C→D)
+        vDSP_vsma(g_rms_tmp, 1, &w[i], dot, 1, dot, 1, (vDSP_Length)S);
+    }
+    // dot *= rrms^2/d
+    vDSP_vmul(rrms, 1, rrms, 1, ss, 1, (vDSP_Length)S);
+    vDSP_vsmul(ss, 1, &invd, ss, 1, (vDSP_Length)S);
+    vDSP_vmul(dot, 1, ss, 1, dot, 1, (vDSP_Length)S);
+    for (int i=0; i<d; i++) {
+        vDSP_vmul(x+i*S, 1, dot, 1, g_rms_tmp, 1, (vDSP_Length)S);
+        vDSP_vsub(g_rms_tmp, 1, dy+i*S, 1, g_rms_tmp, 1, (vDSP_Length)S);
+        vDSP_vmul(g_rms_tmp, 1, rrms, 1, g_rms_tmp, 1, (vDSP_Length)S);
+        vDSP_vsmul(g_rms_tmp, 1, &w[i], dx+i*S, 1, (vDSP_Length)S);
+        // dw[i] += sum_t dy[i,t]*x[i,t]*rrms[t]
+        vDSP_vmul(dy+i*S, 1, x+i*S, 1, g_rms_tmp, 1, (vDSP_Length)S);
+        vDSP_vmul(g_rms_tmp, 1, rrms, 1, g_rms_tmp, 1, (vDSP_Length)S);
+        float s; vDSP_sve(g_rms_tmp, 1, &s, (vDSP_Length)S);
+        dw[i] += s;
+    }
+    free(ss); free(rrms); free(dot);
+}
+
+// ===== Checkpoint =====
+typedef struct {
+    int step, total_steps;
+    float lr, loss;
+    double cum_compile, cum_train, cum_wall;
+    int cum_steps, cum_batches;
+    int adam_t; // Adam timestep
+} CkptHdr;
+
+// Adam optimizer state
+typedef struct {
+    float *m, *v; // first and second moment
+    size_t n;
+} AdamState;
+static AdamState adam_alloc(size_t n) { return (AdamState){calloc(n,4), calloc(n,4), n}; }
+static void adam_free(AdamState *s) { free(s->m); free(s->v); }
+static void adam_update(float *w, const float *g, AdamState *s, int t, float lr, float b1, float b2, float eps) {
+    float bc1 = 1.0f - powf(b1, t), bc2 = 1.0f - powf(b2, t);
+    for (size_t i=0; i<s->n; i++) {
+        s->m[i] = b1*s->m[i] + (1-b1)*g[i];
+        s->v[i] = b2*s->v[i] + (1-b2)*g[i]*g[i];
+        float mh = s->m[i]/bc1, vh = s->v[i]/bc2;
+        w[i] -= lr * mh / (sqrtf(vh) + eps);
+    }
+}
+
+int main(int argc, char *argv[]) {
+    @autoreleasepool {
+        setbuf(stdout, NULL);
+        ane_init();
+        mach_timebase_info(&g_tb);
+
+        int total_steps = 400;
+        float lr = 1e-3f;
+        float adam_b1=0.9f, adam_b2=0.999f, adam_eps=1e-8f;
+        int adam_t = 0;
+        int start_step = 0;
+
+        size_t wq_sz = DIM*DIM, wo_sz = DIM*DIM;
+        size_t w1_sz = HIDDEN*DIM, w2_sz = DIM*HIDDEN, w3_sz = HIDDEN*DIM;
+        size_t total_params = 4*wq_sz + w1_sz + w2_sz + w3_sz;
+
+        float *Wq=malloc(wq_sz*4), *Wk=malloc(wq_sz*4), *Wv=malloc(wq_sz*4), *Wo=malloc(wo_sz*4);
+        float *W1=malloc(w1_sz*4), *W2=malloc(w2_sz*4), *W3=malloc(w3_sz*4);
+        float *rms1_w=malloc(DIM*4), *rms2_w=malloc(DIM*4);
+
+        // Adam optimizer states (m and v for each weight)
+        AdamState aWq=adam_alloc(wq_sz), aWk=adam_alloc(wq_sz), aWv=adam_alloc(wq_sz), aWo=adam_alloc(wo_sz);
+        AdamState aW1=adam_alloc(w1_sz), aW2=adam_alloc(w2_sz), aW3=adam_alloc(w3_sz);
+        AdamState arms1=adam_alloc(DIM), arms2=adam_alloc(DIM);
+
+        double cum_compile=0, cum_train=0, cum_wall=0;
+        int cum_steps=0, cum_batches=0;
+
+        bool resuming = false;
+        if (argc > 1 && strcmp(argv[1], "--resume") == 0) {
+            FILE *f = fopen(CKPT_PATH, "rb");
+            if (f) {
+                CkptHdr h; fread(&h, sizeof(h), 1, f);
+                start_step=h.step; total_steps=h.total_steps; lr=h.lr;
+                cum_compile=h.cum_compile; cum_train=h.cum_train; cum_wall=h.cum_wall;
+                cum_steps=h.cum_steps; cum_batches=h.cum_batches; adam_t=h.adam_t;
+                fread(Wq,4,wq_sz,f); fread(Wk,4,wq_sz,f); fread(Wv,4,wq_sz,f); fread(Wo,4,wo_sz,f);
+                fread(W1,4,w1_sz,f); fread(W2,4,w2_sz,f); fread(W3,4,w3_sz,f);
+                fread(rms1_w,4,DIM,f); fread(rms2_w,4,DIM,f);
+                // Adam state
+                fread(aWq.m,4,wq_sz,f);fread(aWq.v,4,wq_sz,f);
+                fread(aWk.m,4,wq_sz,f);fread(aWk.v,4,wq_sz,f);
+                fread(aWv.m,4,wq_sz,f);fread(aWv.v,4,wq_sz,f);
+                fread(aWo.m,4,wo_sz,f);fread(aWo.v,4,wo_sz,f);
+                fread(aW1.m,4,w1_sz,f);fread(aW1.v,4,w1_sz,f);
+                fread(aW2.m,4,w2_sz,f);fread(aW2.v,4,w2_sz,f);
+                fread(aW3.m,4,w3_sz,f);fread(aW3.v,4,w3_sz,f);
+                fread(arms1.m,4,DIM,f);fread(arms1.v,4,DIM,f);
+                fread(arms2.m,4,DIM,f);fread(arms2.v,4,DIM,f);
+                fclose(f);
+                resuming = true;
+                printf("[RESUMED step %d, loss=%.6f]\n", start_step, h.loss);
+            }
+        }
+        if (!resuming) {
+            srand48(42);
+            float scale_d=1.0f/sqrtf(DIM), scale_h=1.0f/sqrtf(HIDDEN);
+            for(size_t i=0;i<wq_sz;i++){Wq[i]=scale_d*(2*drand48()-1);Wk[i]=scale_d*(2*drand48()-1);}
+            for(size_t i=0;i<wq_sz;i++){Wv[i]=scale_d*(2*drand48()-1);Wo[i]=scale_d*(2*drand48()-1);}
+            for(size_t i=0;i<w1_sz;i++) W1[i]=scale_h*(2*drand48()-1);
+            for(size_t i=0;i<w2_sz;i++) W2[i]=scale_d*(2*drand48()-1);
+            for(size_t i=0;i<w3_sz;i++) W3[i]=scale_h*(2*drand48()-1);
+            for(int i=0;i<DIM;i++){rms1_w[i]=1.0f; rms2_w[i]=1.0f;}
+        }
+
+        if (!resuming) {
+            // FLOP accounting: 7 weight matrices, each 2*OC*IC*SEQ for forward
+            double fwd_flops = 4.0*2*DIM*DIM*SEQ + 2.0*2*DIM*HIDDEN*SEQ + 2.0*HIDDEN*DIM*SEQ;
+            double bwd_dx_flops = fwd_flops; // same matmuls transposed
+            double bwd_dw_flops = fwd_flops; // dW = dy^T @ x, same FLOPs
+            double sdpa_flops = 2.0*HEADS*5*SEQ*SEQ*HD; // 5 SEQ×SEQ matmuls in backward
+            double total_flops = fwd_flops + bwd_dx_flops + bwd_dw_flops + sdpa_flops;
+            double ane_flops_step = fwd_flops + bwd_dx_flops + sdpa_flops;
+            printf("=== ANE Training: Fully-ANE Pipeline ===\n");
+            printf("dim=%d hidden=%d heads=%d seq=%d\n", DIM, HIDDEN, HEADS, SEQ);
+            printf("Params: %.2fM | Weights: %.1fMB FP16\n", total_params/1e6, total_params*2.0/1e6);
+            printf("Kernels: %d (fwdAttn+fwdFFN+ffnBwd+sdpaBwd1+sdpaBwd2+qkvBwd)\n", NUM_KERNELS);
+            printf("Accum %d steps per recompile | Adam LR=%.1e b1=%.1f b2=%.3f\n\n", ACCUM_STEPS, lr, adam_b1, adam_b2);
+            printf("FLOPs/step: fwd=%.0fM bwd_dx=%.0fM bwd_dW=%.0fM sdpa_bwd=%.0fM total=%.0fM\n",
+                   fwd_flops/1e6, bwd_dx_flops/1e6, bwd_dw_flops/1e6, sdpa_flops/1e6, total_flops/1e6);
+            printf("ANE FLOPs/step: %.0fM (fwd+bwd_dx+sdpa_bwd) | CPU: dW (cblas)\n\n", ane_flops_step/1e6);
+        }
+
+        // Training data
+        float *x_in=malloc(SEQ*DIM*4), *y_tgt=malloc(SEQ*DIM*4);
+        // Training data in channel-first [C,S] layout
+        if (!resuming) srand48(42);
+        for(int c=0;c<DIM;c++) for(int t=0;t<SEQ;t++) {
+            int idx = c*SEQ+t;
+            x_in[idx]=0.1f*(2*drand48()-1);
+            y_tgt[idx]=0.1f*sinf(idx*0.03f+1.0f);
+        }
+
+        // Activation buffers (saved from forward for backward)
+        float *xnorm=malloc(SEQ*DIM*4);
+        float *Q=malloc(SEQ*DIM*4), *K=malloc(SEQ*DIM*4), *V=malloc(SEQ*DIM*4);
+        float *attn_out=malloc(SEQ*DIM*4), *o_out=malloc(SEQ*DIM*4);
+        float *x2=malloc(SEQ*DIM*4), *x2norm=malloc(SEQ*DIM*4);
+        float *h1=malloc(SEQ*HIDDEN*4), *h3=malloc(SEQ*HIDDEN*4), *silu_out=malloc(SEQ*HIDDEN*4);
+        float *ffn_out=malloc(SEQ*DIM*4), *y_out=malloc(SEQ*DIM*4);
+
+        // Gradient buffers
+        float *dy=malloc(SEQ*DIM*4), *dffn=malloc(SEQ*DIM*4);
+        float *dh1=malloc(SEQ*HIDDEN*4), *dh3=malloc(SEQ*HIDDEN*4);
+        float *dx_ffn=malloc(SEQ*DIM*4), *dx2=malloc(SEQ*DIM*4);
+        float *do_out_buf=malloc(SEQ*DIM*4), *dattn=malloc(SEQ*DIM*4);
+        float *dq=malloc(SEQ*DIM*4), *dk=malloc(SEQ*DIM*4), *dv=malloc(SEQ*DIM*4);
+        float *dx_attn=malloc(SEQ*DIM*4);
+        // SDPA bwd intermediates
+        float *probs_flat=malloc(SEQ*DIM*4), *dp_flat=malloc(SEQ*DIM*4);
+
+        // Gradient accumulators
+        float *gWq=calloc(wq_sz,4), *gWk=calloc(wq_sz,4), *gWv=calloc(wq_sz,4), *gWo=calloc(wo_sz,4);
+        float *gW1=calloc(w1_sz,4), *gW2=calloc(w2_sz,4), *gW3=calloc(w3_sz,4);
+        float *grms1=calloc(DIM,4), *grms2=calloc(DIM,4);
+
+        // 7 ANE kernels
+        Kern *kFwdAttn=NULL, *kFwdFFN=NULL, *kFFNBwd=NULL;
+        Kern *kSdpaBwd1=NULL, *kSdpaBwd2=NULL, *kQKVb=NULL;
+
+        // Compile static (weight-free) kernels ONCE
+        kSdpaBwd2 = compile_kern_mil_w(gen_sdpa_bwd2(), @{},
+            (2*SCORE_CH+2*DIM)*SEQ*2, 2*DIM*SEQ*2);
+        if (!kSdpaBwd2) { printf("Static kernel compile failed\n"); return 1; }
+
+        // GCD queue for async dW cblas (overlaps with ANE evals)
+        dispatch_queue_t dw_q = dispatch_queue_create("dw_cblas", DISPATCH_QUEUE_SERIAL);
+        dispatch_group_t dw_grp = dispatch_group_create();
+
+        float last_loss = 999.0f;
+        double total_compile_ms=0, total_train_ms=0;
+        int total_steps_done=0, total_batches=0;
+        uint64_t t_wall_start = mach_absolute_time();
+
+        int step = start_step;
+        while (step < total_steps) {
+            // Check compile budget — 5 weight-bearing kernels per batch
+            if (g_compile_count + 5 > MAX_COMPILES) {
+                free_kern(kFwdAttn);free_kern(kFwdFFN);free_kern(kFFNBwd);
+                free_kern(kSdpaBwd1);free_kern(kQKVb);
+                free_kern(kSdpaBwd2);
+                double wall = tb_ms(mach_absolute_time() - t_wall_start);
+                FILE *f = fopen(CKPT_PATH, "wb");
+                CkptHdr h = {step,total_steps,lr,last_loss,
+                    total_compile_ms+cum_compile, total_train_ms+cum_train, wall+cum_wall,
+                    total_steps_done+cum_steps, total_batches+cum_batches, adam_t};
+                fwrite(&h,sizeof(h),1,f);
+                fwrite(Wq,4,wq_sz,f);fwrite(Wk,4,wq_sz,f);fwrite(Wv,4,wq_sz,f);fwrite(Wo,4,wo_sz,f);
+                fwrite(W1,4,w1_sz,f);fwrite(W2,4,w2_sz,f);fwrite(W3,4,w3_sz,f);
+                fwrite(rms1_w,4,DIM,f);fwrite(rms2_w,4,DIM,f);
+                // Adam state
+                fwrite(aWq.m,4,wq_sz,f);fwrite(aWq.v,4,wq_sz,f);
+                fwrite(aWk.m,4,wq_sz,f);fwrite(aWk.v,4,wq_sz,f);
+                fwrite(aWv.m,4,wq_sz,f);fwrite(aWv.v,4,wq_sz,f);
+                fwrite(aWo.m,4,wo_sz,f);fwrite(aWo.v,4,wo_sz,f);
+                fwrite(aW1.m,4,w1_sz,f);fwrite(aW1.v,4,w1_sz,f);
+                fwrite(aW2.m,4,w2_sz,f);fwrite(aW2.v,4,w2_sz,f);
+                fwrite(aW3.m,4,w3_sz,f);fwrite(aW3.v,4,w3_sz,f);
+                fwrite(arms1.m,4,DIM,f);fwrite(arms1.v,4,DIM,f);
+                fwrite(arms2.m,4,DIM,f);fwrite(arms2.v,4,DIM,f);
+                fclose(f);
+                printf("[exec() restart step %d, %d compiles, loss=%.6f]\n", step, g_compile_count, last_loss);
+                fflush(stdout);
+                execl(argv[0], argv[0], "--resume", NULL);
+                perror("execl"); return 1;
+            }
+
+            // Compile 5 weight-bearing kernels (sdpaBwd2 compiled once above)
+            uint64_t tc = mach_absolute_time();
+            free_kern(kFwdAttn);free_kern(kFwdFFN);free_kern(kFFNBwd);
+            free_kern(kSdpaBwd1);free_kern(kQKVb);
+
+            kFwdAttn = compile_kern_mil_w(gen_sdpa_fwd_taps(), (@{
+                @"@model_path/weights/rms1.bin": @{@"offset":@0, @"data":build_blob(rms1_w,1,DIM)},
+                @"@model_path/weights/wq.bin": @{@"offset":@0, @"data":build_blob(Wq,DIM,DIM)},
+                @"@model_path/weights/wk.bin": @{@"offset":@0, @"data":build_blob(Wk,DIM,DIM)},
+                @"@model_path/weights/wv.bin": @{@"offset":@0, @"data":build_blob(Wv,DIM,DIM)},
+                @"@model_path/weights/wo.bin": @{@"offset":@0, @"data":build_blob(Wo,DIM,DIM)},
+                @"@model_path/weights/mask.bin": @{@"offset":@0, @"data":get_mask_blob()},
+            }), DIM*SEQ*2, 6*DIM*SEQ*2);
+
+            kFwdFFN = compile_kern_mil_w(gen_ffn_fwd_taps(), (@{
+                @"@model_path/weights/rms2.bin": @{@"offset":@0, @"data":build_blob(rms2_w,1,DIM)},
+                @"@model_path/weights/w1.bin": @{@"offset":@0, @"data":build_blob(W1,HIDDEN,DIM)},
+                @"@model_path/weights/w3.bin": @{@"offset":@0, @"data":build_blob(W3,HIDDEN,DIM)},
+                @"@model_path/weights/w2.bin": @{@"offset":@0, @"data":build_blob(W2,DIM,HIDDEN)},
+            }), DIM*SEQ*2, (2*DIM+3*HIDDEN)*SEQ*2);
+
+            kFFNBwd = compile_kern_mil_w(gen_ffn_bwd(), (@{
+                @"@model_path/weights/w2t.bin": @{@"offset":@0, @"data":build_blob_t(W2,DIM,HIDDEN)},
+                @"@model_path/weights/w1t.bin": @{@"offset":@0, @"data":build_blob_t(W1,HIDDEN,DIM)},
+                @"@model_path/weights/w3t.bin": @{@"offset":@0, @"data":build_blob_t(W3,HIDDEN,DIM)},
+            }), (DIM+2*HIDDEN)*SEQ*2, (DIM+2*HIDDEN)*SEQ*2);
+
+            kSdpaBwd1 = compile_kern_mil_w(gen_sdpa_bwd1(), (@{
+                @"@model_path/weights/mask.bin": @{@"offset":@0, @"data":get_mask_blob()},
+                @"@model_path/weights/wot.bin": @{@"offset":@0, @"data":build_blob_t(Wo,DIM,DIM)},
+            }), 4*DIM*SEQ*2, (DIM+2*SCORE_CH)*SEQ*2);
+
+            kQKVb = compile_kern_mil_w(gen_qkvb(), (@{
+                @"@model_path/weights/wqt.bin": @{@"offset":@0, @"data":build_blob_t(Wq,DIM,DIM)},
+                @"@model_path/weights/wkt.bin": @{@"offset":@0, @"data":build_blob_t(Wk,DIM,DIM)},
+                @"@model_path/weights/wvt.bin": @{@"offset":@0, @"data":build_blob_t(Wv,DIM,DIM)},
+            }), 3*DIM*SEQ*2, DIM*SEQ*2);
+
+            double cms = tb_ms(mach_absolute_time() - tc);
+            total_compile_ms += cms;
+            if (!kFwdAttn||!kFwdFFN||!kFFNBwd||!kSdpaBwd1||!kQKVb) {
+                printf("Compile failed at step %d, restart\n", step);
+                g_compile_count = MAX_COMPILES; continue;
+            }
+
+            // === Training loop ===
+            memset(gWq,0,wq_sz*4);memset(gWk,0,wq_sz*4);memset(gWv,0,wq_sz*4);memset(gWo,0,wo_sz*4);
+            memset(gW1,0,w1_sz*4);memset(gW2,0,w2_sz*4);memset(gW3,0,w3_sz*4);
+            memset(grms1,0,DIM*4);memset(grms2,0,DIM*4);
+            int steps_batch = 0;
+            uint64_t tt = mach_absolute_time();
+
+            double t_ane=0,t_io=0,t_elem=0,t_rms=0,t_cblas_wait=0;
+            for (int a=0; a<ACCUM_STEPS && step<total_steps; a++, step++) {
+                uint64_t t0,t1;
+                // ===== FORWARD =====
+                // Attention fwd (ANE does rmsnorm internally): x_in → o_out,Q,K,V,attn_out,xnorm
+                t0=mach_absolute_time();
+                io_write_fp16(kFwdAttn->ioIn, x_in, DIM, SEQ);
+                t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
+                ane_eval(kFwdAttn);
+                t1=mach_absolute_time(); t_ane+=tb_ms(t1-t0); t0=t1;
+                // Wait for prev step's dW cblas before reading attn_out/xnorm
+                dispatch_group_wait(dw_grp, DISPATCH_TIME_FOREVER);
+                t1=mach_absolute_time(); t_cblas_wait+=tb_ms(t1-t0); t0=t1;
+                io_read_fp16(kFwdAttn->ioOut, o_out,    0,     DIM, SEQ);
+                io_read_fp16(kFwdAttn->ioOut, attn_out, 4*DIM, DIM, SEQ);
+                io_read_fp16(kFwdAttn->ioOut, xnorm,    5*DIM, DIM, SEQ);
+                t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
+
+                for(int i=0;i<SEQ*DIM;i++) x2[i] = x_in[i] + o_out[i];
+                t1=mach_absolute_time(); t_elem+=tb_ms(t1-t0); t0=t1;
+
+                // FFN fwd (ANE does rmsnorm internally): x2 → ffn_out,h1,h3,silu_out,x2norm
+                io_write_fp16(kFwdFFN->ioIn, x2, DIM, SEQ);
+                t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
+                ane_eval(kFwdFFN);
+                t1=mach_absolute_time(); t_ane+=tb_ms(t1-t0); t0=t1;
+                io_read_fp16(kFwdFFN->ioOut, ffn_out,  0,              DIM,    SEQ);
+                io_read_fp16(kFwdFFN->ioOut, h1,       DIM,            HIDDEN, SEQ);
+                io_read_fp16(kFwdFFN->ioOut, h3,       DIM+HIDDEN,     HIDDEN, SEQ);
+                io_read_fp16(kFwdFFN->ioOut, silu_out, DIM+2*HIDDEN,   HIDDEN, SEQ);
+                io_read_fp16(kFwdFFN->ioOut, x2norm,   DIM+3*HIDDEN,   DIM,    SEQ);
+                t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
+
+                // Residual + Loss
+                for(int i=0;i<SEQ*DIM;i++) y_out[i] = x2[i] + ffn_out[i];
+                float loss = 0;
+                for(int i=0;i<SEQ*DIM;i++){
+                    float d = y_out[i]-y_tgt[i]; loss += d*d;
+                    dy[i] = 2.0f*d/(SEQ*DIM);
+                }
+                loss /= (SEQ*DIM);
+                last_loss = loss;
+                memcpy(dffn, dy, SEQ*DIM*4);
+                t1=mach_absolute_time(); t_elem+=tb_ms(t1-t0); t0=t1;
+
+                // ===== BACKWARD =====
+                // FFN backward (ANE)
+                io_write_fp16_at(kFFNBwd->ioIn, 0, dffn, DIM, SEQ);
+                io_copy(kFFNBwd->ioIn, DIM, kFwdFFN->ioOut, DIM, 2*HIDDEN, SEQ);
+                t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
+                ane_eval(kFFNBwd);
+                t1=mach_absolute_time(); t_ane+=tb_ms(t1-t0); t0=t1;
+                io_read_fp16(kFFNBwd->ioOut, dx_ffn, 0,           DIM,    SEQ);
+                io_read_fp16(kFFNBwd->ioOut, dh1,    DIM,         HIDDEN, SEQ);
+                io_read_fp16(kFFNBwd->ioOut, dh3,    DIM+HIDDEN,  HIDDEN, SEQ);
+                t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
+
+                // dW FFN async (overlaps with rmsnorm2_bwd + SDPA)
+                dispatch_group_async(dw_grp, dw_q, ^{
+                    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, HIDDEN, SEQ,
+                                1.0f, dffn, SEQ, silu_out, SEQ, 1.0f, gW2, HIDDEN);
+                    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, HIDDEN, DIM, SEQ,
+                                1.0f, dh1, SEQ, x2norm, SEQ, 1.0f, gW1, DIM);
+                    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, HIDDEN, DIM, SEQ,
+                                1.0f, dh3, SEQ, x2norm, SEQ, 1.0f, gW3, DIM);
+                });
+
+                // RMSNorm2 backward — runs in parallel with dW FFN
+                memset(dx2, 0, SEQ*DIM*4);
+                rmsnorm_bwd(dx2, grms2, dx_ffn, x2, rms2_w, DIM, SEQ);
+                for(int i=0;i<SEQ*DIM;i++) dx2[i] += dy[i];
+                t1=mach_absolute_time(); t_rms+=tb_ms(t1-t0); t0=t1;
+
+                // dWo async (overlaps with SDPA backward)
+                memcpy(do_out_buf, dx2, SEQ*DIM*4);
+                dispatch_group_async(dw_grp, dw_q, ^{
+                    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, DIM, SEQ,
+                                1.0f, do_out_buf, SEQ, attn_out, SEQ, 1.0f, gWo, DIM);
+                });
+
+                // SDPA backward (ANE) — includes Wo^T conv
+                io_copy(kSdpaBwd1->ioIn, 0, kFwdAttn->ioOut, DIM, 3*DIM, SEQ);
+                io_write_fp16_at(kSdpaBwd1->ioIn, 3*DIM, dx2, DIM, SEQ);
+                t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
+                ane_eval(kSdpaBwd1);
+                t1=mach_absolute_time(); t_ane+=tb_ms(t1-t0); t0=t1;
+                io_copy(kSdpaBwd2->ioIn, 0, kSdpaBwd1->ioOut, DIM, 2*SCORE_CH, SEQ);
+                io_copy(kSdpaBwd2->ioIn, 2*SCORE_CH, kFwdAttn->ioOut, DIM, 2*DIM, SEQ);
+                t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
+                ane_eval(kSdpaBwd2);
+                t1=mach_absolute_time(); t_ane+=tb_ms(t1-t0); t0=t1;
+
+                // Read dq,dk,dv — dW FFN+dWo still running async on serial queue
+                io_read_fp16(kSdpaBwd2->ioOut, dq, 0,   DIM, SEQ);
+                io_read_fp16(kSdpaBwd2->ioOut, dk, DIM,  DIM, SEQ);
+                io_read_fp16(kSdpaBwd1->ioOut, dv, 0,    DIM, SEQ);
+                t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
+                // dWq/dWk/dWv queues after dWo on serial queue — no wait needed
+                dispatch_group_async(dw_grp, dw_q, ^{
+                    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, DIM, SEQ,
+                                1.0f, dq, SEQ, xnorm, SEQ, 1.0f, gWq, DIM);
+                    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, DIM, SEQ,
+                                1.0f, dk, SEQ, xnorm, SEQ, 1.0f, gWk, DIM);
+                    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, DIM, SEQ,
+                                1.0f, dv, SEQ, xnorm, SEQ, 1.0f, gWv, DIM);
+                });
+
+                // QKV backward (ANE) — dWq/dWk/dWv runs async
+                io_copy(kQKVb->ioIn, 0, kSdpaBwd2->ioOut, 0, 2*DIM, SEQ);
+                io_copy(kQKVb->ioIn, 2*DIM, kSdpaBwd1->ioOut, 0, DIM, SEQ);
+                t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
+                ane_eval(kQKVb);
+                t1=mach_absolute_time(); t_ane+=tb_ms(t1-t0); t0=t1;
+                io_read_fp16(kQKVb->ioOut, dx_attn, 0, DIM, SEQ);
+                t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
+
+                // RMSNorm1 backward (CPU) — doesn't touch cblas buffers
+                float *dx_rms = calloc(SEQ*DIM, 4);
+                rmsnorm_bwd(dx_rms, grms1, dx_attn, x_in, rms1_w, DIM, SEQ);
+                free(dx_rms);
+                t1=mach_absolute_time(); t_rms+=tb_ms(t1-t0);
+
+                steps_batch++;
+                if (step % 10 == 0 || step == start_step)
+                    printf("step %-4d loss=%.6f\n", step, loss);
+            }
+            double tms = tb_ms(mach_absolute_time() - tt);
+            total_train_ms += tms;
+            total_steps_done += steps_batch;
+            total_batches++;
+
+            // Ensure all async dW finished before Adam
+            dispatch_group_wait(dw_grp, DISPATCH_TIME_FOREVER);
+            // Adam update (scale gradients by 1/steps_batch for averaging)
+            float gsc = 1.0f / steps_batch;
+            for(size_t i=0;i<wq_sz;i++){gWq[i]*=gsc;gWk[i]*=gsc;gWv[i]*=gsc;gWo[i]*=gsc;}
+            for(size_t i=0;i<w1_sz;i++) gW1[i]*=gsc;
+            for(size_t i=0;i<w2_sz;i++) gW2[i]*=gsc;
+            for(size_t i=0;i<w3_sz;i++) gW3[i]*=gsc;
+            for(int i=0;i<DIM;i++){grms1[i]*=gsc; grms2[i]*=gsc;}
+            adam_t++;
+            adam_update(Wq, gWq, &aWq, adam_t, lr, adam_b1, adam_b2, adam_eps);
+            adam_update(Wk, gWk, &aWk, adam_t, lr, adam_b1, adam_b2, adam_eps);
+            adam_update(Wv, gWv, &aWv, adam_t, lr, adam_b1, adam_b2, adam_eps);
+            adam_update(Wo, gWo, &aWo, adam_t, lr, adam_b1, adam_b2, adam_eps);
+            adam_update(W1, gW1, &aW1, adam_t, lr, adam_b1, adam_b2, adam_eps);
+            adam_update(W2, gW2, &aW2, adam_t, lr, adam_b1, adam_b2, adam_eps);
+            adam_update(W3, gW3, &aW3, adam_t, lr, adam_b1, adam_b2, adam_eps);
+            adam_update(rms1_w, grms1, &arms1, adam_t, lr, adam_b1, adam_b2, adam_eps);
+            adam_update(rms2_w, grms2, &arms2, adam_t, lr, adam_b1, adam_b2, adam_eps);
+
+            printf("  [batch %d: compile=%.0fms train=%.1fms (%.1fms/step) compiles=%d]\n",
+                   steps_batch, cms, tms, tms/steps_batch, g_compile_count);
+            printf("    ane=%.1f io=%.1f elem=%.1f rms=%.1f cblas_wait=%.1f ms/step\n",
+                   t_ane/steps_batch, t_io/steps_batch, t_elem/steps_batch,
+                   t_rms/steps_batch, t_cblas_wait/steps_batch);
+        }
+
+        // === Efficiency Report ===
+        double wall = tb_ms(mach_absolute_time() - t_wall_start);
+        total_compile_ms += cum_compile; total_train_ms += cum_train;
+        wall += cum_wall; total_steps_done += cum_steps; total_batches += cum_batches;
+        double fwd_flops = 4.0*2*DIM*DIM*SEQ + 2.0*2*DIM*HIDDEN*SEQ + 2.0*HIDDEN*DIM*SEQ;
+        double bwd_dx_flops = fwd_flops;
+        double sdpa_flops = 2.0*HEADS*5*SEQ*SEQ*HD;
+        double ane_flops = (fwd_flops + bwd_dx_flops + sdpa_flops) * total_steps_done;
+        double total_flops = (fwd_flops*3 + sdpa_flops) * total_steps_done; // fwd+bwd_dx+bwd_dw+sdpa
+
+        printf("\n=== Efficiency Report ===\n");
+        printf("Total steps:     %d\n", total_steps_done);
+        printf("Wall time:       %.0f ms (%.1f s)\n", wall, wall/1000);
+        printf("Compile time:    %.0f ms (%.1f%%)\n", total_compile_ms, 100*total_compile_ms/wall);
+        printf("Train time:      %.0f ms (%.1f%%)\n", total_train_ms, 100*total_train_ms/wall);
+        printf("Avg compile:     %.0f ms per batch (5 kernels)\n", total_compile_ms/total_batches);
+        printf("Avg train:       %.1f ms/step\n", total_train_ms/total_steps_done);
+        printf("ANE TFLOPS:      %.2f sustained\n", ane_flops / (total_train_ms * 1e9));
+        printf("Total TFLOPS:    %.2f (ANE+CPU)\n", total_flops / (total_train_ms * 1e9));
+        printf("ANE utilization: %.1f%% of 15.8 TFLOPS\n", 100*ane_flops/(total_train_ms*1e9)/15.8);
+        printf("Params:          %.2fM  Weights: %.1fMB FP16\n", total_params/1e6, total_params*2.0/1e6);
+
+        // Cleanup
+        free_kern(kFwdAttn);free_kern(kFwdFFN);free_kern(kFFNBwd);
+        free_kern(kSdpaBwd1);free_kern(kSdpaBwd2);free_kern(kQKVb);
+        free(Wq);free(Wk);free(Wv);free(Wo);free(W1);free(W2);free(W3);
+        free(rms1_w);free(rms2_w);free(x_in);free(y_tgt);
+        free(xnorm);free(Q);free(K);free(V);free(attn_out);free(o_out);
+        free(x2);free(x2norm);free(h1);free(h3);free(silu_out);free(ffn_out);free(y_out);
+        free(dy);free(dffn);free(dh1);free(dh3);free(dx_ffn);free(dx2);
+        free(do_out_buf);free(dattn);free(dq);free(dk);free(dv);free(dx_attn);
+        free(probs_flat);free(dp_flat);
+        free(gWq);free(gWk);free(gWv);free(gWo);free(gW1);free(gW2);free(gW3);
+        free(grms1);free(grms2);
+        adam_free(&aWq);adam_free(&aWk);adam_free(&aWv);adam_free(&aWo);
+        adam_free(&aW1);adam_free(&aW2);adam_free(&aW3);adam_free(&arms1);adam_free(&arms2);
+        unlink(CKPT_PATH);
+    }
+    return 0;
+}