diff --git a/training/Makefile b/training/Makefile
index 90c2977..226bb39 100644
--- a/training/Makefile
+++ b/training/Makefile
@@ -3,10 +3,18 @@ CFLAGS = -O2 -Wall -Wno-deprecated-declarations -fobjc-arc
 FRAMEWORKS = -framework Foundation -framework CoreML -framework IOSurface
 LDFLAGS = $(FRAMEWORKS) -ldl
 
+HEADERS_LARGE = stories_config.h stories_io.h stories_mil.h stories_cpu_ops.h
+
 train: train.m ane_runtime.h ane_mil_gen.h model.h forward.h backward.h
 	$(CC) $(CFLAGS) -o $@ train.m $(LDFLAGS)
 
-clean:
-	rm -f train
+train_large: train_large.m $(HEADERS_LARGE)  # rebuilt when the source or any header in HEADERS_LARGE changes
+	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) -framework Accelerate
 
-.PHONY: clean
+tokenize:  ## Extract pretokenized TinyStories data (runs tokenize.py)
+	python3 tokenize.py
+
+clean:  ## Remove the built train and train_large binaries
+	rm -f train train_large
+
+.PHONY: clean tokenize
diff --git a/training/README.md b/training/README.md
new file mode 100644
index 0000000..53edbb9
--- /dev/null
+++ b/training/README.md
@@ -0,0 +1,69 @@
+# ANE Training — Stories110M on Apple Neural Engine
+
+Training a 109M-parameter Llama2-architecture transformer (Stories110M) directly on Apple's Neural Engine using private ANE APIs.
+
+![Dashboard](dashboard.gif)
+
+## Architecture
+
+- **Model**: Stories110M — dim=768, hidden=2048, heads=12, layers=12, vocab=32000, seq=256
+- **109.53M params** (84.95M transformer + 24.58M embedding)
+- **72 ANE kernels** per compile (60 weight-bearing, 12 weight-free sdpaBwd2)
+- **6 kernel types per layer**: fwdAttn, fwdFFN, ffnBwd, sdpaBwd1, sdpaBwd2, qkvBwd
+
+## Performance
+
+| Component | Time (ms/step) |
+|-----------|---------------|
+| ANE eval | 9.6 |
+| IO (fp16 conversion) | 4.1 |
+| Classifier (cblas) | 9.1 |
+| Cross-entropy + residuals | 14.4 |
+| RMSNorm | 0.1 |
+| **Total** | **107 ms/step** |
+
+## Files
+
+| File | Description |
+|------|-------------|
+| `train_large.m` | Main training loop — 12-layer forward/backward, checkpoint, exec() restart |
+| `stories_config.h` | Model config, structs, alloc helpers |
+| `stories_io.h` | IOSurface I/O, NEON fp16 conversion, kernel compile/eval |
+| `stories_mil.h` | MIL program generators for all 6 ANE kernel types |
+| `stories_cpu_ops.h` | vDSP-vectorized RMSNorm, cross-entropy, Adam, embedding ops |
+| `dashboard.py` | TUI dashboard — loss curve, power/CPU/memory graphs, text generation |
+| `tokenize.py` | Extract pretokenized TinyStories data |
+| `Makefile` | Build targets |
+
+## How it works
+
+1. **Forward pass**: Each layer runs fwdAttn (QKV + SDPA + Wo) and fwdFFN (SwiGLU: W2 · (SiLU(W1·x) ⊙ W3·x)) on ANE via MIL-compiled kernels. Final RMSNorm + classifier matmul on CPU (cblas).
+
+2. **Backward pass**: Reverse layer order. ffnBwd, sdpaBwd1, sdpaBwd2, qkvBwd on ANE. Weight gradients (dW) via async cblas_sgemm on CPU. RMSNorm backward via vDSP.
+
+3. **Compile budget**: The ANE allows only ~119 compiles per process. Each batch compiles 72 kernels, so we run 10 accumulation steps, save a checkpoint, and then restart the process with `exec()`, resuming from that checkpoint.
+
+4. **Data**: Real TinyStories text (20M tokens), mmap'd uint16 token IDs, random position sampling per step.
+
+## Usage
+
+```bash
+# Extract tokenized data
+python3 tokenize.py
+
+# Build and train
+make train_large
+./train_large                    # fresh start
+./train_large --resume           # resume from checkpoint
+
+# Monitor with dashboard
+pip install blessed psutil numpy
+python3 dashboard.py --resume    # needs sudo for powermetrics
+```
+
+## Key techniques
+
+- **NEON vectorized fp16<->fp32**: ARM NEON intrinsics for fast IOSurface data transfer
+- **vDSP cross-entropy**: `vDSP_mtrans` + `vvexpf` + `vDSP_sve` — 8x faster than scalar
+- **Async weight gradients**: cblas_sgemm dispatched to background queue, overlapped with ANE
+- **SDPA causal mask workaround**: ANE hardware ignores attn_mask, so we decompose attention into Q@K^T (ANE conv) + mask+softmax (CPU) + scores@V (ANE conv)
diff --git a/training/dashboard.gif b/training/dashboard.gif
new file mode 100644
index 0000000..120f7d5
Binary files /dev/null and b/training/dashboard.gif differ
diff --git a/training/dashboard.py b/training/dashboard.py
new file mode 100644
index 0000000..a3a1503
--- /dev/null
+++ b/training/dashboard.py
@@ -0,0 +1,882 @@
+"""TUI dashboard for ANE training (train_large). Uses blessed for terminal UI."""
+
+import argparse, fcntl, math, os, re, select, signal, struct, subprocess, sys, time, threading
+from collections import deque
+from pathlib import Path
+
+import numpy as np
+
+try:
+    from blessed import Terminal
+except ImportError:
+    print('pip install blessed')
+    sys.exit(1)
+
+try:
+    import psutil
+    HAS_PSUTIL = True
+except ImportError:
+    HAS_PSUTIL = False
+
+DIM, HIDDEN, HEADS, SEQ, VOCAB, NLAYERS = 768, 2048, 12, 256, 32000, 12
+HD = DIM // HEADS
+CKPT_PATH = 'ane_stories110M_ckpt.bin'
+TOKENIZER_PATH = str(Path(__file__).resolve().parent.parent.parent / 'assets' / 'models' / 'tokenizer.bin')
+
+
+class State:  # Mutable dashboard state shared by parse_line(), the background threads and draw()
+    def __init__(self):
+        self.model_config = {}  # dim/hidden/heads/seq/vocab/layers parsed from the trainer's output
+        self.params = {}  # parameter counts in millions: total / transformer / embed
+        self.kernels = {}  # kernel counts: total / weight_bearing
+        self.training = {}  # 'accum' (int) and 'lr' (kept as the matched string)
+        self.flops = {}  # FLOP figures parsed from the trainer's output
+        self.step = 0
+        self.total_steps = 0
+        self.loss = 0.0
+        self.best_loss = float('inf')
+        self.loss_history = []  # (step, loss) tuples, one per parsed step line
+        self.ms_per_step = 0.0
+        self.compile_pct = 0.0  # compile time as a percentage of compile + train time for the last batch line
+        self.compiles = 0
+        self.component_timing = {}  # ane / io / cls / elem / rms / cblas_wait values from the timing line
+        self.power = {'ane': 0.0, 'cpu': 0.0, 'gpu': 0.0}  # watts (powermetrics mW / 1000)
+        self.power_history_ane = deque(maxlen=300)  # (monotonic time, watts)
+        self.power_history_cpu = deque(maxlen=300)  # (monotonic time, watts)
+        self.logs = deque(maxlen=2000)  # most recent output lines
+        self.log_scroll = 0
+        self.auto_scroll = True
+        self.batch_num = 0
+        self.efficiency = {}
+        self.gen_text = ''
+        self.gen_step = 0
+        self.gen_status = 'idle'  # 'idle', 'generating' or 'done'
+        self.gen_lock = threading.Lock()  # held while the gen_* fields are read or written
+        self.cpu_pct_history = deque(maxlen=300)
+        self.mem_mb_history = deque(maxlen=300)  # used system memory in MB
+        self.proc_mem_mb_history = deque(maxlen=300)  # RSS in MB for the process identified by train_pid
+        self.train_pid = None  # pid of the process started by spawn_training()
+
+S = State()
+
+
+class Tokenizer:
+    """Vocabulary read from a binary file: one int32, then VOCAB records of (float32 score, int32 length, UTF-8 bytes)."""
+    def __init__(self, path):
+        self.vocab, self.scores = [], []
+        with open(path, 'rb') as f:
+            f.read(4)  # leading int32 header (read as max_len originally); not needed by decode()
+            for _ in range(VOCAB):
+                score, slen = struct.unpack('fi', f.read(8))
+                tok = f.read(slen).decode('utf-8', errors='replace')
+                self.vocab.append(tok)
+                self.scores.append(score)
+
+    def decode(self, token_id):
+        """Return the text of token_id, or '' if it is out of range; '<0xNN>' byte tokens map to chr(0xNN)."""
+        if not 0 <= token_id < len(self.vocab):
+            return ''
+        s = self.vocab[token_id]
+        if s.startswith('<0x') and s.endswith('>'):
+            try:
+                return chr(int(s[3:-1], 16))
+            except (ValueError, OverflowError):
+                pass  # not a usable hex byte token: fall through and return it verbatim
+        return s
+
+_tokenizer = None
+def get_tokenizer():
+    """Return the shared Tokenizer, loading it on first use; a failed load is logged to S.logs and yields None."""
+    global _tokenizer
+    if _tokenizer is None:
+        try:
+            _tokenizer = Tokenizer(TOKENIZER_PATH)
+        except Exception as exc:
+            S.logs.append(f'[gen] tokenizer load failed: {exc}')
+    return _tokenizer
+
+
+def load_weights_from_ckpt(path):
+    """Load the fp32 model weights from a checkpoint file, skipping the Adam state stored with each tensor group.
+    Returns a dict of numpy arrays keyed 'Wq{L}', 'Wk{L}', 'Wv{L}', 'Wo{L}', 'W1_{L}', 'W2_{L}', 'W3_{L}',
+    'rms1_{L}', 'rms2_{L}' (L = layer index), 'rms_final' and 'embed'. Returns None if the header is short;
+    any other failure, including a truncated tensor, is logged to S.logs and also yields None.
+    """
+    def take(f, *shape):
+        # Read exactly prod(shape) float32 values; .copy() makes the array writable and independent of the buffer.
+        nbytes = int(np.prod(shape)) * 4
+        raw = f.read(nbytes)
+        if len(raw) != nbytes:
+            raise ValueError(f'truncated checkpoint: wanted {nbytes} bytes, got {len(raw)}')
+        return np.frombuffer(raw, dtype=np.float32).reshape(shape).copy()
+
+    # Per-layer tensors in on-disk order. The Adam state skipped per layer holds two values (m, v) per weight.
+    layout = [('Wq{}', (DIM, DIM)), ('Wk{}', (DIM, DIM)), ('Wv{}', (DIM, DIM)), ('Wo{}', (DIM, DIM)),
+              ('W1_{}', (HIDDEN, DIM)), ('W2_{}', (DIM, HIDDEN)), ('W3_{}', (HIDDEN, DIM)),
+              ('rms1_{}', (DIM,)), ('rms2_{}', (DIM,))]
+    adam_per_layer = 2 * sum(int(np.prod(shape)) for _, shape in layout)
+    try:
+        with open(path, 'rb') as f:
+            if len(f.read(96)) < 96:  # CkptHdr is 96 bytes; none of its fields are used here
+                return None
+            W = {}
+            for L in range(NLAYERS):
+                for key, shape in layout:
+                    W[key.format(L)] = take(f, *shape)
+                f.seek(adam_per_layer * 4, 1)  # skip this layer's Adam state
+            W['rms_final'] = take(f, DIM)
+            f.seek(DIM * 2 * 4, 1)  # skip rms_final adam
+            W['embed'] = take(f, VOCAB, DIM)
+            return W
+    except Exception as e:
+        S.logs.append(f'[gen] ckpt load failed: {e}')
+        return None
+
+
+def rmsnorm(x, w):
+    inv_rms = 1.0 / math.sqrt(np.mean(x * x) + 1e-5)  # 1e-5 keeps the divide finite for an all-zero x
+    return x * inv_rms * w
+
+def softmax(x):
+    """Softmax over all elements of x; the maximum is subtracted first so exp() cannot overflow."""
+    e = np.exp(x - np.max(x))
+    return e / np.sum(e)
+
+def generate_text(W, tok, max_tokens=64, temperature=0.8):
+    """Sample up to max_tokens tokens from the weights in W, starting from token id 1, and return the decoded text.
+
+    W is the dict built by load_weights_from_ckpt(). tok is the Tokenizer used for decoding; when it is
+    None the shared one from get_tokenizer() is used, and '[no tokenizer]' is returned if that is missing
+    too. temperature < 0.01 means greedy (argmax) decoding. Generation stops at token id 2 or once the
+    context would exceed SEQ positions. Tokens are fed one at a time; each layer caches the keys/values
+    of earlier positions so that every new token attends causally to the whole prefix.
+    """
+    tokenizer = tok if tok is not None else get_tokenizer()
+    if tokenizer is None:
+        return '[no tokenizer]'
+
+    tokens = [1]
+    text_parts = []
+
+    # RoPE angles: freqs[pos, i] = pos / 10000 ** (2 * i / HD)
+    inv_freq = 1.0 / (10000.0 ** (2.0 * np.arange(HD // 2) / HD))
+    freqs = np.outer(np.arange(SEQ), inv_freq).astype(np.float32)
+
+    def rope(vec, pos):
+        # Rotate each (2i, 2i+1) pair within every head by freqs[pos, i]; returns shape (HEADS, HD).
+        pairs = vec.reshape(HEADS, HD // 2, 2)
+        cos_v, sin_v = np.cos(freqs[pos]), np.sin(freqs[pos])
+        out = np.empty_like(pairs)
+        out[..., 0] = pairs[..., 0] * cos_v - pairs[..., 1] * sin_v
+        out[..., 1] = pairs[..., 0] * sin_v + pairs[..., 1] * cos_v
+        return out.reshape(HEADS, HD)
+
+    # Per-layer key/value cache indexed [layer, position, head, channel].
+    k_cache = np.zeros((NLAYERS, SEQ, HEADS, HD), dtype=np.float32)
+    v_cache = np.zeros((NLAYERS, SEQ, HEADS, HD), dtype=np.float32)
+
+    for _ in range(max_tokens):
+        pos = len(tokens) - 1
+        if pos >= SEQ:
+            break
+
+        x = W['embed'][tokens[-1]].copy()
+
+        for L in range(NLAYERS):
+            # RMSNorm + QKV, with RoPE applied to q and k
+            xn = rmsnorm(x, W[f'rms1_{L}'])
+            q = rope(W[f'Wq{L}'] @ xn, pos)
+            k_cache[L, pos] = rope(W[f'Wk{L}'] @ xn, pos)
+            v_cache[L, pos] = (W[f'Wv{L}'] @ xn).reshape(HEADS, HD)
+
+            # Causal attention of the new token over positions 0..pos, per head
+            keys, vals = k_cache[L, :pos + 1], v_cache[L, :pos + 1]
+            scores = np.einsum('hd,thd->ht', q, keys) / math.sqrt(HD)
+            scores = scores - scores.max(axis=1, keepdims=True)  # stabilise exp()
+            probs = np.exp(scores)
+            probs = probs / probs.sum(axis=1, keepdims=True)
+            o = np.einsum('ht,thd->hd', probs, vals).reshape(DIM)
+
+            # Residual + output projection
+            x2 = x + W[f'Wo{L}'] @ o
+
+            # FFN
+            x2n = rmsnorm(x2, W[f'rms2_{L}'])
+            h1 = W[f'W1_{L}'] @ x2n
+            h3 = W[f'W3_{L}'] @ x2n
+            # SiLU(h1) gated by h3
+            h1 = h1 * (1.0 / (1.0 + np.exp(-h1))) * h3
+            x = x2 + W[f'W2_{L}'] @ h1
+
+        x = rmsnorm(x, W['rms_final'])
+
+        # Logits (the embedding matrix doubles as the classifier)
+        logits = W['embed'] @ x
+
+        if temperature < 0.01:
+            next_tok = int(np.argmax(logits))
+        else:
+            probs = softmax(logits / temperature)
+            next_tok = int(np.random.choice(VOCAB, p=probs))
+
+        if next_tok == 2:
+            break
+        tokens.append(next_tok)
+        text_parts.append(tokenizer.decode(next_tok))
+
+    return ''.join(text_parts)
+
+
+def generation_thread():
+    """Background worker: roughly every 100 training steps, sample text from the checkpoint into S.gen_text."""
+    last_gen_step = -1
+    while True:
+        time.sleep(5)
+        if S.step <= last_gen_step + 99 or not os.path.exists(CKPT_PATH):
+            continue
+        ckpt_step = S.step  # step seen when this checkpoint was picked up; shown next to the text
+        with S.gen_lock:
+            S.gen_status = 'generating'
+            S.gen_step = ckpt_step
+        try:
+            tok = get_tokenizer()
+            W = load_weights_from_ckpt(CKPT_PATH) if tok is not None else None  # skip the big read if undecodable
+            if tok is None:
+                text = '[no tokenizer]'
+            elif W is None:
+                with S.gen_lock:
+                    S.gen_status = 'idle'
+                continue
+            else:
+                text = generate_text(W, tok, max_tokens=64, temperature=0.8)
+            with S.gen_lock:
+                S.gen_text, S.gen_step, S.gen_status = text, ckpt_step, 'done'
+        except Exception as e:
+            with S.gen_lock:
+                S.gen_text, S.gen_status = f'[error: {e}]', 'done'
+        last_gen_step = S.step
+
+
+def sysmetrics_thread():
+    """Once per second, record system CPU %, used system memory (MB) and the training process tree's RSS (MB) in S."""
+    while HAS_PSUTIL:  # without psutil there is nothing to sample, so the thread just exits
+        time.sleep(1)
+        S.cpu_pct_history.append(psutil.cpu_percent(interval=None))
+        mem = psutil.virtual_memory()
+        S.mem_mb_history.append(mem.used / (1024 * 1024))
+        if not S.train_pid:
+            continue
+        try:
+            # train_pid is the `bash -c` wrapper started by spawn_training(), so the trainer itself is a descendant.
+            root = psutil.Process(S.train_pid)
+            rss = sum(p.memory_info().rss for p in [root] + root.children(recursive=True))
+            S.proc_mem_mb_history.append(rss / (1024 * 1024))
+        except (psutil.NoSuchProcess, psutil.AccessDenied):
+            pass  # the tree changed mid-sample or is not inspectable; skip this sample
+
+
+RE_CONFIG = re.compile(r'dim=(\d+) hidden=(\d+) heads=(\d+) seq=(\d+) vocab=(\d+) layers=(\d+)')
+RE_PARAMS = re.compile(r'Params: ([\d.]+)M \(transformer ([\d.]+)M \+ embed ([\d.]+)M\)')
+RE_KERNELS = re.compile(r'Kernels: (\d+).*?(\d+) weight-bearing')
+RE_ACCUM = re.compile(r'Accum (\d+).*LR=([\d.e+-]+)')
+RE_STEP = re.compile(r'step\s+(\d+)\s+loss=([\d.]+)')
+RE_BATCH = re.compile(r'\[batch (\d+): compile=([\d.]+)ms train=([\d.]+)ms \(([\d.]+)ms/step\) compiles=(\d+)\]')
+RE_TIMING = re.compile(r'ane=([\d.]+) io=([\d.]+) cls=([\d.]+) elem=([\d.]+) rms=([\d.]+) cblas_wait=([\d.]+)')
+RE_RESTART = re.compile(r'\[exec\(\) restart step (\d+)')
+RE_RESUME = re.compile(r'\[RESUMED step (\d+), loss=([\d.]+)\]')
+RE_FLOPS = re.compile(r'FLOPs/step: fwd=([\d.]+)M bwd_dx=([\d.]+)M bwd_dW=([\d.]+)M sdpa_bwd=([\d.]+)M total=([\d.]+)M')
+RE_ANE_FLOPS = re.compile(r'ANE FLOPs/step: ([\d.]+)M')
+RE_ANE_TFLOPS = re.compile(r'ANE TFLOPS:\s+([\d.]+)')
+RE_ANE_UTIL = re.compile(r'ANE utilization:\s+([\d.]+)%')
+RE_EFFICIENCY = re.compile(r'(Total steps|Wall time|Compile time|Train time|Avg compile|Avg train|ANE TFLOPS|Total TFLOPS|ANE utilization):?\s+(.+)')
+RE_ANE_POWER = re.compile(r'ANE Power:\s+([\d.]+)\s*mW')
+RE_CPU_POWER = re.compile(r'CPU Power:\s+([\d.]+)\s*mW')
+RE_GPU_POWER = re.compile(r'GPU Power:\s+([\d.]+)\s*mW')
+
+def parse_line(line):
+    """Append one line of trainer output to S.logs and update S from the first pattern that matches it."""
+    S.logs.append(line)
+
+    def on_config(m):
+        S.model_config = dict(zip(['dim', 'hidden', 'heads', 'seq', 'vocab', 'layers'], map(int, m.groups())))
+
+    def on_params(m):
+        S.params = {'total': float(m[1]), 'transformer': float(m[2]), 'embed': float(m[3])}
+
+    def on_kernels(m):
+        S.kernels = {'total': int(m[1]), 'weight_bearing': int(m[2])}
+
+    def on_accum(m):
+        S.training = {'accum': int(m[1]), 'lr': m[2]}
+
+    def on_flops(m):
+        S.flops.update(fwd=float(m[1]), bwd_dx=float(m[2]), bwd_dw=float(m[3]),
+                       sdpa_bwd=float(m[4]), total=float(m[5]))
+
+    def on_step(m):
+        S.step, S.loss = int(m[1]), float(m[2])
+        S.loss_history.append((S.step, S.loss))
+        S.best_loss = min(S.best_loss, S.loss)
+
+    def on_batch(m):
+        S.batch_num = int(m[1])
+        compile_ms, train_ms = float(m[2]), float(m[3])
+        S.ms_per_step = float(m[4])
+        S.compiles = int(m[5])
+        S.compile_pct = 100 * compile_ms / (compile_ms + train_ms) if compile_ms + train_ms > 0 else 0
+
+    def on_timing(m):
+        S.component_timing = dict(zip(['ane', 'io', 'cls', 'elem', 'rms', 'cblas_wait'], map(float, m.groups())))
+
+    def set_flop(key):
+        def handler(m):
+            S.flops[key] = float(m[1])
+        return handler
+
+    def on_efficiency(m):
+        S.efficiency[m[1].strip()] = m[2].strip()
+
+    # Patterns are tried in this order and only the first hit is applied, so the specific
+    # ANE TFLOPS / ANE utilization patterns take precedence over the broader RE_EFFICIENCY.
+    handlers = (
+        (RE_CONFIG, on_config), (RE_PARAMS, on_params), (RE_KERNELS, on_kernels), (RE_ACCUM, on_accum),
+        (RE_FLOPS, on_flops), (RE_ANE_FLOPS, set_flop('ane')), (RE_STEP, on_step), (RE_BATCH, on_batch),
+        (RE_TIMING, on_timing), (RE_ANE_TFLOPS, set_flop('ane_tflops')), (RE_ANE_UTIL, set_flop('ane_util')),
+        (RE_EFFICIENCY, on_efficiency),
+    )
+    for pattern, handle in handlers:
+        m = pattern.search(line)
+        if m:
+            handle(m)
+            return
+
+
+
+def parse_powermetrics_text(text):
+    """Pull ANE/CPU/GPU power (mW) out of powermetrics text and store it in S.power as watts."""
+    stamp = time.monotonic()
+    channels = (
+        ('ane', RE_ANE_POWER, S.power_history_ane),
+        ('cpu', RE_CPU_POWER, S.power_history_cpu),
+        ('gpu', RE_GPU_POWER, None),  # GPU power has no history series
+    )
+    for key, pattern, history in channels:
+        m = pattern.search(text)
+        if m:
+            S.power[key] = float(m[1]) / 1000.0
+            if history is not None: history.append((stamp, S.power[key]))
+
+
+BRAILLE_BASE = 0x2800  # code point of the empty braille cell; dot bits are OR-ed into it
+
+BRAILLE_MAP = [  # dot bit for [sub-row 0..3][sub-column 0..1] within one character cell
+    [1, 8],
+    [2, 16],
+    [4, 32],
+    [64, 128],
+]
+
+def braille_chart(values, width, height, label_fmt='{:.1f}', y_range=None):
+    """Render values as a braille line chart and return it as a list of strings.
+
+    The result has `height` rows (5-character y label, a vertical bar, then width - 6 braille cells)
+    followed by one x-axis row. Each cell holds 2 x 4 dots, so only the last 2 * (width - 6) values
+    are drawn. y_range, when given, replaces the data's (min, max); the range is then padded by 5%
+    at both ends. label_fmt formats the top, middle and bottom labels, each cut to 5 characters.
+    Returns '(no data)' placeholder rows when values is empty, width < 8 or height < 2.
+    """
+    if not values or width < 8 or height < 2:
+        return ['(no data)'] * max(1, height)
+    cols = width - 6  # always >= 2 here because width >= 8
+    dots_x = cols * 2
+    dots_y = height * 4
+    # Keep only as many trailing samples as there are dot columns.
+    data = values[-dots_x:] if len(values) > dots_x else values
+    lo, hi = y_range if y_range else (min(data), max(data))
+    if hi - lo < 0.001:
+        lo, hi = lo - 0.5, hi + 0.5
+    pad = (hi - lo) * 0.05
+    lo = lo - pad
+    hi = hi + pad
+
+    cells = [[0] * cols for _ in range(height)]
+
+    def plot(px, py):
+        # Clamp to the dot grid, then light the dot inside its character cell.
+        px = min(max(px, 0), dots_x - 1)
+        py = min(max(py, 0), dots_y - 1)
+        cells[py // 4][px // 2] |= BRAILLE_MAP[py % 4][px % 2]
+
+    def val_to_y(v):
+        # Larger values map to smaller dot rows (row 0 is the top).
+        return int((1 - (v - lo) / (hi - lo)) * (dots_y - 1))
+
+    prev_y = None
+    for i, v in enumerate(data):
+        y = val_to_y(v)
+        plot(i, y)
+        if prev_y is not None and prev_y != y:
+            # Join consecutive points with a vertical run; dots sit at column i - 1 except the final one.
+            for yy in range(min(prev_y, y), max(prev_y, y) + 1):
+                plot(int(i - 1 + (yy - prev_y) / (y - prev_y)), yy)
+        prev_y = y
+
+    # Rows that carry a y-axis label; later keys win, so top beats bottom beats middle.
+    labelled = {height // 2: (hi + lo) / 2, height - 1: lo, 0: hi}
+    lines = []
+    for r, row_cells in enumerate(cells):
+        if r in labelled:
+            label = label_fmt.format(labelled[r])[:5].rjust(5)
+        else:
+            label = '     '
+        dots = ''.join(chr(BRAILLE_BASE | c) for c in row_cells)
+        lines.append(f'{label}\u2502{dots}')
+
+    lines.append('     \u2514' + '\u2500' * cols)
+    return lines
+
+
+
+def draw(term):
+    """Render one full frame from S: config header, loss curve + stats, power and CPU/memory charts,
+    generated text and the log pane. Output is assembled in a buffer and written to stdout in one go."""
+    w, h = term.width, term.height
+    if w < 40 or h < 15:
+        print(term.home + term.clear + 'Terminal too small', end='', flush=True)
+        return
+
+    buf = []
+
+    def put(y, x, text, style=''):
+        if 0 <= y < h and x < w:
+            text = text[:w - x]
+            buf.append(term.move(y, x) + (style + text + term.normal if style else text))
+
+    buf.append(term.home + term.clear)
+
+    mid_x = w // 2
+    right_w = w - mid_x - 1
+    left_w = mid_x - 1
+
+    row = 0
+
+    # Model Config header
+    hdr = '\u2500 Model Config '
+    put(row, 0, '\u250c' + hdr + '\u2500' * max(0, w - len(hdr) - 2) + '\u2510', term.cyan)
+    row += 1
+
+    cfg = S.model_config
+    if cfg:
+        line1 = f"stories110M  dim={cfg.get('dim', '')} hidden={cfg.get('hidden', '')} heads={cfg.get('heads', '')} seq={cfg.get('seq', '')} layers={cfg.get('layers', '')}"
+        put(row, 0, '\u2502', term.cyan)
+        put(row, 2, line1)
+        put(row, w - 1, '\u2502', term.cyan)
+        row += 1
+        p, k, t = S.params, S.kernels, S.training
+        line2 = f"{p.get('total', '?')}M params ({p.get('transformer', '?')}M xfmr + {p.get('embed', '?')}M embed)"
+        put(row, 0, '\u2502', term.cyan)
+        put(row, 2, line2)
+        put(row, w - 1, '\u2502', term.cyan)
+        row += 1
+        line3 = f"{k.get('total', '?')} kernels ({k.get('weight_bearing', '?')} wt-bearing) | Accum {t.get('accum', '?')} | Adam LR={t.get('lr', '?')}"
+        put(row, 0, '\u2502', term.cyan)
+        put(row, 2, line3)
+        put(row, w - 1, '\u2502', term.cyan)
+        row += 1
+    else:
+        put(row, 0, '\u2502', term.cyan)
+        put(row, 2, 'Waiting for model config...')
+        put(row, w - 1, '\u2502', term.cyan)
+        row += 1
+
+    remaining = h - row - 1
+    # Allocate: loss curve ~40%, logs ~30%, power/cpu/mem/gen share rest
+    power_h = max(3, remaining // 8)
+    gen_h = max(2, remaining // 10)
+    extra_panels = power_h + power_h + gen_h + 6  # power + cpu/mem + gen + dividers
+    log_h_min = max(5, remaining // 5)
+    curve_h = max(5, remaining - extra_panels - log_h_min)
+
+    # Loss Curve + Training Stats divider
+    put(row, 0, '\u251c\u2500 Loss Curve ' + '\u2500' * max(0, left_w - 13) + '\u252c\u2500 Training Stats ' + '\u2500' * max(0, right_w - 17) + '\u2524', term.cyan)
+    row += 1
+
+    # Loss curve
+    loss_vals = [l for _, l in S.loss_history]
+    curve_lines = braille_chart(loss_vals, left_w - 1, curve_h)
+    for i, cl in enumerate(curve_lines):
+        put(row + i, 0, '\u2502', term.cyan)
+        put(row + i, 1, cl, term.green)
+        put(row + i, mid_x, '\u2502', term.cyan)
+        put(row + i, w - 1, '\u2502', term.cyan)
+
+    # Training stats (right panel)
+    sr = row
+    step_str = f'{S.step}' + (f'/{S.total_steps}' if S.total_steps and S.total_steps < 999999 else '')
+    put(sr, mid_x + 1, f' Step: {step_str}  Loss: {S.loss:.4f}' if S.loss else ' Step: --', term.yellow)
+    sr += 1
+    put(sr, mid_x + 1, f' Best: {S.best_loss:.4f}   ms/step: {S.ms_per_step:.1f}' if S.best_loss < float('inf') else ' Best: --')
+    sr += 1
+    ane_tflops = S.flops.get('ane_tflops', 0)
+    ane_util = S.flops.get('ane_util', 0)
+    if ane_tflops:
+        put(sr, mid_x + 1, f' ANE: {ane_tflops:.2f}T  Compile: {S.compile_pct:.0f}%  Util: {ane_util:.1f}%')
+    else:
+        put(sr, mid_x + 1, f' Compile: {S.compile_pct:.0f}%')
+    sr += 1
+    ct = S.component_timing
+    if ct:
+        put(sr, mid_x + 1, f' ane={ct.get("ane", 0):.1f} io={ct.get("io", 0):.1f} cls={ct.get("cls", 0):.1f} elem={ct.get("elem", 0):.1f}')
+        sr += 1
+        put(sr, mid_x + 1, f' rms={ct.get("rms", 0):.1f} cblas_wait={ct.get("cblas_wait", 0):.1f} ms/step')
+        sr += 1
+    pw = S.power
+    if any(pw.values()):
+        put(sr, mid_x + 1, '\u2500 Power ' + '\u2500' * max(0, right_w - 9), term.cyan)
+        sr += 1
+        put(sr, mid_x + 1, f' ANE: {pw["ane"]:.1f}W  CPU: {pw["cpu"]:.1f}W  GPU: {pw["gpu"]:.1f}W', term.magenta)
+        sr += 1
+    if S.batch_num:
+        put(sr, mid_x + 1, f' Batch: {S.batch_num}  Compiles: {S.compiles}')
+        sr += 1
+
+    # Fill vertical borders between loss curve and stats
+    top_end = row + len(curve_lines)
+    for r in range(row, max(top_end, sr)):
+        if r >= top_end:
+            put(r, 0, '\u2502', term.cyan)
+        if r >= sr:
+            put(r, mid_x, '\u2502', term.cyan)
+        put(r, w - 1, '\u2502', term.cyan)
+    row = max(top_end, sr)
+
+    # Power charts
+    has_power = len(S.power_history_ane) > 1 or len(S.power_history_cpu) > 1
+    if has_power:
+        put(row, 0, '\u251c\u2500 ANE Power (W) ' + '\u2500' * max(0, left_w - 16) + '\u252c\u2500 CPU Power (W) ' + '\u2500' * max(0, right_w - 17) + '\u2524', term.cyan)
+        row += 1
+        ane_vals = [v for _, v in S.power_history_ane]
+        cpu_vals = [v for _, v in S.power_history_cpu]
+        ane_lines = braille_chart(ane_vals, left_w - 1, power_h, label_fmt='{:.1f}')
+        cpu_lines = braille_chart(cpu_vals, right_w - 1, power_h, label_fmt='{:.1f}')
+        max_lines = max(len(ane_lines), len(cpu_lines))
+        while len(ane_lines) < max_lines:
+            ane_lines.append(' ' * (left_w - 1))
+        while len(cpu_lines) < max_lines:
+            cpu_lines.append(' ' * (right_w - 1))
+        for i in range(max_lines):
+            put(row + i, 0, '\u2502', term.cyan)
+            put(row + i, 1, ane_lines[i], term.red)
+            put(row + i, mid_x, '\u2502', term.cyan)
+            put(row + i, mid_x + 1, cpu_lines[i], term.blue)
+            put(row + i, w - 1, '\u2502', term.cyan)
+        row += max_lines
+
+    # CPU / Memory charts
+    has_sysmetrics = len(S.cpu_pct_history) > 0
+    if has_sysmetrics:
+        put(row, 0, '\u251c\u2500 CPU % ' + '\u2500' * max(0, left_w - 8) + '\u252c\u2500 Memory (MB) ' + '\u2500' * max(0, right_w - 15) + '\u2524', term.cyan)
+        row += 1
+        cpu_vals = list(S.cpu_pct_history)
+        mem_vals = list(S.proc_mem_mb_history) if S.proc_mem_mb_history else list(S.mem_mb_history)
+        mem_label = 'proc' if S.proc_mem_mb_history else 'sys'
+        cpu_lines = braille_chart(cpu_vals, left_w - 1, power_h, label_fmt='{:.0f}', y_range=(0, 100))
+        mem_lines = braille_chart(mem_vals, right_w - 1, power_h, label_fmt='{:.0f}')
+        max_lines = max(len(cpu_lines), len(mem_lines))
+        while len(cpu_lines) < max_lines:
+            cpu_lines.append(' ' * (left_w - 1))
+        while len(mem_lines) < max_lines:
+            mem_lines.append(' ' * (right_w - 1))
+        for i in range(max_lines):
+            put(row + i, 0, '\u2502', term.cyan)
+            put(row + i, 1, cpu_lines[i], term.yellow)
+            put(row + i, mid_x, '\u2502', term.cyan)
+            put(row + i, mid_x + 1, mem_lines[i], term.magenta)
+            put(row + i, w - 1, '\u2502', term.cyan)
+        row += max_lines
+
+    # Generated text
+    with S.gen_lock:
+        gen_text = S.gen_text
+        gen_step = S.gen_step
+        gen_status = S.gen_status
+    if gen_text or gen_status == 'generating':
+        status_tag = ' (generating...)' if gen_status == 'generating' else f' (step {gen_step})'
+        put(row, 0, '\u251c\u2500 Generated Text' + status_tag + ' ' + '\u2500' * max(0, w - 20 - len(status_tag)) + '\u2524', term.cyan)
+        row += 1
+        if gen_text:
+            line_w = w - 3
+            text = gen_text.replace('\n', ' ')
+            wrapped = [text[i:i + line_w] for i in range(0, len(text), line_w)]
+            for i, tl in enumerate(wrapped[:gen_h]):
+                put(row, 0, '\u2502', term.cyan)
+                put(row, 2, tl, term.white)
+                put(row, w - 1, '\u2502', term.cyan)
+                row += 1
+        else:
+            put(row, 0, '\u2502', term.cyan)
+            put(row, 2, '...')
+            put(row, w - 1, '\u2502', term.cyan)
+            row += 1
+
+    # Logs
+    # Rows available between the Logs divider (at `row`) and the bottom border (at h - 1), both excluded.
+    log_h = h - row - 2
+    scroll_hint = ' (scroll) ' if not S.auto_scroll else ' '
+    put(row, 0, '\u251c\u2500 Logs' + scroll_hint + '\u2500' * max(0, w - 8 - len(scroll_hint)) + '\u2524', term.cyan)
+    row += 1
+
+    logs = list(S.logs)
+    if log_h > 0 and logs:
+        if S.auto_scroll:
+            start = max(0, len(logs) - log_h)
+        else:
+            start = max(0, min(S.log_scroll, len(logs) - log_h))
+        visible = logs[start:start + log_h]
+        for i, line in enumerate(visible):
+            put(row + i, 0, '\u2502', term.cyan)
+            if RE_STEP.search(line):
+                put(row + i, 1, line[:w - 2], term.yellow)
+            elif line.strip().startswith('[batch'):
+                put(row + i, 1, line[:w - 2], term.blue)
+            elif 'FAIL' in line or 'error' in line.lower():
+                put(row + i, 1, line[:w - 2], term.red)
+            else:
+                put(row + i, 1, line[:w - 2])
+            put(row + i, w - 1, '\u2502', term.cyan)
+        for i in range(len(visible), log_h):
+            put(row + i, 0, '\u2502', term.cyan)
+            put(row + i, w - 1, '\u2502', term.cyan)
+
+    # Bottom border
+    put(h - 1, 0, '\u2514' + '\u2500' * (w - 2) + '\u2518', term.cyan)
+
+    sys.stdout.write(''.join(buf))
+    sys.stdout.flush()
+
+
+
+def set_nonblock(fd):
+    """Put file descriptor fd into non-blocking mode, keeping its other status flags."""
+    fcntl.fcntl(fd, fcntl.F_SETFL, fcntl.fcntl(fd, fcntl.F_GETFL) | os.O_NONBLOCK)
+
+def spawn_training(resume=False, steps=10000):
+    """Run `make train_large` and then ./train_large in this script's directory via bash.
+
+    stderr is merged into stdout, which is exposed as a non-blocking pipe on the returned Popen.
+    """
+    flags = (' --resume' if resume else '') + f' --steps {steps}'
+    script_dir = os.path.dirname(os.path.abspath(__file__)) or '.'
+    proc = subprocess.Popen(['bash', '-c', 'make train_large 2>&1 && ./train_large' + flags],
+                            stdout=subprocess.PIPE, stderr=subprocess.STDOUT, cwd=script_dir)
+    set_nonblock(proc.stdout.fileno())
+    return proc
+
+def spawn_powermetrics():
+    """Launch `sudo powermetrics` (cpu_power, gpu_power, ane_power samplers, -i 1000) with a non-blocking stdout pipe; None if it cannot start."""
+    argv = ['sudo', 'powermetrics', '--samplers', 'cpu_power,gpu_power,ane_power', '-i', '1000']
+    try:
+        proc = subprocess.Popen(argv, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
+        set_nonblock(proc.stdout.fileno())
+    except (FileNotFoundError, PermissionError):
+        return None
+    return proc
+
+def main():
+    parser = argparse.ArgumentParser(description='ANE Training Dashboard (stories110M)')
+    parser.add_argument('--resume', action='store_true', help='Resume from checkpoint')
+    parser.add_argument('--infinite', action='store_true', help='Train indefinitely')
+    parser.add_argument('--no-powermetrics', action='store_true')
+    parser.add_argument('--no-generate', action='store_true', help='Disable text generation')
+    parser.add_argument('--steps', type=int, default=10000, help='Total steps (default: 10000)')
+    args = parser.parse_args()
+
+    if args.infinite:
+        args.steps = 999999999
+    S.total_steps = args.steps
+
+    term = Terminal()
+    procs = []
+
+    train_proc = spawn_training(resume=args.resume, steps=args.steps)
+    S.train_pid = train_proc.pid
+    procs.append(train_proc)
+
+    if HAS_PSUTIL:
+        psutil.cpu_percent(interval=None)  # prime the counter
+        sys_t = threading.Thread(target=sysmetrics_thread, daemon=True)
+        sys_t.start()
+
+    pm_proc = None
+    if not args.no_powermetrics:
+        pm_proc = spawn_powermetrics()
+        if pm_proc:
+            procs.append(pm_proc)
+
+    if not args.no_generate:
+        gen_t = threading.Thread(target=generation_thread, daemon=True)
+        gen_t.start()
+
+    pm_buf = ''
+    train_buf = ''
+
+    def cleanup():
+        for p in procs:
+            try:
+                p.terminate()
+            except Exception:
+                pass
+
+    signal.signal(signal.SIGINT, lambda *a: cleanup())
+    signal.signal(signal.SIGTERM, lambda *a: cleanup())
+
+    resized = [False]
+    def on_resize(*a):
+        resized[0] = True
+
+    signal.signal(signal.SIGWINCH, on_resize)
+
+    with term.fullscreen(), term.cbreak(), term.hidden_cursor():
+        draw(term)
+        last_draw = time.monotonic()
+
+        while True:
+            fds = []
+            fd_map = {}
+            if train_proc and train_proc.stdout:
+                fd = train_proc.stdout.fileno()
+                fds.append(fd)
+                fd_map[fd] = 'train'
+            if pm_proc and pm_proc.stdout:
+                fd = pm_proc.stdout.fileno()
+                fds.append(fd)
+                fd_map[fd] = 'pm'
+            fds.append(sys.stdin.fileno())
+            fd_map[sys.stdin.fileno()] = 'stdin'
+
+            try:
+                readable, _, _ = select.select(fds, [], [], 0.25)
+            except (ValueError, OSError):
+                continue
+
+            need_draw = resized[0]
+            resized[0] = False
+
+            train_finished = False
+
+            for fd in readable:
+                kind = fd_map.get(fd)
+                if kind == 'train':
+                    try:
+                        data = os.read(fd, 65536)
+                    except BlockingIOError:
+                        continue
+                    except (OSError, ValueError):
+                        data = b''
+                    if not data:
+                        if train_proc.poll() is not None:
+                            try:
+                                rest = train_proc.stdout.read()
+                                if rest:
+                                    for line in rest.decode('utf-8', errors='replace').split('\n'):
+                                        if line:
+                                            parse_line(line)
+                            except Exception:
+                                pass
+                            S.logs.append('[dashboard] Training finished. Press q to exit.')
+                            train_finished = True
+                        continue
+                    train_buf += data.decode('utf-8', errors='replace')
+                    while '\n' in train_buf:
+                        line, train_buf = train_buf.split('\n', 1)
+                        parse_line(line)
+                    need_draw = True
+
+                elif kind == 'pm':
+                    try:
+                        data = os.read(fd, 65536).decode('utf-8', errors='replace')
+                    except BlockingIOError:
+                        continue
+                    except (OSError, ValueError):
+                        data = ''
+                    if not data:
+                        continue
+                    pm_buf += data
+                    while '\n\n' in pm_buf or '*** ' in pm_buf:
+                        end = pm_buf.find('\n*** ', 1)
+                        if end < 0:
+                            end = pm_buf.find('\n\n', 1)
+                            if end < 0:
+                                break
+                        chunk = pm_buf[:end]
+                        pm_buf = pm_buf[end:]
+                        parse_powermetrics_text(chunk)
+                    if len(pm_buf) > 16384:
+                        pm_buf = pm_buf[-8192:]
+                    need_draw = True
+
+                elif kind == 'stdin':
+                    key = term.inkey(timeout=0)
+                    if not key:
+                        continue
+                    if key == 'q':
+                        cleanup()
+                        return
+                    elif key.name == 'KEY_UP':
+                        S.auto_scroll = False
+                        S.log_scroll = max(0, S.log_scroll - 1)
+                        need_draw = True
+                    elif key.name == 'KEY_DOWN':
+                        S.log_scroll += 1
+                        need_draw = True
+                    elif key == 'p':
+                        S.auto_scroll = not S.auto_scroll
+                        if S.auto_scroll:
+                            S.log_scroll = max(0, len(S.logs) - 10)
+                        need_draw = True
+                    elif key == 'r':
+                        if train_proc:
+                            train_proc.terminate()
+                            train_proc.wait()
+                        train_proc = spawn_training(resume=True, steps=args.steps)
+                        S.train_pid = train_proc.pid
+                        procs = [p for p in procs if p.poll() is None]
+                        procs.append(train_proc)
+                        S.logs.append('[dashboard] Restarted with --resume')
+                        need_draw = True
+                    elif key == 'g':
+                        with S.gen_lock:
+                            S.gen_status = 'generating'
+                            S.gen_step = S.step
+                        def force_gen():
+                            try:
+                                W = load_weights_from_ckpt(CKPT_PATH)
+                                if W:
+                                    text = generate_text(W, get_tokenizer(), max_tokens=64, temperature=0.8)
+                                    with S.gen_lock:
+                                        S.gen_text = text
+                                        S.gen_step = S.step
+                                        S.gen_status = 'done'
+                            except Exception as e:
+                                with S.gen_lock:
+                                    S.gen_text = f'[error: {e}]'
+                                    S.gen_status = 'done'
+                        threading.Thread(target=force_gen, daemon=True).start()
+                        need_draw = True
+
+            now = time.monotonic()
+            if not need_draw and now - last_draw > 1.0:
+                need_draw = True
+            if need_draw and now - last_draw > 0.066:
+                draw(term)
+                last_draw = now
+
+            if train_finished:
+                draw(term)
+                while True:
+                    key = term.inkey(timeout=1)
+                    if key == 'q':
+                        cleanup()
+                        return
+
+if __name__ == '__main__':  # run the dashboard only when executed as a script, not on import
+    main()
diff --git a/training/stories_config.h b/training/stories_config.h
new file mode 100644
index 0000000..f967974
--- /dev/null
+++ b/training/stories_config.h
@@ -0,0 +1,189 @@
+// stories_config.h — Stories110M model config and structures
+#pragma once
+#import <Foundation/Foundation.h>
+#import <objc/runtime.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <IOSurface/IOSurface.h>
+#import <mach/mach_time.h>
+#import <Accelerate/Accelerate.h>
+#include <math.h>
+#include <unistd.h>
+#include <dispatch/dispatch.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+// Stories110M config
+#define DIM 768            // model (embedding) dimension
+#define HIDDEN 2048        // FFN hidden dimension
+#define HEADS 12           // attention heads
+#define HD (DIM/HEADS)     // per-head dimension (768/12 = 64)
+#define SEQ 256            // sequence length (positions per sample)
+#define NLAYERS 12         // transformer layers
+#define VOCAB 32000        // vocabulary size
+#define ACCUM_STEPS 10     // steps per batch - presumably gradient-accumulation steps; confirm in train_large.m
+#define MAX_COMPILES 100   // compile budget before restart; enforced outside this header
+
+// Compile budget per batch (one batch = ACCUM_STEPS steps):
+//   Weight-bearing kernels, compiled for every batch, per layer:
+//     fwdAttn(1) + fwdFFN(1) + ffnBwd(1) + sdpaBwd1(1) + qkvBwd(1) = 5
+//   5 * 12 layers = 60 weight-bearing compiles per batch.
+//   sdpaBwd2 has no weights, so it is compiled once per layer (12 total).
+//   With MAX_COMPILES=100, we get 1 batch of ACCUM_STEPS before restart.
+#define KERNELS_PER_LAYER 5                                   // weight-bearing kernel types per layer
+#define TOTAL_WEIGHT_KERNELS (KERNELS_PER_LAYER * NLAYERS)    // 5 * 12 = 60
+
+// Attention score channels for SDPA backward
+#define SCORE_CH (HEADS*SEQ)
+
+// Weight sizes per layer (element counts, not bytes)
+#define WQ_SZ (DIM*DIM)        // also used for Wk and Wv
+#define WO_SZ (DIM*DIM)
+#define W1_SZ (HIDDEN*DIM)
+#define W2_SZ (DIM*HIDDEN)
+#define W3_SZ (HIDDEN*DIM)
+#define LAYER_PARAMS (4*WQ_SZ + W1_SZ + W2_SZ + W3_SZ + 2*DIM)  // Wq,Wk,Wv,Wo + W1,W2,W3 + rms_att,rms_ffn
+#define TOTAL_PARAMS (NLAYERS * LAYER_PARAMS + DIM + VOCAB*DIM)  // +rms_final+embed
+
+// Per-layer weight and optimizer state
+typedef struct {                    // float32 buffers; sizes as allocated by layer_weights_alloc()
+    float *Wq, *Wk, *Wv, *Wo;       // WQ_SZ elements each (Wo: WO_SZ)
+    float *W1, *W2, *W3;            // W1_SZ, W2_SZ, W3_SZ elements
+    float *rms_att, *rms_ffn;       // DIM elements each; presumably the RMSNorm weights before attention / FFN
+} LayerWeights;
+
+typedef struct {
+    float *m, *v;   // first / second moment estimates, as updated by adam_update()
+    size_t n;       // number of elements in m and in v
+} AdamState;
+
+typedef struct {                    // one AdamState per LayerWeights tensor, same field names
+    AdamState Wq, Wk, Wv, Wo;
+    AdamState W1, W2, W3;
+    AdamState rms_att, rms_ffn;
+} LayerAdam;
+
+// Per-layer activation buffers (saved for backward)
+typedef struct {        // float32 buffers, allocated (uninitialized) by layer_acts_alloc()
+    float *layer_in;    // [DIM, SEQ] input to this layer (for rmsnorm1 bwd)
+    float *xnorm;      // [DIM, SEQ] rmsnorm1 output
+    float *Q, *K, *V;  // [DIM, SEQ] QKV projections
+    float *attn_out;    // [DIM, SEQ] attention output (before Wo)
+    float *o_out;       // [DIM, SEQ] Wo output
+    float *x2;          // [DIM, SEQ] residual after attn
+    float *x2norm;      // [DIM, SEQ] rmsnorm2 output
+    float *h1, *h3;     // [HIDDEN, SEQ] FFN intermediates
+    float *silu_out;    // [HIDDEN, SEQ] SiLU(h1)*h3
+    float *ffn_out;     // [DIM, SEQ] FFN output
+} LayerActs;
+
+// Per-layer gradient accumulators
+typedef struct {                    // same fields and sizes as LayerWeights; zeroed by layer_grads_alloc()/layer_grads_zero()
+    float *Wq, *Wk, *Wv, *Wo;
+    float *W1, *W2, *W3;
+    float *rms_att, *rms_ffn;
+} LayerGrads;
+
+// ANE kernels per layer
+typedef struct { void *model; IOSurfaceRef ioIn, ioOut; void *request; void *tmpDir; } Kern;  // model/request/tmpDir hold CFBridgingRetain'ed objects (see compile_kern_mil_w / free_kern)
+typedef struct {
+    Kern *fwdAttn, *fwdFFN, *ffnBwd, *sdpaBwd1, *sdpaBwd2, *qkvBwd;   // one kernel per kernel type (6 types per layer)
+} LayerKernels;
+
+// Checkpoint header
+typedef struct {        // 96 bytes under the usual layout (4-byte int/float, 8-byte double)
+    int magic;          // 0x424C5A54 "BLZT"
+    int version;        // 2
+    int step, total_steps;
+    int n_layers, vocab_size, dim, hidden_dim, n_heads, seq_len;
+    float lr, loss;
+    double cum_compile, cum_train, cum_wall;   // cumulative timings; units are not visible in this header
+    int cum_steps, cum_batches;
+    int adam_t;         // presumably the Adam step counter passed to adam_update() as `t` - TODO confirm
+    int pad[3];         // alignment
+} CkptHdr;
+
+// llama2.c model file header
+typedef struct {   // seven consecutive ints, in exactly this order
+    int dim, hidden_dim, n_layers, n_heads, n_kv_heads, vocab_size, seq_len;
+} Llama2Config;
+
+// Globals
+static Class g_D, g_I, g_AR, g_AIO;        // private ANE classes, resolved by name in ane_init()
+static mach_timebase_info_data_t g_tb;     // tick->ns ratio read by tb_ms(); it is not filled in anywhere in this header
+static int g_compile_count = 0;            // successful compile+load count, incremented in compile_kern_mil_w()
+
+static void ane_init(void) {
+    // The framework is private (nothing to link against): load it at runtime, then look the classes up by name.
+    if (!dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW)) printf("  [ane_init] dlopen FAIL: %s\n", dlerror());
+    g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor"); g_I   = NSClassFromString(@"_ANEInMemoryModel");
+    g_AR = NSClassFromString(@"_ANERequest");                 g_AIO = NSClassFromString(@"_ANEIOSurfaceObject");
+    if (!g_D || !g_I || !g_AR || !g_AIO) printf("  [ane_init] ANE classes not found; kernel compilation will fail\n");  // messages to a nil Class just return nil
+}
+static double tb_ms(uint64_t t) { const double ns = (double)t * g_tb.numer / g_tb.denom; return ns / 1e6; }  // mach ticks -> milliseconds via g_tb
+
+// Alloc helpers
+static AdamState adam_alloc(size_t n) { return (AdamState){ .m=(float*)calloc(n,sizeof(float)), .v=(float*)calloc(n,sizeof(float)), .n=n }; }  // zeroed moment buffers of n floats
+static void adam_free(AdamState *s) { float *m = s->m, *v = s->v; free(m); free(v); }  // frees the two buffers only; *s is left as is
+
+static LayerWeights layer_weights_alloc(void) {
+    // malloc'ed float32 buffers (contents uninitialized); release with layer_weights_free().
+    const size_t f = sizeof(float);
+    return (LayerWeights){
+        .Wq=(float*)malloc(WQ_SZ*f), .Wk=(float*)malloc(WQ_SZ*f), .Wv=(float*)malloc(WQ_SZ*f), .Wo=(float*)malloc(WO_SZ*f),
+        .W1=(float*)malloc(W1_SZ*f), .W2=(float*)malloc(W2_SZ*f), .W3=(float*)malloc(W3_SZ*f),
+        .rms_att=(float*)malloc(DIM*f), .rms_ffn=(float*)malloc(DIM*f) };
+}
+static void layer_weights_free(LayerWeights *w) {
+    // Frees the nine weight buffers; the struct itself is not freed.
+    float *bufs[] = { w->Wq, w->Wk, w->Wv, w->Wo, w->W1, w->W2, w->W3, w->rms_att, w->rms_ffn };
+    for (size_t i = 0; i < sizeof bufs / sizeof bufs[0]; i++) free(bufs[i]);
+}
+static LayerAdam layer_adam_alloc(void) {
+    // One zeroed AdamState per weight tensor, with the element counts used by layer_weights_alloc().
+    return (LayerAdam){
+        .Wq=adam_alloc(WQ_SZ), .Wk=adam_alloc(WQ_SZ), .Wv=adam_alloc(WQ_SZ), .Wo=adam_alloc(WO_SZ),
+        .W1=adam_alloc(W1_SZ), .W2=adam_alloc(W2_SZ), .W3=adam_alloc(W3_SZ),
+        .rms_att=adam_alloc(DIM), .rms_ffn=adam_alloc(DIM) };
+}
+static void layer_adam_free(LayerAdam *a) {
+    // Releases the m/v buffers of all nine optimizer states.
+    AdamState *st[] = { &a->Wq, &a->Wk, &a->Wv, &a->Wo, &a->W1, &a->W2, &a->W3, &a->rms_att, &a->rms_ffn };
+    for (size_t i = 0; i < sizeof st / sizeof st[0]; i++) adam_free(st[i]);
+}
+static LayerActs layer_acts_alloc(void) {
+    // Uninitialized float32 buffers: SEQ*DIM elements for model-width tensors, SEQ*HIDDEN for the FFN intermediates.
+    const size_t dim_bytes = SEQ*DIM*sizeof(float), hid_bytes = SEQ*HIDDEN*sizeof(float);
+    return (LayerActs){
+        .layer_in=(float*)malloc(dim_bytes), .xnorm=(float*)malloc(dim_bytes),
+        .Q=(float*)malloc(dim_bytes), .K=(float*)malloc(dim_bytes), .V=(float*)malloc(dim_bytes),
+        .attn_out=(float*)malloc(dim_bytes), .o_out=(float*)malloc(dim_bytes),
+        .x2=(float*)malloc(dim_bytes), .x2norm=(float*)malloc(dim_bytes),
+        .h1=(float*)malloc(hid_bytes), .h3=(float*)malloc(hid_bytes),
+        .silu_out=(float*)malloc(hid_bytes), .ffn_out=(float*)malloc(dim_bytes) };
+}
+static void layer_acts_free(LayerActs *a) {
+    // Frees all thirteen activation buffers; the struct itself is not freed.
+    float *bufs[] = { a->layer_in, a->xnorm, a->Q, a->K, a->V, a->attn_out, a->o_out, a->x2, a->x2norm, a->h1, a->h3, a->silu_out, a->ffn_out };
+    for (size_t i = 0; i < sizeof bufs / sizeof bufs[0]; i++) free(bufs[i]);
+}
+static LayerGrads layer_grads_alloc(void) {
+    // calloc'ed (zeroed) float32 accumulators with the same element counts as the weights.
+    const size_t f = sizeof(float);
+    return (LayerGrads){
+        .Wq=(float*)calloc(WQ_SZ,f), .Wk=(float*)calloc(WQ_SZ,f), .Wv=(float*)calloc(WQ_SZ,f), .Wo=(float*)calloc(WO_SZ,f),
+        .W1=(float*)calloc(W1_SZ,f), .W2=(float*)calloc(W2_SZ,f), .W3=(float*)calloc(W3_SZ,f),
+        .rms_att=(float*)calloc(DIM,f), .rms_ffn=(float*)calloc(DIM,f) };
+}
+static void layer_grads_zero(LayerGrads *g) {
+    // Resets all nine accumulators to all-zero bytes (0.0f) without reallocating.
+    float *bufs[] = { g->Wq, g->Wk, g->Wv, g->Wo, g->W1, g->W2, g->W3, g->rms_att, g->rms_ffn };
+    const size_t cnt[] = { WQ_SZ, WQ_SZ, WQ_SZ, WO_SZ, W1_SZ, W2_SZ, W3_SZ, DIM, DIM };
+    for (size_t i = 0; i < sizeof cnt / sizeof cnt[0]; i++) memset(bufs[i], 0, cnt[i]*sizeof(float));
+}
+static void layer_grads_free(LayerGrads *g) {
+    // Frees the nine gradient buffers; the struct itself is not freed.
+    float *bufs[] = { g->Wq, g->Wk, g->Wv, g->Wo, g->W1, g->W2, g->W3, g->rms_att, g->rms_ffn };
+    for (size_t i = 0; i < sizeof bufs / sizeof bufs[0]; i++) free(bufs[i]);
+}
diff --git a/training/stories_cpu_ops.h b/training/stories_cpu_ops.h
new file mode 100644
index 0000000..c9f2cfa
--- /dev/null
+++ b/training/stories_cpu_ops.h
@@ -0,0 +1,129 @@
+// stories_cpu_ops.h — CPU operations: RMSNorm, cross-entropy, Adam, softmax
+#pragma once
+#include "stories_config.h"
+
+static float *g_rms_tmp = NULL;
+
+// RMSNorm over channels of a channel-first [d,S] tensor: out[i,t] = w[i] * x[i,t] / sqrt(mean_i(x[i,t]^2) + 1e-5).
+static void rmsnorm(float *out, const float *x, const float *w, int d, int S) {
+    float *ss = (float*)calloc((size_t)S * 2, sizeof(float)), *sq = ss + S;  // per-call scratch [ss | sq]; a shared buffer sized by the first caller's S could overflow
+    for (int i=0; i<d; i++) {
+        vDSP_vmul(x+i*S, 1, x+i*S, 1, sq, 1, (vDSP_Length)S);
+        vDSP_vadd(sq, 1, ss, 1, ss, 1, (vDSP_Length)S);
+    }
+    float invd = 1.0f/d, eps=1e-5f;
+    vDSP_vsmsa(ss, 1, &invd, &eps, ss, 1, (vDSP_Length)S);          // ss = mean square + eps
+    int n = S; vvrsqrtf(ss, ss, &n);                                 // ss = 1/sqrt(ss)
+    for (int i=0; i<d; i++) {
+        vDSP_vmul(x+i*S, 1, ss, 1, out+i*S, 1, (vDSP_Length)S);
+        vDSP_vsmul(out+i*S, 1, &w[i], out+i*S, 1, (vDSP_Length)S);
+    }
+    free(ss);
+}
+
+// RMSNorm backward, channel-first [d,S], for y[i,t] = w[i]*x[i,t]*r[t] with r[t] = 1/sqrt(mean_i(x[i,t]^2) + 1e-5). dx is overwritten, dw accumulated:
+//   dx[j,t] = r[t]*(w[j]*dy[j,t] - x[j,t]*dot[t]),  dot[t] = r[t]^2/d * sum_i(dy[i,t]*w[i]*x[i,t]),  dw[i] += sum_t(dy[i,t]*x[i,t]*r[t])
+static void rmsnorm_bwd(float *dx, float *dw, const float *dy, const float *x, const float *w, int d, int S) {
+    float *ss = (float*)calloc((size_t)S * 4, sizeof(float));   // per-call scratch [ss | rrms | dot | tmp]; no shared buffer sized by another caller
+    float *rrms = ss + S, *dot = rrms + S, *tmp = dot + S;
+    for (int i=0; i<d; i++) {
+        vDSP_vmul(x+i*S, 1, x+i*S, 1, tmp, 1, (vDSP_Length)S);
+        vDSP_vadd(tmp, 1, ss, 1, ss, 1, (vDSP_Length)S);
+    }
+    float invd = 1.0f/d, eps=1e-5f;
+    vDSP_vsmsa(ss, 1, &invd, &eps, ss, 1, (vDSP_Length)S);
+    int n = S; vvrsqrtf(rrms, ss, &n);
+    for (int i=0; i<d; i++) {
+        vDSP_vmul(dy+i*S, 1, x+i*S, 1, tmp, 1, (vDSP_Length)S);
+        vDSP_vsma(tmp, 1, &w[i], dot, 1, dot, 1, (vDSP_Length)S);
+    }
+    vDSP_vmul(rrms, 1, rrms, 1, ss, 1, (vDSP_Length)S);
+    vDSP_vsmul(ss, 1, &invd, ss, 1, (vDSP_Length)S);
+    vDSP_vmul(dot, 1, ss, 1, dot, 1, (vDSP_Length)S);           // dot = r^2/d * sum_i(dy*w*x)
+    for (int i=0; i<d; i++) {
+        vDSP_vmul(dy+i*S, 1, x+i*S, 1, tmp, 1, (vDSP_Length)S); // dw first: it reads dy before dx is written
+        vDSP_vmul(tmp, 1, rrms, 1, tmp, 1, (vDSP_Length)S);
+        float s; vDSP_sve(tmp, 1, &s, (vDSP_Length)S);
+        dw[i] += s;
+        vDSP_vmul(x+i*S, 1, dot, 1, tmp, 1, (vDSP_Length)S);    // tmp = x*dot
+        vDSP_vsmul(dy+i*S, 1, &w[i], dx+i*S, 1, (vDSP_Length)S);   // dx = w*dy: w scales only this term, not x*dot
+        vDSP_vsub(tmp, 1, dx+i*S, 1, dx+i*S, 1, (vDSP_Length)S);   // dx = w*dy - x*dot (vsub computes second - first)
+        vDSP_vmul(dx+i*S, 1, rrms, 1, dx+i*S, 1, (vDSP_Length)S);  // dx *= r
+    }
+    free(ss);
+}
+
+// One bias-corrected Adam step over all s->n elements; updates s->m, s->v and w in place (t is the step used for bias correction).
+static void adam_update(float *w, const float *g, AdamState *s, int t, float lr, float b1, float b2, float eps) {
+    const float corr1 = 1.0f - powf(b1, t), corr2 = 1.0f - powf(b2, t);
+    for (size_t k = 0; k < s->n; k++) {
+        s->m[k] = b1*s->m[k] + (1-b1)*g[k];
+        s->v[k] = b2*s->v[k] + (1-b2)*g[k]*g[k];
+        w[k] -= lr * (s->m[k]/corr1) / (sqrtf(s->v[k]/corr2) + eps);
+    }
+}
+
+// Cross-entropy loss + gradient for logits stored vocab-major as [VOCAB, SEQ]
+// logits[v*SEQ+t] = logit for vocab v, position t
+// targets[t] = target token id for position t
+// Returns mean CE loss, writes dlogits = (softmax(logits) - one_hot(targets)) / SEQ
+// In this layout one position's logits are strided (consecutive v are SEQ floats apart, consecutive t are adjacent),
+// so for vDSP: transpose to row-major scratch [S, V] to vectorize softmax per position, then transpose back
+static float cross_entropy_loss(float *dlogits, const float *logits, const uint16_t *targets, int V, int S) {
+    const vDSP_Length nV = (vDSP_Length)V, nS = (vDSP_Length)S;
+    const float grad_scale = 1.0f / S;          // the returned loss is a mean over positions
+    float loss_sum = 0;
+
+    // Scratch in position-major layout: work[t*V + v] = logits[v*S + t], so each position is contiguous.
+    float *work = (float*)malloc(S * V * 4);
+    vDSP_mtrans(logits, 1, work, 1, nS, nV);
+
+    for (int pos = 0; pos < S; pos++) {
+        float *p = work + pos * V;
+
+        // Softmax in place; the row maximum is subtracted before exponentiating.
+        float peak;
+        vDSP_maxv(p, 1, &peak, nV);
+        float shift = -peak;
+        vDSP_vsadd(p, 1, &shift, p, 1, nV);
+        int count = V;
+        vvexpf(p, p, &count);
+        float denom;
+        vDSP_sve(p, 1, &denom, nV);
+        float scale = 1.0f / denom;
+        vDSP_vsmul(p, 1, &scale, p, 1, nV);
+
+        // -log p(target), with 1e-10 added inside the log.
+        int target = targets[pos];
+        loss_sum -= logf(p[target] + 1e-10f);
+
+        // Gradient row: (softmax - one_hot(target)) / S
+        p[target] -= 1.0f;
+        vDSP_vsmul(p, 1, &grad_scale, p, 1, nV);
+    }
+    // Transpose the gradient back into the caller's [V, S] layout.
+    vDSP_mtrans(work, 1, dlogits, 1, nV, nS);
+    free(work);
+    return loss_sum / S;
+}
+
+// Embedding lookup: token_ids → x [DIM, SEQ] (channel-first)
+// embed is [VOCAB, DIM] row-major (vocab_size rows, dim cols)
+static void embed_lookup(float *x, const float *embed, const uint16_t *tokens, int dim, int seq) {
+    for (int pos = 0; pos < seq; pos++) {
+        // Row of the embedding table for the token at this position, scattered into column `pos` of x.
+        const float *row = embed + tokens[pos]*dim;
+        for (int c = 0; c < dim; c++)
+            x[c*seq + pos] = row[c];
+    }
+}
+
+// Embedding backward: accumulate dE[tok] += dx[:,t] for each position
+static void embed_backward(float *d_embed, const float *dx, const uint16_t *tokens, int dim, int seq) {
+    for (int pos = 0; pos < seq; pos++) {
+        // Gradient row of the token seen at this position; a token that repeats accumulates every occurrence.
+        float *grad_row = d_embed + tokens[pos]*dim;
+        for (int c = 0; c < dim; c++)
+            grad_row[c] += dx[c*seq + pos];
+    }
+}
diff --git a/training/stories_io.h b/training/stories_io.h
new file mode 100644
index 0000000..017d8a8
--- /dev/null
+++ b/training/stories_io.h
@@ -0,0 +1,134 @@
+// stories_io.h — IOSurface helpers, blob builders, NEON conversion
+#pragma once
+#include "stories_config.h"
+#include <arm_neon.h>
+
+static IOSurfaceRef make_surface(size_t bytes) {
+    NSDictionary *props = @{ (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,   // one row of `bytes` one-byte elements
+        (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes),
+        (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0 };
+    return IOSurfaceCreate((__bridge CFDictionaryRef)props);
+}
+
+static NSData *build_blob(const float *w, int rows, int cols) {
+    const int count = rows*cols, payload = count*2, total = 128 + payload;    // 128-byte header + fp16 payload
+    uint8_t *blob = (uint8_t*)calloc(total, 1);
+    blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1;
+    *(uint32_t*)(blob+72) = payload; *(uint32_t*)(blob+80) = 128;             // payload byte count, then 128 (the header length)
+    _Float16 *half = (_Float16*)(blob+128);
+    for (int k = 0; k < count; k++) half[k] = (_Float16)w[k];                 // same element order as w
+    return [NSData dataWithBytesNoCopy:blob length:total freeWhenDone:YES];   // NSData frees blob
+}
+static NSData *build_blob_t(const float *w, int rows, int cols) {
+    const int payload = cols*rows*2, total = 128 + payload;                   // 128-byte header + fp16 payload
+    uint8_t *blob = (uint8_t*)calloc(total, 1);
+    blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1;
+    *(uint32_t*)(blob+72) = payload; *(uint32_t*)(blob+80) = 128;             // payload byte count, then 128 (the header length)
+    _Float16 *half = (_Float16*)(blob+128);
+    for (int c = 0; c < cols; c++) for (int r = 0; r < rows; r++) half[c*rows + r] = (_Float16)w[r*cols + c];  // payload is w transposed: [cols, rows]
+    return [NSData dataWithBytesNoCopy:blob length:total freeWhenDone:YES];   // NSData frees blob
+}
+static NSData *build_blob_fp16(_Float16 *d, int cnt) {
+    const int payload = cnt*2, total = 128 + payload;                         // 128-byte header + cnt fp16 values copied verbatim
+    uint8_t *blob = (uint8_t*)calloc(total, 1);
+    blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1;
+    *(uint32_t*)(blob+72) = payload; *(uint32_t*)(blob+80) = 128;             // payload byte count, then 128 (the header length)
+    memcpy(blob+128, d, payload);
+    return [NSData dataWithBytesNoCopy:blob length:total freeWhenDone:YES];   // NSData frees blob
+}
+
+// NEON vectorized conversion
+static void cvt_f16_f32(float *dst, const _Float16 *src, int n) {
+    int done = 0;
+    for (; n - done >= 8; done += 8) {                              // 8 halfs per iteration -> two float32x4 stores
+        const float16x8_t v = vld1q_f16((const __fp16*)(src+done));
+        vst1q_f32(dst+done,   vcvt_f32_f16(vget_low_f16(v)));
+        vst1q_f32(dst+done+4, vcvt_f32_f16(vget_high_f16(v)));
+    }
+    for (; done < n; done++) dst[done] = (float)src[done];          // scalar tail for the last n % 8 elements
+}
+static void cvt_f32_f16(_Float16 *dst, const float *src, int n) {
+    int done = 0;
+    for (; n - done >= 8; done += 8) {                              // 8 floats per iteration -> one float16x8 store
+        const float16x4_t lo = vcvt_f16_f32(vld1q_f32(src+done));
+        const float16x4_t hi = vcvt_f16_f32(vld1q_f32(src+done+4));
+        vst1q_f16((__fp16*)(dst+done), vcombine_f16(lo, hi));
+    }
+    for (; done < n; done++) dst[done] = (_Float16)src[done];       // scalar tail for the last n % 8 elements
+}
+
+// IOSurface I/O (channel-first [C,S] layout)
+static void io_write_fp16(IOSurfaceRef s, const float *data, int channels, int sp) {   // fp32 -> fp16 into the start of surface s
+    IOSurfaceLock(s, 0, NULL);                                                          // lock result is not checked
+    cvt_f32_f16((_Float16*)IOSurfaceGetBaseAddress(s), data, channels * sp);           // writes channels*sp fp16 elements
+    IOSurfaceUnlock(s, 0, NULL);
+}
+static void io_read_fp16(IOSurfaceRef s, float *data, int ch_off, int channels, int sp) {   // fp16 surface -> fp32 `data`
+    IOSurfaceLock(s, kIOSurfaceLockReadOnly, NULL);
+    cvt_f16_f32(data, (_Float16*)IOSurfaceGetBaseAddress(s) + ch_off * sp, channels * sp);  // reads channels*sp elements starting at element ch_off*sp
+    IOSurfaceUnlock(s, kIOSurfaceLockReadOnly, NULL);
+}
+static void io_copy(IOSurfaceRef dst, int dst_ch, IOSurfaceRef src, int src_ch, int channels, int sp) {
+    IOSurfaceLock(dst, 0, NULL);
+    IOSurfaceLock(src, kIOSurfaceLockReadOnly, NULL);
+    _Float16 *to = (_Float16*)IOSurfaceGetBaseAddress(dst) + dst_ch*sp;
+    const _Float16 *from = (_Float16*)IOSurfaceGetBaseAddress(src) + src_ch*sp;
+    memcpy(to, from, channels * sp * sizeof(_Float16));   // raw fp16 copy of channels*sp elements, no conversion
+    IOSurfaceUnlock(src, kIOSurfaceLockReadOnly, NULL);
+    IOSurfaceUnlock(dst, 0, NULL);
+}
+static void io_write_fp16_at(IOSurfaceRef s, int ch_off, const float *data, int channels, int sp) {   // like io_write_fp16, at a channel offset
+    IOSurfaceLock(s, 0, NULL);
+    cvt_f32_f16((_Float16*)IOSurfaceGetBaseAddress(s) + ch_off * sp, data, channels * sp);             // destination starts at element ch_off*sp
+    IOSurfaceUnlock(s, 0, NULL);
+}
+
+// Kernel compile/eval
+// Compile + load one MIL program on the ANE and build its I/O surfaces and request. Returns a Kern (release with free_kern) or NULL after a "[compile]" message.
+static Kern *compile_kern_mil_w(NSString *mil, NSDictionary *weights, int ic_bytes, int oc_bytes) {
+    @autoreleasepool {
+    NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
+    id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, weights, nil);
+    if (!desc) { printf("  [compile] desc=NULL\n"); return NULL; }
+    id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
+    if (!mdl) { printf("  [compile] model=NULL\n"); return NULL; }   // a nil model yields a nil identifier, and the path append below would throw
+    id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
+    NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
+    NSFileManager *fm = [NSFileManager defaultManager];
+    [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];   // stage model.mil + weight blobs under td
+    [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+    for (NSString *path in weights)
+        [weights[path][@"data"] writeToFile:[td stringByAppendingPathComponent:[path stringByReplacingOccurrencesOfString:@"@model_path/" withString:@""]] atomically:YES];
+    NSError *e = nil; const char *failed = NULL;
+    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) failed = "FAIL";
+    else if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) failed = "load FAIL";
+    if (failed) {
+        printf("  [compile] %s: %s\n", failed, e ? [[e description] UTF8String] : "no error");
+        [fm removeItemAtPath:td error:nil]; return NULL;   // no Kern will own td, so remove the staged files now
+    }
+    __sync_fetch_and_add(&g_compile_count, 1);
+    Kern *k = (Kern*)calloc(1, sizeof(Kern));
+    k->model = (void*)CFBridgingRetain(mdl);
+    k->ioIn = make_surface(ic_bytes); k->ioOut = make_surface(oc_bytes);
+    id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), k->ioIn);
+    id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), k->ioOut);
+    k->request = (void*)CFBridgingRetain(((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+        @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+        @[wI], @[@0], @[wO], @[@0], nil, nil, @0));
+    k->tmpDir = (void*)CFBridgingRetain(td);   // free_kern() removes this directory
+    return k;
+    }
+}
+static void free_kern(Kern *k) {
+    if (!k) return;
+    // Unload, drop the surfaces, delete the staged directory, then release. Each handle is checked because CFRelease(NULL) crashes.
+    if (k->model) { NSError *e = nil; ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)((__bridge id)k->model, @selector(unloadWithQoS:error:), 21, &e); }
+    if (k->ioIn) CFRelease(k->ioIn); if (k->ioOut) CFRelease(k->ioOut);
+    if (k->tmpDir) [[NSFileManager defaultManager] removeItemAtPath:(__bridge id)k->tmpDir error:nil];
+    if (k->model) CFRelease(k->model); if (k->request) CFRelease(k->request); if (k->tmpDir) CFRelease(k->tmpDir);
+    free(k);
+}
+static void ane_eval(Kern *k) {
+    id mdl = (__bridge id)k->model; id req = (__bridge id)k->request; NSError *e = nil;
+    if (!((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e)) printf("  [eval] FAIL: %s\n", e ? [[e description] UTF8String] : "no error");   // report instead of discarding: the output surface is not valid after a failed eval
+}
diff --git a/training/stories_mil.h b/training/stories_mil.h
new file mode 100644
index 0000000..dccca44
--- /dev/null
+++ b/training/stories_mil.h
@@ -0,0 +1,286 @@
+// stories_mil.h — MIL program generators for ANE kernels
+// Same architecture as single-layer train_large.m but parameterized
+#pragma once
+#include "stories_io.h"
+
+// Opening text shared by every generated MIL program: the program(1.3) line,
+// the buildInfo dictionary and the opening brace. This is an NSString literal
+// (note the leading @), so call sites pass it to appendString: unchanged.
+#define MIL_HDR \
+    @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, " \
+    "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " \
+    "{\"coremltools-version\", \"9.0\"}})]\n{\n"
+// Declarations of the constants that every conv() call in the generators
+// refers to by name: pt (pad_type "valid"), st (strides [1,1]),
+// pd (pad [0,0,0,0]), dl (dilations [1,1]) and gr (groups 1).
+// This is a plain C string literal, so call sites write @CONV_CONST to make
+// it an NSString.
+#define CONV_CONST \
+    "        string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" \
+    "        tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n" \
+    "        tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n" \
+    "        tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n" \
+    "        int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
+
+// SDPA forward + taps: x_in → rmsnorm → QKV+SDPA+Wo → concat(o_out, Q, K, V, attn_out, xnorm)
+//
+// Returns the MIL program text for the attention forward kernel.
+//   Input : x, fp16 [1, DIM, 1, SEQ].
+//   Output: out, fp16 [1, 6*DIM, 1, SEQ] — channel-axis concat, in this order, of
+//           oo (Wo projection), qf/kf/vf (Q/K/V projections before the head split),
+//           af (attention result before Wo) and xn (RMS-normalised input).
+//   Weight blobs referenced (all read at offset 64): rms1.bin, wq.bin, wk.bin,
+//           wv.bin, wo.bin, mask.bin.
+// DIM, SEQ, HEADS and HD are defined outside this function.
+// Both float constants (sc, invd) are written into the program with %f,
+// i.e. six decimal places.
+static NSString *gen_sdpa_fwd_taps(void) {
+    float sc = 1.0f/sqrtf((float)HD);
+    float invd = 1.0f/(float)DIM;
+    NSMutableString *m = [NSMutableString string];
+    [m appendString:MIL_HDR];
+    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM, SEQ];
+    // RMSNorm over axis 1 (channels): xn = x * (sum(x^2)/DIM + eps)^-0.5 * rw
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<int32, [1]> rax = const()[name=string(\"rax\"), val=tensor<int32, [1]>([1])];\n"];
+    [m appendFormat:@"        bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ];
+    [m appendFormat:@"        fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", invd];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss2 = mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ];
+    [m appendFormat:@"        fp16 eps = const()[name=string(\"eps\"), val=fp16(0.00001)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ];
+    [m appendFormat:@"        fp16 nhalf = const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xr = mul(x=x,y=rrms)[name=string(\"xr\")];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,1]> rw = const()[name=string(\"rw\"), val=tensor<fp16, [1,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/rms1.bin\"), offset=uint64(64)))];\n", DIM, DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xn = mul(x=xr,y=rw)[name=string(\"xn\")];\n", DIM, SEQ];
+    // Q/K/V projections of the normalised input as 1x1 convs (Wo is declared here, used at the end)
+    [m appendString:@CONV_CONST];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wq = const()[name=string(\"Wq\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wq.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wk = const()[name=string(\"Wk\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wk.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wv = const()[name=string(\"Wv\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wv.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wo = const()[name=string(\"Wo\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wo.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> qf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wq,x=xn)[name=string(\"cq\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> kf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wk,x=xn)[name=string(\"ck\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> vf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wv,x=xn)[name=string(\"cv\")];\n", DIM,SEQ];
+    // Split channels into heads: [1,DIM,1,SEQ] -> [1,HEADS,HD,SEQ] -> (perm 0,1,3,2) [1,HEADS,SEQ,HD]
+    [m appendFormat:@"        tensor<int32, [4]> qsh = const()[name=string(\"qsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
+    [m appendString:@"        tensor<int32, [4]> pm = const()[name=string(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q4 = reshape(shape=qsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=q4)[name=string(\"tq\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k4 = reshape(shape=qsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=k4)[name=string(\"tk\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> v4 = reshape(shape=qsh,x=vf)[name=string(\"rv\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> v = transpose(perm=pm,x=v4)[name=string(\"tv\")];\n", HEADS,SEQ,HD];
+    // Scores: (q @ k^T) * 1/sqrt(HD), plus the additive mask from mask.bin, softmax over the last axis
+    [m appendString:@"        bool tx = const()[name=string(\"tx\"), val=bool(false)];\n"];
+    [m appendString:@"        bool ty = const()[name=string(\"ty\"), val=bool(true)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc1 = matmul(transpose_x=tx,transpose_y=ty,x=q,y=k)[name=string(\"mm1\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc2 = mul(x=sc1,y=scv)[name=string(\"scl\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,1,%d,%d]> cm = const()[name=string(\"cm\"), val=tensor<fp16, [1,1,%d,%d]>(BLOBFILE(path=string(\"@model_path/weights/mask.bin\"), offset=uint64(64)))];\n", SEQ,SEQ,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ms = add(x=sc2,y=cm)[name=string(\"msk\")];\n", HEADS,SEQ,SEQ];
+    [m appendString:@"        int32 sax = const()[name=string(\"sax\"), val=int32(-1)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> aw = softmax(axis=sax,x=ms)[name=string(\"sm\")];\n", HEADS,SEQ,SEQ];
+    // Weighted sum over v, fold heads back into channels ([1,DIM,1,SEQ]), then the Wo projection
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> a4 = matmul(transpose_x=tx,transpose_y=tx,x=aw,y=v)[name=string(\"mm2\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> at = transpose(perm=pm,x=a4)[name=string(\"ta\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> os = const()[name=string(\"os\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> af = reshape(shape=os,x=at)[name=string(\"ra\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> oo = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wo,x=af)[name=string(\"co\")];\n", DIM,SEQ];
+    // Pack the result and the five taps into a single output along axis 1
+    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
+    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(oo,qf,kf,vf,af,xn))[name=string(\"cat\")];\n", 6*DIM,SEQ];
+    [m appendString:@"    } -> (out);\n}\n"];
+    return m;
+}
+
+// FFN forward + taps: x2 → rmsnorm → FFN → concat(ffn_out, h1, h3, silu_out, x2norm)
+//
+// Returns the MIL program text for the feed-forward forward kernel.
+//   Input : x, fp16 [1, DIM, 1, SEQ].
+//   Output: out, fp16 [1, 2*DIM+3*HIDDEN, 1, SEQ] — channel-axis concat, in this
+//           order, of y (W2 output), h1 (W1 output), h3 (W3 output), gate and
+//           xn (RMS-normalised input).
+//   NOTE: the tap called "silu_out" above is the tensor `gate`, which is
+//           silu(h1) * h3 (the gated product), not silu(h1) on its own.
+//   Weight blobs referenced (all read at offset 64): rms2.bin, w1.bin, w3.bin, w2.bin.
+// DIM, HIDDEN and SEQ are defined outside this function.
+static NSString *gen_ffn_fwd_taps(void) {
+    float invd = 1.0f/(float)DIM;
+    NSMutableString *m = [NSMutableString string];
+    [m appendString:MIL_HDR];
+    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM, SEQ];
+    // RMSNorm over axis 1 (channels): xn = x * (sum(x^2)/DIM + eps)^-0.5 * rw
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<int32, [1]> rax = const()[name=string(\"rax\"), val=tensor<int32, [1]>([1])];\n"];
+    [m appendFormat:@"        bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ];
+    [m appendFormat:@"        fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", invd];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss2 = mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ];
+    [m appendFormat:@"        fp16 eps = const()[name=string(\"eps\"), val=fp16(0.00001)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ];
+    [m appendFormat:@"        fp16 nhalf = const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xr = mul(x=x,y=rrms)[name=string(\"xr\")];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,1]> rw = const()[name=string(\"rw\"), val=tensor<fp16, [1,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/rms2.bin\"), offset=uint64(64)))];\n", DIM, DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xn = mul(x=xr,y=rw)[name=string(\"xn\")];\n", DIM, SEQ];
+    // Weights: W1 and W3 map DIM -> HIDDEN, W2 maps HIDDEN -> DIM (all 1x1 convs)
+    [m appendString:@CONV_CONST];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W1 = const()[name=string(\"W1\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w1.bin\"), offset=uint64(64)))];\n", HIDDEN,DIM,HIDDEN,DIM];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W3 = const()[name=string(\"W3\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w3.bin\"), offset=uint64(64)))];\n", HIDDEN,DIM,HIDDEN,DIM];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W2 = const()[name=string(\"W2\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w2.bin\"), offset=uint64(64)))];\n", DIM,HIDDEN,DIM,HIDDEN];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1,x=xn)[name=string(\"c1\")];\n", HIDDEN,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3,x=xn)[name=string(\"c3\")];\n", HIDDEN,SEQ];
+    // silu(h1) = h1 * sigmoid(h1); gate = silu(h1) * h3; y = W2 applied to gate
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sig = sigmoid(x=h1)[name=string(\"sg\")];\n", HIDDEN,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> silu = mul(x=h1,y=sig)[name=string(\"si\")];\n", HIDDEN,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> gate = mul(x=silu,y=h3)[name=string(\"gt\")];\n", HIDDEN,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2,x=gate)[name=string(\"c2\")];\n", DIM,SEQ];
+    // Pack the result and the four taps into a single output along axis 1
+    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
+    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(y,h1,h3,gate,xn))[name=string(\"cat\")];\n", 2*DIM+3*HIDDEN,SEQ];
+    [m appendString:@"    } -> (out);\n}\n"];
+    return m;
+}
+
+// FFN backward: concat(dffn,h1,h3) → concat(dx,dh1,dh3)
+//
+// Returns the MIL program text for the feed-forward backward kernel.
+//   Input : x, fp16 [1, DIM+2*HIDDEN, 1, SEQ], sliced along axis 1 into
+//           dffn (DIM channels), h1 (HIDDEN) and h3 (HIDDEN).
+//   Output: out, fp16 [1, DIM+2*HIDDEN, 1, SEQ] = concat(dx, dh1, dh3).
+//   Weight blobs referenced (all read at offset 64): w2t.bin, w1t.bin, w3t.bin.
+//           Their declared shapes are the forward W2/W1/W3 shapes with the two
+//           leading dims swapped — presumably the transposed forward weights;
+//           confirm against the code that writes these blobs.
+// DIM, HIDDEN and SEQ are defined outside this function.
+static NSString *gen_ffn_bwd(void) {
+    NSMutableString *m = [NSMutableString string];
+    [m appendString:MIL_HDR];
+    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM+2*HIDDEN, SEQ];
+    [m appendString:@CONV_CONST];
+    // Unpack the three inputs: channels [0,DIM), [DIM,DIM+HIDDEN), [DIM+HIDDEN,DIM+2*HIDDEN)
+    [m appendString:@"        tensor<int32, [4]> bd = const()[name=string(\"bd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
+    [m appendFormat:@"        tensor<int32, [4]> sd = const()[name=string(\"sd\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dffn = slice_by_size(x=x,begin=bd,size=sd)[name=string(\"s0\")];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
+    [m appendFormat:@"        tensor<int32, [4]> s1 = const()[name=string(\"s1\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", HIDDEN, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h1 = slice_by_size(x=x,begin=b1,size=s1)[name=string(\"s1x\")];\n", HIDDEN, SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b3 = const()[name=string(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM+HIDDEN];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h3 = slice_by_size(x=x,begin=b3,size=s1)[name=string(\"s3x\")];\n", HIDDEN, SEQ];
+    // dsilu = W2t applied to dffn: the gradient arriving at the gated product silu(h1)*h3
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W2t = const()[name=string(\"W2t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w2t.bin\"), offset=uint64(64)))];\n", HIDDEN, DIM, HIDDEN, DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dsilu = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2t,x=dffn)[name=string(\"cw2\")];\n", HIDDEN, SEQ];
+    // dsd = sig * (1 + h1 * (1 - sig)), the derivative of h1*sigmoid(h1) with respect to h1
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sig = sigmoid(x=h1)[name=string(\"sg\")];\n", HIDDEN, SEQ];
+    [m appendString:@"        fp16 one = const()[name=string(\"one\"), val=fp16(1.0)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> oms = sub(x=one,y=sig)[name=string(\"oms\")];\n", HIDDEN, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> homs = mul(x=h1,y=oms)[name=string(\"homs\")];\n", HIDDEN, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> brk = add(x=one,y=homs)[name=string(\"brk\")];\n", HIDDEN, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dsd = mul(x=sig,y=brk)[name=string(\"dsd\")];\n", HIDDEN, SEQ];
+    // dh1 = dsilu * h3 * dsd;  dh3 = dsilu * (h1 * sig)
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> t1 = mul(x=dsilu,y=h3)[name=string(\"t1\")];\n", HIDDEN, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dh1 = mul(x=t1,y=dsd)[name=string(\"dh1\")];\n", HIDDEN, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> slh = mul(x=h1,y=sig)[name=string(\"slh\")];\n", HIDDEN, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dh3 = mul(x=dsilu,y=slh)[name=string(\"dh3\")];\n", HIDDEN, SEQ];
+    // dx = W1t applied to dh1, plus W3t applied to dh3
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W1t = const()[name=string(\"W1t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w1t.bin\"), offset=uint64(64)))];\n", DIM, HIDDEN, DIM, HIDDEN];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W3t = const()[name=string(\"W3t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w3t.bin\"), offset=uint64(64)))];\n", DIM, HIDDEN, DIM, HIDDEN];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1t,x=dh1)[name=string(\"cw1\")];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3t,x=dh3)[name=string(\"cw3\")];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx = add(x=dx1,y=dx3)[name=string(\"adx\")];\n", DIM, SEQ];
+    // Pack the three gradients into a single output along axis 1
+    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
+    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dx,dh1,dh3))[name=string(\"cat\")];\n", DIM+2*HIDDEN, SEQ];
+    [m appendString:@"    } -> (out);\n}\n"];
+    return m;
+}
+
+// QKV backward: concat(dq,dk,dv) → dx
+//
+// Returns the MIL program text for the Q/K/V projection backward kernel.
+//   Input : x, fp16 [1, 3*DIM, 1, SEQ], sliced along axis 1 into dq, dk, dv
+//           (DIM channels each, in that order).
+//   Output: out, fp16 [1, DIM, 1, SEQ] = Wqt(dq) + Wkt(dk) + Wvt(dv), each
+//           term a 1x1 conv.
+//   Weight blobs referenced (all read at offset 64): wqt.bin, wkt.bin, wvt.bin
+//           — presumably the transposed forward projections; confirm against
+//           the code that writes these blobs.
+// DIM and SEQ are defined outside this function.
+static NSString *gen_qkvb(void) {
+    NSMutableString *m = [NSMutableString string];
+    [m appendString:MIL_HDR];
+    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", 3*DIM, SEQ];
+    [m appendString:@CONV_CONST];
+    // Unpack dq/dk/dv: three DIM-channel slices starting at channels 0, DIM and 2*DIM
+    [m appendFormat:@"        tensor<int32, [4]> sz = const()[name=string(\"sz\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
+    [m appendString:@"        tensor<int32, [4]> b0 = const()[name=string(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dq = slice_by_size(x=x,begin=b0,size=sz)[name=string(\"s0\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dk = slice_by_size(x=x,begin=b1,size=sz)[name=string(\"s1\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b2 = const()[name=string(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dv = slice_by_size(x=x,begin=b2,size=sz)[name=string(\"s2\")];\n", DIM,SEQ];
+    // Push each gradient through its weight and sum the three contributions
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wqt = const()[name=string(\"Wqt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wqt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wkt = const()[name=string(\"Wkt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wkt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wvt = const()[name=string(\"Wvt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wvt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxq = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wqt,x=dq)[name=string(\"cq\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxk = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wkt,x=dk)[name=string(\"ck\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxv = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wvt,x=dv)[name=string(\"cv\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxqk = add(x=dxq,y=dxk)[name=string(\"aqk\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = add(x=dxqk,y=dxv)[name=string(\"out\")];\n", DIM,SEQ];
+    [m appendString:@"    } -> (out);\n}\n"];
+    return m;
+}
+
+// SDPA backward part 1 + Wo^T: concat(Q,K,V,dx2) → concat(dV, probs, dP)
+//
+// Returns the MIL program text for the first attention backward kernel.
+//   Input : x, fp16 [1, 4*DIM, 1, SEQ], sliced along axis 1 into qf, kf, vf
+//           and dx2f (DIM channels each, in that order).
+//   Output: out, fp16 [1, DIM+2*SCORE_CH, 1, SEQ] = concat(dvf, pf, dpf):
+//           the gradient for V, the recomputed softmax probabilities, and the
+//           gradient arriving at those probabilities.
+//   Weight blobs referenced (both read at offset 64): wot.bin, mask.bin.
+// The attention probabilities are recomputed here from q and k (same
+// scale, mask and softmax ops as the forward kernel) rather than taken as input.
+// DIM, SEQ, HEADS, HD and SCORE_CH are defined outside this function; the
+// reshape of [1,HEADS,SEQ,SEQ] into [1,SCORE_CH,1,SEQ] only type-checks if
+// SCORE_CH == HEADS*SEQ — TODO confirm against its definition.
+static NSString *gen_sdpa_bwd1(void) {
+    float sc = 1.0f/sqrtf((float)HD);
+    NSMutableString *m = [NSMutableString string];
+    [m appendString:MIL_HDR];
+    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", 4*DIM, SEQ];
+    [m appendString:@CONV_CONST];
+    // Unpack qf/kf/vf/dx2f: four DIM-channel slices starting at channels 0, DIM, 2*DIM, 3*DIM
+    [m appendFormat:@"        tensor<int32, [4]> sz = const()[name=string(\"sz\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
+    [m appendString:@"        tensor<int32, [4]> b0 = const()[name=string(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> qf = slice_by_size(x=x,begin=b0,size=sz)[name=string(\"s0\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> kf = slice_by_size(x=x,begin=b1,size=sz)[name=string(\"s1\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b2 = const()[name=string(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> vf = slice_by_size(x=x,begin=b2,size=sz)[name=string(\"s2\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b3 = const()[name=string(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 3*DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx2f = slice_by_size(x=x,begin=b3,size=sz)[name=string(\"s3\")];\n", DIM,SEQ];
+    // df = Wot applied to dx2f: the gradient arriving at the attention output (input of Wo in the forward kernel)
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wot = const()[name=string(\"Wot\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wot.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> df = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wot,x=dx2f)[name=string(\"cwo\")];\n", DIM,SEQ];
+    // Split q, k, v and df into heads: [1,DIM,1,SEQ] -> [1,HEADS,HD,SEQ] -> (perm 0,1,3,2) [1,HEADS,SEQ,HD]
+    [m appendFormat:@"        tensor<int32, [4]> rsh = const()[name=string(\"rsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
+    [m appendString:@"        tensor<int32, [4]> pm = const()[name=string(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> qr = reshape(shape=rsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=qr)[name=string(\"tq\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> kr = reshape(shape=rsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=kr)[name=string(\"tk\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> vr = reshape(shape=rsh,x=vf)[name=string(\"rv\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> v = transpose(perm=pm,x=vr)[name=string(\"tv\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dr = reshape(shape=rsh,x=df)[name=string(\"rd\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> da = transpose(perm=pm,x=dr)[name=string(\"td\")];\n", HEADS,SEQ,HD];
+    // Recompute probs = softmax((q @ k^T) * 1/sqrt(HD) + mask) over the last axis
+    [m appendString:@"        bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"];
+    [m appendString:@"        bool bT = const()[name=string(\"bT\"), val=bool(true)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc1 = matmul(transpose_x=bF,transpose_y=bT,x=q,y=k)[name=string(\"mm1\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc2 = mul(x=sc1,y=scv)[name=string(\"scl\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,1,%d,%d]> cm = const()[name=string(\"cm\"), val=tensor<fp16, [1,1,%d,%d]>(BLOBFILE(path=string(\"@model_path/weights/mask.bin\"), offset=uint64(64)))];\n", SEQ,SEQ,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ms = add(x=sc2,y=cm)[name=string(\"msk\")];\n", HEADS,SEQ,SEQ];
+    [m appendString:@"        int32 sax = const()[name=string(\"sax\"), val=int32(-1)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> probs = softmax(axis=sax,x=ms)[name=string(\"sm\")];\n", HEADS,SEQ,SEQ];
+    // dv4 = probs^T @ da  (gradient for v);  dp4 = da @ v^T  (gradient arriving at probs)
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dv4 = matmul(transpose_x=bT,transpose_y=bF,x=probs,y=da)[name=string(\"dv\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dp4 = matmul(transpose_x=bF,transpose_y=bT,x=da,y=v)[name=string(\"dp\")];\n", HEADS,SEQ,SEQ];
+    // Flatten everything back to the [1,C,1,SEQ] layout and pack along axis 1
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dvt = transpose(perm=pm,x=dv4)[name=string(\"dvt\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> dvs = const()[name=string(\"dvs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dvf = reshape(shape=dvs,x=dvt)[name=string(\"dvf\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> scs = const()[name=string(\"scs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", SCORE_CH,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> pf = reshape(shape=scs,x=probs)[name=string(\"pf\")];\n", SCORE_CH,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dpf = reshape(shape=scs,x=dp4)[name=string(\"dpf\")];\n", SCORE_CH,SEQ];
+    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
+    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dvf,pf,dpf))[name=string(\"cat\")];\n", DIM+2*SCORE_CH,SEQ];
+    [m appendString:@"    } -> (out);\n}\n"];
+    return m;
+}
+
+// SDPA backward part 2: concat(probs,dp,Q,K) → concat(dQ,dK)
+//
+// Returns the MIL program text for the second attention backward kernel.
+//   Input : x, fp16 [1, 2*SCORE_CH+2*DIM, 1, SEQ], sliced along axis 1 into
+//           pf and dpf (SCORE_CH channels each) followed by qf and kf
+//           (DIM channels each).
+//   Output: out, fp16 [1, 2*DIM, 1, SEQ] = concat(dqf, dkf).
+//   This program contains no BLOBFILE references, i.e. it reads no weight blobs.
+// DIM, SEQ, HEADS, HD and SCORE_CH are defined outside this function; the
+// reshape of [1,SCORE_CH,1,SEQ] into [1,HEADS,SEQ,SEQ] only type-checks if
+// SCORE_CH == HEADS*SEQ — TODO confirm against its definition.
+static NSString *gen_sdpa_bwd2(void) {
+    float sc = 1.0f/sqrtf((float)HD);
+    int bwd2_in = 2*SCORE_CH + 2*DIM;
+    NSMutableString *m = [NSMutableString string];
+    [m appendString:MIL_HDR];
+    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", bwd2_in, SEQ];
+    // Unpack pf/dpf (SCORE_CH channels each) and qf/kf (DIM channels each)
+    [m appendFormat:@"        tensor<int32, [4]> sz_sc = const()[name=string(\"szsc\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", SCORE_CH, SEQ];
+    [m appendString:@"        tensor<int32, [4]> b0 = const()[name=string(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> pf = slice_by_size(x=x,begin=b0,size=sz_sc)[name=string(\"s0\")];\n", SCORE_CH,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", SCORE_CH];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dpf = slice_by_size(x=x,begin=b1,size=sz_sc)[name=string(\"s1\")];\n", SCORE_CH,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> sz_d = const()[name=string(\"szd\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b2 = const()[name=string(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*SCORE_CH];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> qf = slice_by_size(x=x,begin=b2,size=sz_d)[name=string(\"s2\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b3 = const()[name=string(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*SCORE_CH+DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> kf = slice_by_size(x=x,begin=b3,size=sz_d)[name=string(\"s3\")];\n", DIM,SEQ];
+    // Restore the per-head layouts: probs/dp to [1,HEADS,SEQ,SEQ], q/k to [1,HEADS,SEQ,HD]
+    [m appendFormat:@"        tensor<int32, [4]> ssh = const()[name=string(\"ssh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> probs = reshape(shape=ssh,x=pf)[name=string(\"rp\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dp = reshape(shape=ssh,x=dpf)[name=string(\"rdp\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> rsh = const()[name=string(\"rsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
+    [m appendString:@"        tensor<int32, [4]> pm = const()[name=string(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> qr = reshape(shape=rsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=qr)[name=string(\"tq\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> kr = reshape(shape=rsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=kr)[name=string(\"tk\")];\n", HEADS,SEQ,HD];
+    // Softmax backward: ds = probs * (dp - sum_lastaxis(probs * dp)), then scaled by 1/sqrt(HD)
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> pdp = mul(x=probs,y=dp)[name=string(\"pdp\")];\n", HEADS,SEQ,SEQ];
+    [m appendString:@"        tensor<int32, [1]> rax = const()[name=string(\"rax\"), val=tensor<int32, [1]>([-1])];\n"];
+    [m appendString:@"        bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,1]> spdp = reduce_sum(x=pdp,axes=rax,keep_dims=kd)[name=string(\"rs\")];\n", HEADS,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dps = sub(x=dp,y=spdp)[name=string(\"dps\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ds0 = mul(x=probs,y=dps)[name=string(\"ds0\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ds = mul(x=ds0,y=scv)[name=string(\"ds\")];\n", HEADS,SEQ,SEQ];
+    // dq4 = ds @ k;  dk4 = ds^T @ q
+    [m appendString:@"        bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"];
+    [m appendString:@"        bool bT = const()[name=string(\"bT\"), val=bool(true)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dq4 = matmul(transpose_x=bF,transpose_y=bF,x=ds,y=k)[name=string(\"dq\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dk4 = matmul(transpose_x=bT,transpose_y=bF,x=ds,y=q)[name=string(\"dk\")];\n", HEADS,SEQ,HD];
+    // Fold heads back into channels ([1,DIM,1,SEQ]) and pack dQ, dK along axis 1
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dqt = transpose(perm=pm,x=dq4)[name=string(\"dqt\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dkt = transpose(perm=pm,x=dk4)[name=string(\"dkt\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> fs = const()[name=string(\"fs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dqf = reshape(shape=fs,x=dqt)[name=string(\"dqf\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dkf = reshape(shape=fs,x=dkt)[name=string(\"dkf\")];\n", DIM,SEQ];
+    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
+    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dqf,dkf))[name=string(\"cat\")];\n", 2*DIM,SEQ];
+    [m appendString:@"    } -> (out);\n}\n"];
+    return m;
+}
+
+// Causal attention mask as a weight blob, built on first use and then cached
+// in g_mask_blob. The table is SEQ x SEQ fp16, row-major: entry (row, col) is
+// 0 when col <= row and -65504 (the most negative finite fp16 value) when
+// col > row, so adding it to the scores suppresses every later position.
+static NSData *g_mask_blob = nil;
+static NSData *get_mask_blob(void) {
+    // Fast path: the blob was already built by an earlier call.
+    if (g_mask_blob) return g_mask_blob;
+
+    _Float16 *cells = (_Float16*)calloc(SEQ*SEQ, sizeof(_Float16));
+    for (int row = 0; row < SEQ; row++) {
+        _Float16 *line = cells + row*SEQ;
+        for (int col = 0; col < SEQ; col++) {
+            if (col > row) line[col] = (_Float16)(-65504.0f);
+            else           line[col] = (_Float16)0.0f;
+        }
+    }
+    // build_blob_fp16 produces the NSData; the scratch table is freed right after.
+    g_mask_blob = build_blob_fp16(cells, SEQ*SEQ);
+    free(cells);
+    return g_mask_blob;
+}
diff --git a/training/tokenize.py b/training/tokenize.py
new file mode 100644
index 0000000..219cb21
--- /dev/null
+++ b/training/tokenize.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+"""Extract pretokenized TinyStories data from zip.
+Data format: flat uint16 token IDs (llama2.c BPE, 32K vocab).
+Source: ~/tiny_stories_data_pretokenized.zip"""
+
+import os, struct, zipfile
+from pathlib import Path
+
+# Input archive, looked up in the invoking user's home directory.
+ZIP_PATH = os.path.expanduser('~/tiny_stories_data_pretokenized.zip')
+# Output file, written next to this script regardless of the current directory.
+OUTPUT_PATH = str(Path(__file__).resolve().parent / 'tinystories_data00.bin')
+
+def main():
+    if os.path.exists(OUTPUT_PATH):
+        n = os.path.getsize(OUTPUT_PATH) // 2
+        print(f"{OUTPUT_PATH} already exists ({n} tokens, {os.path.getsize(OUTPUT_PATH)/1e6:.1f} MB)")
+        return
+
+    print(f"Extracting data00.bin from {ZIP_PATH}...")
+    with zipfile.ZipFile(ZIP_PATH, 'r') as z:
+        with z.open('data00.bin') as src, open(OUTPUT_PATH, 'wb') as dst:
+            while True:
+                chunk = src.read(1 << 20)
+                if not chunk:
+                    break
+                dst.write(chunk)
+
+    n = os.path.getsize(OUTPUT_PATH) // 2
+    print(f"Written {OUTPUT_PATH} ({n} tokens, {os.path.getsize(OUTPUT_PATH)/1e6:.1f} MB)")
+
+    # Sanity check
+    with open(OUTPUT_PATH, 'rb') as f:
+        tokens = struct.unpack('<10H', f.read(20))
+        print(f"First 10 tokens: {tokens}")
+
+if __name__ == '__main__':
+    main()
diff --git a/training/train_large.m b/training/train_large.m
index 55c1cf8..ee7e860 100644
--- a/training/train_large.m
+++ b/training/train_large.m
@@ -1,717 +1,311 @@
-// train_large.m — Train a single transformer layer FULLY on ANE
-// 7 ANE kernels per step:
-//   Forward:  kFwdAttn (QKV+SDPA+Wo, taps Q,K,V,attn_out) + kFwdFFN (W1+W3+SiLU+W2, taps h1,h3,silu_out)
-//   Backward: kFFNBwd (W2^T+SiLU_bwd+W1^T+W3^T) + kSdpaBwd1 (Wo^T+SDPA) + kSdpaBwd2 + kQKVb (Wq^T+Wk^T+Wv^T)
-// CPU: RMSNorm (fwd+bwd), residuals, loss, dW accumulation (cblas), SGD update
-// NO CPU recompute of Q,K,V,h1,h3 — all exposed via forward taps
-#import <Foundation/Foundation.h>
-#import <objc/runtime.h>
-#import <objc/message.h>
-#import <dlfcn.h>
-#import <IOSurface/IOSurface.h>
-#import <mach/mach_time.h>
-#import <Accelerate/Accelerate.h>
-#include <math.h>
-#include <unistd.h>
-#include <dispatch/dispatch.h>
+// train_large.m — Train stories110M (12 layers, 768dim, 2048hidden) on ANE
+// Uses pretokenized TinyStories data with cross-entropy loss
+// 5 weight-bearing ANE kernels per layer × 12 layers = 60 per compile batch
+#include "stories_io.h"
+#include "stories_mil.h"
+#include "stories_cpu_ops.h"
 
-#define DIM 768
-#define HIDDEN 2048
-#define HEADS 12
-#define HD (DIM/HEADS)
-#define SEQ 512
-#define ACCUM_STEPS 100
-#define MAX_COMPILES 100
-#define NUM_KERNELS 6
-#define CKPT_PATH "/tmp/ane_large_ckpt.bin"
+#define CKPT_PATH "ane_stories110M_ckpt.bin"  // checkpoint file; relative path
+#define MODEL_PATH "../../assets/models/stories110M.bin"  // presumably the pretrained stories110M weights — TODO confirm against the loader
+#define DATA_PATH "tinystories_data00.bin"  // same file name that tokenize.py writes next to itself
 
-static Class g_D, g_I, g_AR, g_AIO;
-static mach_timebase_info_data_t g_tb;
-static int g_compile_count = 0;
-
-static void ane_init(void) {
-    dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
-    g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
-    g_I  = NSClassFromString(@"_ANEInMemoryModel");
-    g_AR = NSClassFromString(@"_ANERequest");
-    g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
-}
-static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }
-static IOSurfaceRef make_surface(size_t bytes) {
-    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
-        (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,
-        (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes),
-        (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0});
-}
-static NSData *build_blob(const float *w, int rows, int cols) {
-    int ws=rows*cols*2, tot=128+ws;
-    uint8_t *b=(uint8_t*)calloc(tot,1);
-    b[0]=1;b[4]=2;b[64]=0xEF;b[65]=0xBE;b[66]=0xAD;b[67]=0xDE;b[68]=1;
-    *(uint32_t*)(b+72)=ws;*(uint32_t*)(b+80)=128;
-    _Float16 *fp16=(_Float16*)(b+128);
-    for(int i=0;i<rows*cols;i++) fp16[i]=(_Float16)w[i];
-    return [NSData dataWithBytesNoCopy:b length:tot freeWhenDone:YES];
-}
-static NSData *build_blob_t(const float *w, int rows, int cols) {
-    int ws=cols*rows*2, tot=128+ws;
-    uint8_t *b=(uint8_t*)calloc(tot,1);
-    b[0]=1;b[4]=2;b[64]=0xEF;b[65]=0xBE;b[66]=0xAD;b[67]=0xDE;b[68]=1;
-    *(uint32_t*)(b+72)=ws;*(uint32_t*)(b+80)=128;
-    _Float16 *fp16=(_Float16*)(b+128);
-    for(int i=0;i<rows;i++) for(int j=0;j<cols;j++) fp16[j*rows+i]=(_Float16)w[i*cols+j];
-    return [NSData dataWithBytesNoCopy:b length:tot freeWhenDone:YES];
-}
-static NSData *build_blob_fp16(_Float16 *d, int cnt) {
-    int ws=cnt*2, tot=128+ws;
-    uint8_t *b=(uint8_t*)calloc(tot,1);
-    b[0]=1;b[4]=2;b[64]=0xEF;b[65]=0xBE;b[66]=0xAD;b[67]=0xDE;b[68]=1;
-    *(uint32_t*)(b+72)=ws;*(uint32_t*)(b+80)=128;
-    memcpy(b+128,d,ws);
-    return [NSData dataWithBytesNoCopy:b length:tot freeWhenDone:YES];
-}
-
-// ===== MIL generators =====
-
-#define MIL_HDR \
-    @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, " \
-    "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " \
-    "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-#define CONV_CONST \
-    "        string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" \
-    "        tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n" \
-    "        tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n" \
-    "        tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n" \
-    "        int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
-
-// SDPA forward + taps: x_in → rmsnorm → QKV+SDPA+Wo → concat(o_out, Q, K, V, attn_out, xnorm) fp16
-static NSString *gen_sdpa_fwd_taps(void) {
-    float sc = 1.0f/sqrtf((float)HD);
-    float invd = 1.0f/(float)DIM;
-    NSMutableString *m = [NSMutableString string];
-    [m appendString:MIL_HDR];
-    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM, SEQ];
-    // --- RMSNorm: x → xn ---
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<int32, [1]> rax = const()[name=string(\"rax\"), val=tensor<int32, [1]>([1])];\n"];
-    [m appendFormat:@"        bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ];
-    [m appendFormat:@"        fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", invd];
-    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss2 = mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ];
-    [m appendFormat:@"        fp16 eps = const()[name=string(\"eps\"), val=fp16(0.00001)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ];
-    [m appendFormat:@"        fp16 nhalf = const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xr = mul(x=x,y=rrms)[name=string(\"xr\")];\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,1]> rw = const()[name=string(\"rw\"), val=tensor<fp16, [1,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/rms1.bin\"), offset=uint64(64)))];\n", DIM, DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xn = mul(x=xr,y=rw)[name=string(\"xn\")];\n", DIM, SEQ];
-    // --- QKV + SDPA + Wo (operates on xn) ---
-    [m appendString:@CONV_CONST];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wq = const()[name=string(\"Wq\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wq.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wk = const()[name=string(\"Wk\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wk.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wv = const()[name=string(\"Wv\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wv.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wo = const()[name=string(\"Wo\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wo.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> qf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wq,x=xn)[name=string(\"cq\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> kf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wk,x=xn)[name=string(\"ck\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> vf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wv,x=xn)[name=string(\"cv\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> qsh = const()[name=string(\"qsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
-    [m appendString:@"        tensor<int32, [4]> pm = const()[name=string(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q4 = reshape(shape=qsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=q4)[name=string(\"tq\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k4 = reshape(shape=qsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=k4)[name=string(\"tk\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> v4 = reshape(shape=qsh,x=vf)[name=string(\"rv\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> v = transpose(perm=pm,x=v4)[name=string(\"tv\")];\n", HEADS,SEQ,HD];
-    [m appendString:@"        bool tx = const()[name=string(\"tx\"), val=bool(false)];\n"];
-    [m appendString:@"        bool ty = const()[name=string(\"ty\"), val=bool(true)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc1 = matmul(transpose_x=tx,transpose_y=ty,x=q,y=k)[name=string(\"mm1\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc2 = mul(x=sc1,y=scv)[name=string(\"scl\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,1,%d,%d]> cm = const()[name=string(\"cm\"), val=tensor<fp16, [1,1,%d,%d]>(BLOBFILE(path=string(\"@model_path/weights/mask.bin\"), offset=uint64(64)))];\n", SEQ,SEQ,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ms = add(x=sc2,y=cm)[name=string(\"msk\")];\n", HEADS,SEQ,SEQ];
-    [m appendString:@"        int32 sax = const()[name=string(\"sax\"), val=int32(-1)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> aw = softmax(axis=sax,x=ms)[name=string(\"sm\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> a4 = matmul(transpose_x=tx,transpose_y=tx,x=aw,y=v)[name=string(\"mm2\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> at = transpose(perm=pm,x=a4)[name=string(\"ta\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> os = const()[name=string(\"os\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> af = reshape(shape=os,x=at)[name=string(\"ra\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> oo = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wo,x=af)[name=string(\"co\")];\n", DIM,SEQ];
-    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
-    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(oo,qf,kf,vf,af,xn))[name=string(\"cat\")];\n", 6*DIM,SEQ];
-    [m appendString:@"    } -> (out);\n}\n"];
-    return m;
-}
-
-// FFN forward + taps: x2 → rmsnorm → FFN → concat(ffn_out, h1, h3, silu_out, x2norm) fp16
-static NSString *gen_ffn_fwd_taps(void) {
-    float invd = 1.0f/(float)DIM;
-    NSMutableString *m = [NSMutableString string];
-    [m appendString:MIL_HDR];
-    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM, SEQ];
-    // --- RMSNorm: x → xn ---
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<int32, [1]> rax = const()[name=string(\"rax\"), val=tensor<int32, [1]>([1])];\n"];
-    [m appendFormat:@"        bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ];
-    [m appendFormat:@"        fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", invd];
-    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss2 = mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ];
-    [m appendFormat:@"        fp16 eps = const()[name=string(\"eps\"), val=fp16(0.00001)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ];
-    [m appendFormat:@"        fp16 nhalf = const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xr = mul(x=x,y=rrms)[name=string(\"xr\")];\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,1]> rw = const()[name=string(\"rw\"), val=tensor<fp16, [1,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/rms2.bin\"), offset=uint64(64)))];\n", DIM, DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xn = mul(x=xr,y=rw)[name=string(\"xn\")];\n", DIM, SEQ];
-    // --- FFN (operates on xn) ---
-    [m appendString:@CONV_CONST];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W1 = const()[name=string(\"W1\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w1.bin\"), offset=uint64(64)))];\n", HIDDEN,DIM,HIDDEN,DIM];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W3 = const()[name=string(\"W3\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w3.bin\"), offset=uint64(64)))];\n", HIDDEN,DIM,HIDDEN,DIM];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W2 = const()[name=string(\"W2\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w2.bin\"), offset=uint64(64)))];\n", DIM,HIDDEN,DIM,HIDDEN];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1,x=xn)[name=string(\"c1\")];\n", HIDDEN,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3,x=xn)[name=string(\"c3\")];\n", HIDDEN,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sig = sigmoid(x=h1)[name=string(\"sg\")];\n", HIDDEN,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> silu = mul(x=h1,y=sig)[name=string(\"si\")];\n", HIDDEN,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> gate = mul(x=silu,y=h3)[name=string(\"gt\")];\n", HIDDEN,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2,x=gate)[name=string(\"c2\")];\n", DIM,SEQ];
-    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
-    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(y,h1,h3,gate,xn))[name=string(\"cat\")];\n", 2*DIM+3*HIDDEN,SEQ];
-    [m appendString:@"    } -> (out);\n}\n"];
-    return m;
-}
-
-// Fused FFN backward: concat(dffn,h1,h3) → concat(dx,dh1,dh3) fp16
-static NSString *gen_ffn_bwd(void) {
-    NSMutableString *m = [NSMutableString string];
-    [m appendString:MIL_HDR];
-    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM+2*HIDDEN, SEQ];
-    [m appendString:@CONV_CONST];
-    [m appendString:@"        tensor<int32, [4]> bd = const()[name=string(\"bd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
-    [m appendFormat:@"        tensor<int32, [4]> sd = const()[name=string(\"sd\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dffn = slice_by_size(x=x,begin=bd,size=sd)[name=string(\"s0\")];\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
-    [m appendFormat:@"        tensor<int32, [4]> s1 = const()[name=string(\"s1\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h1 = slice_by_size(x=x,begin=b1,size=s1)[name=string(\"s1x\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b3 = const()[name=string(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM+HIDDEN];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h3 = slice_by_size(x=x,begin=b3,size=s1)[name=string(\"s3x\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W2t = const()[name=string(\"W2t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w2t.bin\"), offset=uint64(64)))];\n", HIDDEN, DIM, HIDDEN, DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dsilu = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2t,x=dffn)[name=string(\"cw2\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sig = sigmoid(x=h1)[name=string(\"sg\")];\n", HIDDEN, SEQ];
-    [m appendString:@"        fp16 one = const()[name=string(\"one\"), val=fp16(1.0)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> oms = sub(x=one,y=sig)[name=string(\"oms\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> homs = mul(x=h1,y=oms)[name=string(\"homs\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> brk = add(x=one,y=homs)[name=string(\"brk\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dsd = mul(x=sig,y=brk)[name=string(\"dsd\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> t1 = mul(x=dsilu,y=h3)[name=string(\"t1\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dh1 = mul(x=t1,y=dsd)[name=string(\"dh1\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> slh = mul(x=h1,y=sig)[name=string(\"slh\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dh3 = mul(x=dsilu,y=slh)[name=string(\"dh3\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W1t = const()[name=string(\"W1t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w1t.bin\"), offset=uint64(64)))];\n", DIM, HIDDEN, DIM, HIDDEN];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W3t = const()[name=string(\"W3t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w3t.bin\"), offset=uint64(64)))];\n", DIM, HIDDEN, DIM, HIDDEN];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1t,x=dh1)[name=string(\"cw1\")];\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3t,x=dh3)[name=string(\"cw3\")];\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx = add(x=dx1,y=dx3)[name=string(\"adx\")];\n", DIM, SEQ];
-    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
-    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dx,dh1,dh3))[name=string(\"cat\")];\n", DIM+2*HIDDEN, SEQ];
-    [m appendString:@"    } -> (out);\n}\n"];
-    return m;
-}
-
-// Fused QKV backward: concat(dq,dk,dv) → dx fp16
-static NSString *gen_qkvb(void) {
-    NSMutableString *m = [NSMutableString string];
-    [m appendString:MIL_HDR];
-    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", 3*DIM, SEQ];
-    [m appendString:@CONV_CONST];
-    [m appendFormat:@"        tensor<int32, [4]> sz = const()[name=string(\"sz\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
-    [m appendString:@"        tensor<int32, [4]> b0 = const()[name=string(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dq = slice_by_size(x=x,begin=b0,size=sz)[name=string(\"s0\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dk = slice_by_size(x=x,begin=b1,size=sz)[name=string(\"s1\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b2 = const()[name=string(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dv = slice_by_size(x=x,begin=b2,size=sz)[name=string(\"s2\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wqt = const()[name=string(\"Wqt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wqt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wkt = const()[name=string(\"Wkt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wkt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wvt = const()[name=string(\"Wvt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wvt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxq = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wqt,x=dq)[name=string(\"cq\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxk = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wkt,x=dk)[name=string(\"ck\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxv = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wvt,x=dv)[name=string(\"cv\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxqk = add(x=dxq,y=dxk)[name=string(\"aqk\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = add(x=dxqk,y=dxv)[name=string(\"out\")];\n", DIM,SEQ];
-    [m appendString:@"    } -> (out);\n}\n"];
-    return m;
-}
-
-// SDPA backward part 1 + Wo^T: concat(Q,K,V,dx2) → Wo^T(dx2) → concat(dV, probs_flat, dp_flat) fp16
-// SCORE_CH: channels needed for flattened attention scores [HEADS,SEQ,SEQ] → [HEADS*SEQ, SEQ]
-#define SCORE_CH (HEADS*SEQ)
-
-static NSString *gen_sdpa_bwd1(void) {
-    float sc = 1.0f/sqrtf((float)HD);
-    NSMutableString *m = [NSMutableString string];
-    [m appendString:MIL_HDR];
-    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", 4*DIM, SEQ];
-    [m appendString:@CONV_CONST];
-    [m appendFormat:@"        tensor<int32, [4]> sz = const()[name=string(\"sz\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
-    [m appendString:@"        tensor<int32, [4]> b0 = const()[name=string(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> qf = slice_by_size(x=x,begin=b0,size=sz)[name=string(\"s0\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> kf = slice_by_size(x=x,begin=b1,size=sz)[name=string(\"s1\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b2 = const()[name=string(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> vf = slice_by_size(x=x,begin=b2,size=sz)[name=string(\"s2\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b3 = const()[name=string(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 3*DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx2f = slice_by_size(x=x,begin=b3,size=sz)[name=string(\"s3\")];\n", DIM,SEQ];
-    // Wo^T backward: dx2 → dattn
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wot = const()[name=string(\"Wot\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wot.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> df = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wot,x=dx2f)[name=string(\"cwo\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> rsh = const()[name=string(\"rsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
-    [m appendString:@"        tensor<int32, [4]> pm = const()[name=string(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> qr = reshape(shape=rsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=qr)[name=string(\"tq\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> kr = reshape(shape=rsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=kr)[name=string(\"tk\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> vr = reshape(shape=rsh,x=vf)[name=string(\"rv\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> v = transpose(perm=pm,x=vr)[name=string(\"tv\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dr = reshape(shape=rsh,x=df)[name=string(\"rd\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> da = transpose(perm=pm,x=dr)[name=string(\"td\")];\n", HEADS,SEQ,HD];
-    [m appendString:@"        bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"];
-    [m appendString:@"        bool bT = const()[name=string(\"bT\"), val=bool(true)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc1 = matmul(transpose_x=bF,transpose_y=bT,x=q,y=k)[name=string(\"mm1\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc2 = mul(x=sc1,y=scv)[name=string(\"scl\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,1,%d,%d]> cm = const()[name=string(\"cm\"), val=tensor<fp16, [1,1,%d,%d]>(BLOBFILE(path=string(\"@model_path/weights/mask.bin\"), offset=uint64(64)))];\n", SEQ,SEQ,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ms = add(x=sc2,y=cm)[name=string(\"msk\")];\n", HEADS,SEQ,SEQ];
-    [m appendString:@"        int32 sax = const()[name=string(\"sax\"), val=int32(-1)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> probs = softmax(axis=sax,x=ms)[name=string(\"sm\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dv4 = matmul(transpose_x=bT,transpose_y=bF,x=probs,y=da)[name=string(\"dv\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dp4 = matmul(transpose_x=bF,transpose_y=bT,x=da,y=v)[name=string(\"dp\")];\n", HEADS,SEQ,SEQ];
-    // Flatten dv back to [1,DIM,1,SEQ]
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dvt = transpose(perm=pm,x=dv4)[name=string(\"dvt\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> dvs = const()[name=string(\"dvs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dvf = reshape(shape=dvs,x=dvt)[name=string(\"dvf\")];\n", DIM,SEQ];
-    // Flatten probs [1,H,S,S] → [1,H*S,1,S] and dp [1,H,S,S] → [1,H*S,1,S]
-    [m appendFormat:@"        tensor<int32, [4]> scs = const()[name=string(\"scs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", SCORE_CH,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> pf = reshape(shape=scs,x=probs)[name=string(\"pf\")];\n", SCORE_CH,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dpf = reshape(shape=scs,x=dp4)[name=string(\"dpf\")];\n", SCORE_CH,SEQ];
-    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
-    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dvf,pf,dpf))[name=string(\"cat\")];\n", DIM+2*SCORE_CH,SEQ];
-    [m appendString:@"    } -> (out);\n}\n"];
-    return m;
-}
-
-// SDPA backward part 2: concat(probs[SCORE_CH],dp[SCORE_CH],Q[DIM],K[DIM]) → concat(dQ,dK) fp16
-static NSString *gen_sdpa_bwd2(void) {
-    float sc = 1.0f/sqrtf((float)HD);
-    int bwd2_in = 2*SCORE_CH + 2*DIM;
-    NSMutableString *m = [NSMutableString string];
-    [m appendString:MIL_HDR];
-    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", bwd2_in, SEQ];
-    // Slice probs
-    [m appendFormat:@"        tensor<int32, [4]> sz_sc = const()[name=string(\"szsc\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", SCORE_CH, SEQ];
-    [m appendString:@"        tensor<int32, [4]> b0 = const()[name=string(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> pf = slice_by_size(x=x,begin=b0,size=sz_sc)[name=string(\"s0\")];\n", SCORE_CH,SEQ];
-    // Slice dp
-    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", SCORE_CH];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dpf = slice_by_size(x=x,begin=b1,size=sz_sc)[name=string(\"s1\")];\n", SCORE_CH,SEQ];
-    // Slice Q
-    [m appendFormat:@"        tensor<int32, [4]> sz_d = const()[name=string(\"szd\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b2 = const()[name=string(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*SCORE_CH];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> qf = slice_by_size(x=x,begin=b2,size=sz_d)[name=string(\"s2\")];\n", DIM,SEQ];
-    // Slice K
-    [m appendFormat:@"        tensor<int32, [4]> b3 = const()[name=string(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*SCORE_CH+DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> kf = slice_by_size(x=x,begin=b3,size=sz_d)[name=string(\"s3\")];\n", DIM,SEQ];
-    // Reshape to multi-head
-    [m appendFormat:@"        tensor<int32, [4]> ssh = const()[name=string(\"ssh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> probs = reshape(shape=ssh,x=pf)[name=string(\"rp\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dp = reshape(shape=ssh,x=dpf)[name=string(\"rdp\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> rsh = const()[name=string(\"rsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
-    [m appendString:@"        tensor<int32, [4]> pm = const()[name=string(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> qr = reshape(shape=rsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=qr)[name=string(\"tq\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> kr = reshape(shape=rsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=kr)[name=string(\"tk\")];\n", HEADS,SEQ,HD];
-    // Softmax grad: ds = probs * (dp - sum(probs*dp)) * scale
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> pdp = mul(x=probs,y=dp)[name=string(\"pdp\")];\n", HEADS,SEQ,SEQ];
-    [m appendString:@"        tensor<int32, [1]> rax = const()[name=string(\"rax\"), val=tensor<int32, [1]>([-1])];\n"];
-    [m appendString:@"        bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,1]> spdp = reduce_sum(x=pdp,axes=rax,keep_dims=kd)[name=string(\"rs\")];\n", HEADS,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dps = sub(x=dp,y=spdp)[name=string(\"dps\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ds0 = mul(x=probs,y=dps)[name=string(\"ds0\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ds = mul(x=ds0,y=scv)[name=string(\"ds\")];\n", HEADS,SEQ,SEQ];
-    [m appendString:@"        bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"];
-    [m appendString:@"        bool bT = const()[name=string(\"bT\"), val=bool(true)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dq4 = matmul(transpose_x=bF,transpose_y=bF,x=ds,y=k)[name=string(\"dq\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dk4 = matmul(transpose_x=bT,transpose_y=bF,x=ds,y=q)[name=string(\"dk\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dqt = transpose(perm=pm,x=dq4)[name=string(\"dqt\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dkt = transpose(perm=pm,x=dk4)[name=string(\"dkt\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> fs = const()[name=string(\"fs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dqf = reshape(shape=fs,x=dqt)[name=string(\"dqf\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dkf = reshape(shape=fs,x=dkt)[name=string(\"dkf\")];\n", DIM,SEQ];
-    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
-    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dqf,dkf))[name=string(\"cat\")];\n", 2*DIM,SEQ];
-    [m appendString:@"    } -> (out);\n}\n"];
-    return m;
-}
-
-// ===== Weight builders =====
-static NSData *g_mask_blob = nil;
-static NSData *get_mask_blob(void) {
-    if (!g_mask_blob) {
-        _Float16 *mask = (_Float16*)calloc(SEQ*SEQ, sizeof(_Float16));
-        for(int t=0;t<SEQ;t++) for(int t2=0;t2<SEQ;t2++)
-            mask[t*SEQ+t2] = (t2<=t) ? (_Float16)0.0f : (_Float16)(-65504.0f);
-        g_mask_blob = build_blob_fp16(mask, SEQ*SEQ);
-        free(mask);
+// ===== Weight loading from llama2.c format =====
+// Fills lw[0..NLAYERS), rms_final and embed from path. Returns false (after printing why) on open failure, short header, shape mismatch or truncated file.
+static bool load_pretrained(LayerWeights *lw, float *rms_final, float *embed, const char *path) {
+    FILE *f = fopen(path, "rb");
+    if (!f) { printf("Cannot open %s\n", path); return false; }
+    Llama2Config cfg;
+    if (fread(&cfg, sizeof(cfg), 1, f) != 1) { printf("  ERROR: %s is too short to hold a model header\n", path); fclose(f); return false; }
+    printf("  Model config: dim=%d hidden=%d layers=%d heads=%d vocab=%d seq=%d\n", cfg.dim, cfg.hidden_dim, cfg.n_layers, cfg.n_heads, abs(cfg.vocab_size), cfg.seq_len);
+    if (cfg.dim != DIM || cfg.hidden_dim != HIDDEN || cfg.n_layers != NLAYERS || abs(cfg.vocab_size) != VOCAB) {
+        printf("  ERROR: Config mismatch! Expected dim=%d hidden=%d layers=%d vocab=%d\n", DIM, HIDDEN, NLAYERS, VOCAB);
+        fclose(f); return false;
     }
-    return g_mask_blob;
+    int V = abs(cfg.vocab_size);  // == VOCAB (checked above), so the embed read below fits a VOCAB*DIM-float buffer
+    bool shared = cfg.vocab_size > 0;
+
+    // Read in llama2.c order: embed, rms_att[all], wq[all], wk[all], wv[all], wo[all],
+    //                         rms_ffn[all], w1[all], w2[all], w3[all], rms_final, [wcls]
+    fread(embed, 4, V * DIM, f);
+    // rms_att weights for all layers (contiguous)
+    for (int L = 0; L < NLAYERS; L++) fread(lw[L].rms_att, 4, DIM, f);
+    // wq for all layers
+    for (int L = 0; L < NLAYERS; L++) fread(lw[L].Wq, 4, WQ_SZ, f);
+    // wk for all layers
+    for (int L = 0; L < NLAYERS; L++) fread(lw[L].Wk, 4, WQ_SZ, f);
+    // wv for all layers
+    for (int L = 0; L < NLAYERS; L++) fread(lw[L].Wv, 4, WQ_SZ, f);
+    // wo for all layers
+    for (int L = 0; L < NLAYERS; L++) fread(lw[L].Wo, 4, WO_SZ, f);
+    // rms_ffn weights for all layers
+    for (int L = 0; L < NLAYERS; L++) fread(lw[L].rms_ffn, 4, DIM, f);
+    // w1 for all layers
+    for (int L = 0; L < NLAYERS; L++) fread(lw[L].W1, 4, W1_SZ, f);
+    // w2 for all layers
+    for (int L = 0; L < NLAYERS; L++) fread(lw[L].W2, 4, W2_SZ, f);
+    // w3 for all layers
+    for (int L = 0; L < NLAYERS; L++) fread(lw[L].W3, 4, W3_SZ, f);
+    // rms_final
+    fread(rms_final, 4, DIM, f);
+    // wcls = embed if shared (we just use embed pointer); a separate wcls in the file is not read here
+    bool ok = !ferror(f) && !feof(f);  // any short fread above leaves the EOF or error indicator set
+    fclose(f);
+    if (!ok) { printf("  ERROR: %s is truncated or unreadable\n", path); return false; }
+    printf("  Loaded pretrained weights (%s)\n", shared ? "shared embed/cls" : "separate cls");
+    return true;
 }
 
-// ===== Kernel compilation and evaluation =====
-typedef struct { void *model; IOSurfaceRef ioIn, ioOut; void *request; void *tmpDir; } Kern;
+// ===== Compile one layer's kernels: fills the five weight-bearing slots of *lk from *w; returns true iff all five are non-NULL =====
+static bool compile_layer_kernels(LayerKernels *lk, LayerWeights *w) {
+    lk->fwdAttn = compile_kern_mil_w(gen_sdpa_fwd_taps(), (@{
+        @"@model_path/weights/rms1.bin": @{@"offset":@0, @"data":build_blob(w->rms_att,1,DIM)},
+        @"@model_path/weights/wq.bin": @{@"offset":@0, @"data":build_blob(w->Wq,DIM,DIM)},
+        @"@model_path/weights/wk.bin": @{@"offset":@0, @"data":build_blob(w->Wk,DIM,DIM)},
+        @"@model_path/weights/wv.bin": @{@"offset":@0, @"data":build_blob(w->Wv,DIM,DIM)},
+        @"@model_path/weights/wo.bin": @{@"offset":@0, @"data":build_blob(w->Wo,DIM,DIM)},
+        @"@model_path/weights/mask.bin": @{@"offset":@0, @"data":get_mask_blob()},
+    }), DIM*SEQ*2, 6*DIM*SEQ*2);  // trailing args: input and output buffer sizes in bytes (ic_bytes, oc_bytes)
 
-static Kern *compile_kern_mil_w(NSString *mil, NSDictionary *weights, int ic_bytes, int oc_bytes) {
-    @autoreleasepool {
-    NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
-    id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, weights, nil);
-    if (!desc) { printf("  [compile] desc=NULL\n"); return NULL; }
-    id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
-    id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
-    NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
-    [[NSFileManager defaultManager] createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];
-    [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
-    for (NSString *path in weights) {
-        NSString *rel = [path stringByReplacingOccurrencesOfString:@"@model_path/" withString:@""];
-        [weights[path][@"data"] writeToFile:[td stringByAppendingPathComponent:rel] atomically:YES];
-    }
-    NSError *e = nil;
-    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
-        printf("  [compile] FAIL: %s\n", e ? [[e description] UTF8String] : "no error"); return NULL;
-    }
-    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) {
-        printf("  [compile] load FAIL\n"); return NULL;
-    }
-    __sync_fetch_and_add(&g_compile_count, 1);
-    Kern *k = calloc(1, sizeof(Kern));
-    k->model = CFBridgingRetain(mdl);
-    k->ioIn = make_surface(ic_bytes);
-    k->ioOut = make_surface(oc_bytes);
-    id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), k->ioIn);
-    id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), k->ioOut);
-    k->request = CFBridgingRetain(((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
-        @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
-        @[wI], @[@0], @[wO], @[@0], nil, nil, @0));
-    k->tmpDir = CFBridgingRetain(td);
-    return k;
-    }
-}
-static void free_kern(Kern *k) {
-    if (!k) return;
-    id mdl = (__bridge id)k->model; NSError *e = nil;
-    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
-    CFRelease(k->ioIn); CFRelease(k->ioOut);
-    [[NSFileManager defaultManager] removeItemAtPath:(__bridge id)k->tmpDir error:nil];
-    CFRelease(k->model); CFRelease(k->request); CFRelease(k->tmpDir);
-    free(k);
-}
-static void ane_eval(Kern *k) {
-    id mdl = (__bridge id)k->model; id req = (__bridge id)k->request; NSError *e = nil;
-    ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
+    lk->fwdFFN = compile_kern_mil_w(gen_ffn_fwd_taps(), (@{
+        @"@model_path/weights/rms2.bin": @{@"offset":@0, @"data":build_blob(w->rms_ffn,1,DIM)},
+        @"@model_path/weights/w1.bin": @{@"offset":@0, @"data":build_blob(w->W1,HIDDEN,DIM)},
+        @"@model_path/weights/w3.bin": @{@"offset":@0, @"data":build_blob(w->W3,HIDDEN,DIM)},
+        @"@model_path/weights/w2.bin": @{@"offset":@0, @"data":build_blob(w->W2,DIM,HIDDEN)},
+    }), DIM*SEQ*2, (2*DIM+3*HIDDEN)*SEQ*2);
+
+    lk->ffnBwd = compile_kern_mil_w(gen_ffn_bwd(), (@{
+        // NOTE(review): the *t.bin blobs come from build_blob_t, presumably the transposed matrix; confirm in its definition
+        @"@model_path/weights/w2t.bin": @{@"offset":@0, @"data":build_blob_t(w->W2,DIM,HIDDEN)},
+        @"@model_path/weights/w1t.bin": @{@"offset":@0, @"data":build_blob_t(w->W1,HIDDEN,DIM)},
+        @"@model_path/weights/w3t.bin": @{@"offset":@0, @"data":build_blob_t(w->W3,HIDDEN,DIM)},
+    }), (DIM+2*HIDDEN)*SEQ*2, (DIM+2*HIDDEN)*SEQ*2);
+    lk->sdpaBwd1 = compile_kern_mil_w(gen_sdpa_bwd1(), (@{
+        @"@model_path/weights/mask.bin": @{@"offset":@0, @"data":get_mask_blob()},  // same mask blob source as fwdAttn
+        @"@model_path/weights/wot.bin": @{@"offset":@0, @"data":build_blob_t(w->Wo,DIM,DIM)},
+    }), 4*DIM*SEQ*2, (DIM+2*SCORE_CH)*SEQ*2);
+
+    lk->qkvBwd = compile_kern_mil_w(gen_qkvb(), (@{
+        @"@model_path/weights/wqt.bin": @{@"offset":@0, @"data":build_blob_t(w->Wq,DIM,DIM)},
+        @"@model_path/weights/wkt.bin": @{@"offset":@0, @"data":build_blob_t(w->Wk,DIM,DIM)},
+        @"@model_path/weights/wvt.bin": @{@"offset":@0, @"data":build_blob_t(w->Wv,DIM,DIM)},
+    }), 3*DIM*SEQ*2, DIM*SEQ*2);
+
+    return lk->fwdAttn && lk->fwdFFN && lk->ffnBwd && lk->sdpaBwd1 && lk->qkvBwd;  // on failure, kernels that did compile stay in *lk; nothing is freed here
 }
 
-// ===== Vectorized conversion helpers (NEON) =====
-#include <arm_neon.h>
-static void cvt_f16_f32(float *dst, const _Float16 *src, int n) {
-    int i = 0;
-    for (; i+7 < n; i += 8) {
-        float16x8_t h = vld1q_f16((const __fp16*)(src+i));
-        vst1q_f32(dst+i,   vcvt_f32_f16(vget_low_f16(h)));
-        vst1q_f32(dst+i+4, vcvt_f32_f16(vget_high_f16(h)));
-    }
-    for (; i < n; i++) dst[i] = (float)src[i];
-}
-static void cvt_f32_f16(_Float16 *dst, const float *src, int n) {
-    int i = 0;
-    for (; i+7 < n; i += 8) {
-        float16x8_t h = vcombine_f16(vcvt_f16_f32(vld1q_f32(src+i)),
-                                      vcvt_f16_f32(vld1q_f32(src+i+4)));
-        vst1q_f16((__fp16*)(dst+i), h);
-    }
-    for (; i < n; i++) dst[i] = (_Float16)src[i];
+// Build the sdpaBwd2 kernel. It is compiled with an empty weight dictionary, so it does not depend on any LayerWeights.
+static Kern *compile_sdpa_bwd2(void) {
+    const int in_bytes = (2*SCORE_CH+2*DIM)*SEQ*2, out_bytes = 2*DIM*SEQ*2;
+    return compile_kern_mil_w(gen_sdpa_bwd2(), @{}, in_bytes, out_bytes);
 }
 
-// ===== IOSurface I/O helpers (channel-first, no transpose) =====
-// All CPU buffers are [C,S] channel-first matching IOSurface [1,C,1,S]
-// Write fp32 [C,S] → fp16 [1,C,1,S] (just type conversion, no transpose)
-static void io_write_fp16(IOSurfaceRef s, const float *data, int channels, int sp) {
-    IOSurfaceLock(s, 0, NULL);
-    _Float16 *dst = (_Float16*)IOSurfaceGetBaseAddress(s);
-    cvt_f32_f16(dst, data, channels * sp);
-    IOSurfaceUnlock(s, 0, NULL);
-}
-// Write fp32 [C,S] → fp32 [1,C,1,S] (just memcpy)
-static void io_write_fp32(IOSurfaceRef s, const float *data, int channels, int sp) {
-    IOSurfaceLock(s, 0, NULL);
-    memcpy(IOSurfaceGetBaseAddress(s), data, channels * sp * sizeof(float));
-    IOSurfaceUnlock(s, 0, NULL);
-}
-// Read fp16 [1,C,1,S] → fp32 [C,S] at channel offset (just type conversion)
-static void io_read_fp16(IOSurfaceRef s, float *data, int ch_off, int channels, int sp) {
-    IOSurfaceLock(s, kIOSurfaceLockReadOnly, NULL);
-    _Float16 *src = (_Float16*)IOSurfaceGetBaseAddress(s) + ch_off * sp;
-    cvt_f16_f32(data, src, channels * sp);
-    IOSurfaceUnlock(s, kIOSurfaceLockReadOnly, NULL);
-}
-// Read fp32 [1,C,1,S] → fp32 [C,S] (just memcpy)
-static void io_read_fp32(IOSurfaceRef s, float *data, int channels, int sp) {
-    IOSurfaceLock(s, kIOSurfaceLockReadOnly, NULL);
-    memcpy(data, IOSurfaceGetBaseAddress(s), channels * sp * sizeof(float));
-    IOSurfaceUnlock(s, kIOSurfaceLockReadOnly, NULL);
-}
-// Write multiple fp32 [C,S] arrays concatenated along channel dim as fp16
-static void io_write_multi_fp16(IOSurfaceRef s, int sp, int n, ...) {
-    IOSurfaceLock(s, 0, NULL);
-    _Float16 *dst = (_Float16*)IOSurfaceGetBaseAddress(s);
-    va_list ap; va_start(ap, n);
-    int ch_off = 0;
-    for (int i=0; i<n; i++) {
-        const float *data = va_arg(ap, const float*);
-        int channels = va_arg(ap, int);
-        cvt_f32_f16(dst + ch_off*sp, data, channels * sp);
-        ch_off += channels;
-    }
-    va_end(ap);
-    IOSurfaceUnlock(s, 0, NULL);
-}
-// Direct copy between IOSurfaces (no format conversion — both fp16 channel-first)
-static void io_copy(IOSurfaceRef dst, int dst_ch, IOSurfaceRef src, int src_ch, int channels, int sp) {
-    IOSurfaceLock(dst, 0, NULL);
-    IOSurfaceLock(src, kIOSurfaceLockReadOnly, NULL);
-    memcpy((_Float16*)IOSurfaceGetBaseAddress(dst) + dst_ch*sp,
-           (_Float16*)IOSurfaceGetBaseAddress(src) + src_ch*sp,
-           channels * sp * sizeof(_Float16));
-    IOSurfaceUnlock(src, kIOSurfaceLockReadOnly, NULL);
-    IOSurfaceUnlock(dst, 0, NULL);
-}
-// Write one fp32 [C,S] array at specific channel offset in IOSurface as fp16
-static void io_write_fp16_at(IOSurfaceRef s, int ch_off, const float *data, int channels, int sp) {
-    IOSurfaceLock(s, 0, NULL);
-    _Float16 *dst = (_Float16*)IOSurfaceGetBaseAddress(s) + ch_off * sp;
-    cvt_f32_f16(dst, data, channels * sp);
-    IOSurfaceUnlock(s, 0, NULL);
+static void free_layer_kernels(LayerKernels *lk) {
+    // Release each per-layer kernel and clear its slot; sdpaBwd2 is shared and freed separately.
+    free_kern(lk->fwdAttn);  lk->fwdAttn  = NULL;  free_kern(lk->fwdFFN);   lk->fwdFFN   = NULL;
+    free_kern(lk->ffnBwd);   lk->ffnBwd   = NULL;  free_kern(lk->sdpaBwd1); lk->sdpaBwd1 = NULL;
+    free_kern(lk->qkvBwd);   lk->qkvBwd   = NULL;
 }
 
-// ===== CPU ops (channel-first [C,S] layout) =====
-// x[i*S+t] = channel i, position t
-// Process all positions in parallel using vectorized column ops
-static float *g_rms_tmp = NULL;
-static void rmsnorm(float *out, const float *x, const float *w, int d, int S) {
-    if (!g_rms_tmp) g_rms_tmp = malloc(S*4);
-    float *ss = calloc(S, sizeof(float));
-    for (int i=0; i<d; i++) {
-        vDSP_vmul(x+i*S, 1, x+i*S, 1, g_rms_tmp, 1, (vDSP_Length)S);
-        vDSP_vadd(g_rms_tmp, 1, ss, 1, ss, 1, (vDSP_Length)S);
+// ===== Checkpoint save/load =====
+// Write header, per-layer weights + Adam moments, final RMSNorm and embedding (+ their Adam moments) to path. Returns nothing; failures are reported on stdout only.
+static void save_checkpoint(const char *path, int step, int total_steps, float lr, float loss,
+                            double cc, double ct, double cw, int cs, int cb, int adam_t,
+                            LayerWeights *lw, LayerAdam *la, float *rms_final, AdamState *arms_final,
+                            float *embed, AdamState *aembed) {
+    FILE *f = fopen(path, "wb");
+    if (!f) { printf("Cannot open %s for writing, checkpoint not saved\n", path); return; }  // never fwrite through a NULL stream
+    CkptHdr h = {0}; h.magic = 0x424C5A54; h.version = 2;
+    h.step = step; h.total_steps = total_steps; h.lr = lr; h.loss = loss;
+    h.n_layers = NLAYERS; h.vocab_size = VOCAB; h.dim = DIM;
+    h.hidden_dim = HIDDEN; h.n_heads = HEADS; h.seq_len = SEQ;
+    h.cum_compile = cc; h.cum_train = ct; h.cum_wall = cw;
+    h.cum_steps = cs; h.cum_batches = cb; h.adam_t = adam_t;
+    fwrite(&h, sizeof(h), 1, f);
+    // Per-layer weights + adam
+    for (int L = 0; L < NLAYERS; L++) {
+        fwrite(lw[L].Wq,4,WQ_SZ,f); fwrite(lw[L].Wk,4,WQ_SZ,f);
+        fwrite(lw[L].Wv,4,WQ_SZ,f); fwrite(lw[L].Wo,4,WO_SZ,f);
+        fwrite(lw[L].W1,4,W1_SZ,f); fwrite(lw[L].W2,4,W2_SZ,f); fwrite(lw[L].W3,4,W3_SZ,f);
+        fwrite(lw[L].rms_att,4,DIM,f); fwrite(lw[L].rms_ffn,4,DIM,f);
+        // Adam state
+        fwrite(la[L].Wq.m,4,WQ_SZ,f); fwrite(la[L].Wq.v,4,WQ_SZ,f);
+        fwrite(la[L].Wk.m,4,WQ_SZ,f); fwrite(la[L].Wk.v,4,WQ_SZ,f);
+        fwrite(la[L].Wv.m,4,WQ_SZ,f); fwrite(la[L].Wv.v,4,WQ_SZ,f);
+        fwrite(la[L].Wo.m,4,WO_SZ,f); fwrite(la[L].Wo.v,4,WO_SZ,f);
+        fwrite(la[L].W1.m,4,W1_SZ,f); fwrite(la[L].W1.v,4,W1_SZ,f);
+        fwrite(la[L].W2.m,4,W2_SZ,f); fwrite(la[L].W2.v,4,W2_SZ,f);
+        fwrite(la[L].W3.m,4,W3_SZ,f); fwrite(la[L].W3.v,4,W3_SZ,f);
+        fwrite(la[L].rms_att.m,4,DIM,f); fwrite(la[L].rms_att.v,4,DIM,f);
+        fwrite(la[L].rms_ffn.m,4,DIM,f); fwrite(la[L].rms_ffn.v,4,DIM,f);
     }
-    float invd = 1.0f/d, eps=1e-5f;
-    vDSP_vsmsa(ss, 1, &invd, &eps, ss, 1, (vDSP_Length)S);
-    int n = S; vvrsqrtf(ss, ss, &n);
-    for (int i=0; i<d; i++) {
-        vDSP_vmul(x+i*S, 1, ss, 1, out+i*S, 1, (vDSP_Length)S);
-        vDSP_vsmul(out+i*S, 1, &w[i], out+i*S, 1, (vDSP_Length)S);
-    }
-    free(ss);
-}
-static void rmsnorm_bwd(float *dx, float *dw, const float *dy, const float *x, const float *w, int d, int S) {
-    if (!g_rms_tmp) g_rms_tmp = malloc(S*4);
-    float *ss = calloc(S, sizeof(float));
-    for (int i=0; i<d; i++) {
-        vDSP_vmul(x+i*S, 1, x+i*S, 1, g_rms_tmp, 1, (vDSP_Length)S);
-        vDSP_vadd(g_rms_tmp, 1, ss, 1, ss, 1, (vDSP_Length)S);
-    }
-    float invd = 1.0f/d, eps=1e-5f;
-    vDSP_vsmsa(ss, 1, &invd, &eps, ss, 1, (vDSP_Length)S);
-    float *rrms = malloc(S*4);
-    int n = S; vvrsqrtf(rrms, ss, &n);
-    // dot[t] = sum_i dy[i,t]*x[i,t]*w[i]
-    float *dot = calloc(S, sizeof(float));
-    for (int i=0; i<d; i++) {
-        vDSP_vmul(dy+i*S, 1, x+i*S, 1, g_rms_tmp, 1, (vDSP_Length)S);
-        // dot += tmp * w[i]  (vDSP_vsma: A*scalar+C→D)
-        vDSP_vsma(g_rms_tmp, 1, &w[i], dot, 1, dot, 1, (vDSP_Length)S);
-    }
-    // dot *= rrms^2/d
-    vDSP_vmul(rrms, 1, rrms, 1, ss, 1, (vDSP_Length)S);
-    vDSP_vsmul(ss, 1, &invd, ss, 1, (vDSP_Length)S);
-    vDSP_vmul(dot, 1, ss, 1, dot, 1, (vDSP_Length)S);
-    for (int i=0; i<d; i++) {
-        vDSP_vmul(x+i*S, 1, dot, 1, g_rms_tmp, 1, (vDSP_Length)S);
-        vDSP_vsub(g_rms_tmp, 1, dy+i*S, 1, g_rms_tmp, 1, (vDSP_Length)S);
-        vDSP_vmul(g_rms_tmp, 1, rrms, 1, g_rms_tmp, 1, (vDSP_Length)S);
-        vDSP_vsmul(g_rms_tmp, 1, &w[i], dx+i*S, 1, (vDSP_Length)S);
-        // dw[i] += sum_t dy[i,t]*x[i,t]*rrms[t]
-        vDSP_vmul(dy+i*S, 1, x+i*S, 1, g_rms_tmp, 1, (vDSP_Length)S);
-        vDSP_vmul(g_rms_tmp, 1, rrms, 1, g_rms_tmp, 1, (vDSP_Length)S);
-        float s; vDSP_sve(g_rms_tmp, 1, &s, (vDSP_Length)S);
-        dw[i] += s;
-    }
-    free(ss); free(rrms); free(dot);
+    fwrite(rms_final,4,DIM,f); fwrite(arms_final->m,4,DIM,f); fwrite(arms_final->v,4,DIM,f);
+    fwrite(embed,4,VOCAB*DIM,f); fwrite(aembed->m,4,VOCAB*DIM,f); fwrite(aembed->v,4,VOCAB*DIM,f);
+    // Sample the error indicator before fclose (the stream is unusable afterwards); fclose can itself fail on the final flush.
+    int werr = ferror(f);
+    if (fclose(f) != 0 || werr) printf("WARNING: writing checkpoint %s failed; the file may be incomplete\n", path);
 }
 
-// ===== Checkpoint =====
-typedef struct {
-    int step, total_steps;
-    float lr, loss;
-    double cum_compile, cum_train, cum_wall;
-    int cum_steps, cum_batches;
-    int adam_t; // Adam timestep
-} CkptHdr;
-
-// Adam optimizer state
-typedef struct {
-    float *m, *v; // first and second moment
-    size_t n;
-} AdamState;
-static AdamState adam_alloc(size_t n) { return (AdamState){calloc(n,4), calloc(n,4), n}; }
-static void adam_free(AdamState *s) { free(s->m); free(s->v); }
-static void adam_update(float *w, const float *g, AdamState *s, int t, float lr, float b1, float b2, float eps) {
-    float bc1 = 1.0f - powf(b1, t), bc2 = 1.0f - powf(b2, t);
-    for (size_t i=0; i<s->n; i++) {
-        s->m[i] = b1*s->m[i] + (1-b1)*g[i];
-        s->v[i] = b2*s->v[i] + (1-b2)*g[i]*g[i];
-        float mh = s->m[i]/bc1, vh = s->v[i]/bc2;
-        w[i] -= lr * mh / (sqrtf(vh) + eps);
+// Load a checkpoint written by save_checkpoint. Returns false when the file cannot be opened, the header is short
+// or has the wrong magic/version/model shape, or the payload is truncated. The scalar out-params are written only
+// on success; the weight and Adam buffers may already be partially overwritten when a truncated file is rejected.
+static bool load_checkpoint(const char *path, int *step, int *total_steps, float *lr, float *loss,
+                             double *cc, double *ct, double *cw, int *cs, int *cb, int *adam_t,
+                             LayerWeights *lw, LayerAdam *la, float *rms_final, AdamState *arms_final,
+                             float *embed, AdamState *aembed) {
+    FILE *f = fopen(path, "rb");
+    if (!f) return false;
+    CkptHdr h;
+    if (fread(&h, sizeof(h), 1, f) != 1 || h.magic != 0x424C5A54 || h.version != 2) { fclose(f); return false; }
+    if (h.n_layers != NLAYERS || h.vocab_size != VOCAB || h.dim != DIM || h.hidden_dim != HIDDEN) { fclose(f); return false; }
+    for (int L = 0; L < NLAYERS; L++) {
+        fread(lw[L].Wq,4,WQ_SZ,f); fread(lw[L].Wk,4,WQ_SZ,f);
+        fread(lw[L].Wv,4,WQ_SZ,f); fread(lw[L].Wo,4,WO_SZ,f);
+        fread(lw[L].W1,4,W1_SZ,f); fread(lw[L].W2,4,W2_SZ,f); fread(lw[L].W3,4,W3_SZ,f);
+        fread(lw[L].rms_att,4,DIM,f); fread(lw[L].rms_ffn,4,DIM,f);
+        fread(la[L].Wq.m,4,WQ_SZ,f); fread(la[L].Wq.v,4,WQ_SZ,f);
+        fread(la[L].Wk.m,4,WQ_SZ,f); fread(la[L].Wk.v,4,WQ_SZ,f);
+        fread(la[L].Wv.m,4,WQ_SZ,f); fread(la[L].Wv.v,4,WQ_SZ,f);
+        fread(la[L].Wo.m,4,WO_SZ,f); fread(la[L].Wo.v,4,WO_SZ,f);
+        fread(la[L].W1.m,4,W1_SZ,f); fread(la[L].W1.v,4,W1_SZ,f);
+        fread(la[L].W2.m,4,W2_SZ,f); fread(la[L].W2.v,4,W2_SZ,f);
+        fread(la[L].W3.m,4,W3_SZ,f); fread(la[L].W3.v,4,W3_SZ,f);
+        fread(la[L].rms_att.m,4,DIM,f); fread(la[L].rms_att.v,4,DIM,f);
+        fread(la[L].rms_ffn.m,4,DIM,f); fread(la[L].rms_ffn.v,4,DIM,f);
     }
+    fread(rms_final,4,DIM,f); fread(arms_final->m,4,DIM,f); fread(arms_final->v,4,DIM,f);
+    fread(embed,4,VOCAB*DIM,f); fread(aembed->m,4,VOCAB*DIM,f); fread(aembed->v,4,VOCAB*DIM,f);
+    bool ok = !ferror(f) && !feof(f); fclose(f);  // any short fread above leaves the EOF or error indicator set
+    if (ok) { *step = h.step; *total_steps = h.total_steps; *lr = h.lr; *loss = h.loss; *adam_t = h.adam_t;
+              *cc = h.cum_compile; *ct = h.cum_train; *cw = h.cum_wall; *cs = h.cum_steps; *cb = h.cum_batches; }
+    return ok;
 }
 
+// ===== Main =====
 int main(int argc, char *argv[]) {
     @autoreleasepool {
         setbuf(stdout, NULL);
         ane_init();
         mach_timebase_info(&g_tb);
 
-        int total_steps = 400;
-        float lr = 1e-3f;
+        int total_steps = 10000;
+        float lr = 3e-4f;
         float adam_b1=0.9f, adam_b2=0.999f, adam_eps=1e-8f;
-        int adam_t = 0;
-        int start_step = 0;
+        int adam_t = 0, start_step = 0;
 
-        size_t wq_sz = DIM*DIM, wo_sz = DIM*DIM;
-        size_t w1_sz = HIDDEN*DIM, w2_sz = DIM*HIDDEN, w3_sz = HIDDEN*DIM;
-        size_t total_params = 4*wq_sz + w1_sz + w2_sz + w3_sz;
+        // Parse args
+        bool do_resume = false;
+        for (int i=1; i<argc; i++) {
+            if (strcmp(argv[i], "--resume") == 0) do_resume = true;
+            else if (strcmp(argv[i], "--steps") == 0 && i+1<argc) total_steps = atoi(argv[++i]);
+            else if (strcmp(argv[i], "--lr") == 0 && i+1<argc) lr = atof(argv[++i]);
+        }
 
-        float *Wq=malloc(wq_sz*4), *Wk=malloc(wq_sz*4), *Wv=malloc(wq_sz*4), *Wo=malloc(wo_sz*4);
-        float *W1=malloc(w1_sz*4), *W2=malloc(w2_sz*4), *W3=malloc(w3_sz*4);
-        float *rms1_w=malloc(DIM*4), *rms2_w=malloc(DIM*4);
+        // Allocate per-layer state
+        LayerWeights lw[NLAYERS];
+        LayerAdam la[NLAYERS];
+        LayerActs acts[NLAYERS];
+        LayerGrads grads[NLAYERS];
+        LayerKernels kern[NLAYERS];
+        for (int L=0; L<NLAYERS; L++) {
+            lw[L] = layer_weights_alloc();
+            la[L] = layer_adam_alloc();
+            acts[L] = layer_acts_alloc();
+            grads[L] = layer_grads_alloc();
+            memset(&kern[L], 0, sizeof(LayerKernels));
+        }
 
-        // Adam optimizer states (m and v for each weight)
-        AdamState aWq=adam_alloc(wq_sz), aWk=adam_alloc(wq_sz), aWv=adam_alloc(wq_sz), aWo=adam_alloc(wo_sz);
-        AdamState aW1=adam_alloc(w1_sz), aW2=adam_alloc(w2_sz), aW3=adam_alloc(w3_sz);
-        AdamState arms1=adam_alloc(DIM), arms2=adam_alloc(DIM);
+        // Final RMSNorm + embedding + classifier
+        float *rms_final = (float*)malloc(DIM*4);
+        float *embed = (float*)malloc(VOCAB*DIM*4);  // [VOCAB, DIM] row-major
+        float *grms_final = (float*)calloc(DIM, 4);
+        float *gembed = (float*)calloc(VOCAB*DIM, 4);
+        AdamState arms_final = adam_alloc(DIM);
+        AdamState aembed = adam_alloc((size_t)VOCAB*DIM);
 
         double cum_compile=0, cum_train=0, cum_wall=0;
         int cum_steps=0, cum_batches=0;
 
+        float resume_loss = 0;
         bool resuming = false;
-        if (argc > 1 && strcmp(argv[1], "--resume") == 0) {
-            FILE *f = fopen(CKPT_PATH, "rb");
-            if (f) {
-                CkptHdr h; fread(&h, sizeof(h), 1, f);
-                start_step=h.step; total_steps=h.total_steps; lr=h.lr;
-                cum_compile=h.cum_compile; cum_train=h.cum_train; cum_wall=h.cum_wall;
-                cum_steps=h.cum_steps; cum_batches=h.cum_batches; adam_t=h.adam_t;
-                fread(Wq,4,wq_sz,f); fread(Wk,4,wq_sz,f); fread(Wv,4,wq_sz,f); fread(Wo,4,wo_sz,f);
-                fread(W1,4,w1_sz,f); fread(W2,4,w2_sz,f); fread(W3,4,w3_sz,f);
-                fread(rms1_w,4,DIM,f); fread(rms2_w,4,DIM,f);
-                // Adam state
-                fread(aWq.m,4,wq_sz,f);fread(aWq.v,4,wq_sz,f);
-                fread(aWk.m,4,wq_sz,f);fread(aWk.v,4,wq_sz,f);
-                fread(aWv.m,4,wq_sz,f);fread(aWv.v,4,wq_sz,f);
-                fread(aWo.m,4,wo_sz,f);fread(aWo.v,4,wo_sz,f);
-                fread(aW1.m,4,w1_sz,f);fread(aW1.v,4,w1_sz,f);
-                fread(aW2.m,4,w2_sz,f);fread(aW2.v,4,w2_sz,f);
-                fread(aW3.m,4,w3_sz,f);fread(aW3.v,4,w3_sz,f);
-                fread(arms1.m,4,DIM,f);fread(arms1.v,4,DIM,f);
-                fread(arms2.m,4,DIM,f);fread(arms2.v,4,DIM,f);
-                fclose(f);
-                resuming = true;
-                printf("[RESUMED step %d, loss=%.6f]\n", start_step, h.loss);
+        if (do_resume) {
+            resuming = load_checkpoint(CKPT_PATH, &start_step, &total_steps, &lr, &resume_loss,
+                &cum_compile, &cum_train, &cum_wall, &cum_steps, &cum_batches, &adam_t,
+                lw, la, rms_final, &arms_final, embed, &aembed);
+            if (resuming) printf("[RESUMED step %d, loss=%.4f]\n", start_step, resume_loss);
+        }
+        if (!resuming) {
+            printf("=== ANE Training: Stories110M (12 layers) ===\n");
+            printf("dim=%d hidden=%d heads=%d seq=%d vocab=%d layers=%d\n", DIM, HIDDEN, HEADS, SEQ, VOCAB, NLAYERS);
+            if (!load_pretrained(lw, rms_final, embed, MODEL_PATH)) {
+                printf("Pretrained load failed, using random init\n");
+                srand48(42);
+                float scale_d=1.0f/sqrtf(DIM), scale_h=1.0f/sqrtf(HIDDEN);
+                for (int L=0; L<NLAYERS; L++) {
+                    for(size_t i=0;i<WQ_SZ;i++){lw[L].Wq[i]=scale_d*(2*drand48()-1);lw[L].Wk[i]=scale_d*(2*drand48()-1);}
+                    for(size_t i=0;i<WQ_SZ;i++){lw[L].Wv[i]=scale_d*(2*drand48()-1);lw[L].Wo[i]=scale_d*(2*drand48()-1);}
+                    for(size_t i=0;i<W1_SZ;i++) lw[L].W1[i]=scale_h*(2*drand48()-1);
+                    for(size_t i=0;i<W2_SZ;i++) lw[L].W2[i]=scale_d*(2*drand48()-1);
+                    for(size_t i=0;i<W3_SZ;i++) lw[L].W3[i]=scale_h*(2*drand48()-1);
+                    for(int i=0;i<DIM;i++){lw[L].rms_att[i]=1.0f; lw[L].rms_ffn[i]=1.0f;}
+                }
+                for(int i=0;i<DIM;i++) rms_final[i]=1.0f;
+                float escale = 0.02f;
+                for(size_t i=0;i<(size_t)VOCAB*DIM;i++) embed[i]=escale*(2*drand48()-1);
             }
-        }
-        if (!resuming) {
-            srand48(42);
-            float scale_d=1.0f/sqrtf(DIM), scale_h=1.0f/sqrtf(HIDDEN);
-            for(size_t i=0;i<wq_sz;i++){Wq[i]=scale_d*(2*drand48()-1);Wk[i]=scale_d*(2*drand48()-1);}
-            for(size_t i=0;i<wq_sz;i++){Wv[i]=scale_d*(2*drand48()-1);Wo[i]=scale_d*(2*drand48()-1);}
-            for(size_t i=0;i<w1_sz;i++) W1[i]=scale_h*(2*drand48()-1);
-            for(size_t i=0;i<w2_sz;i++) W2[i]=scale_d*(2*drand48()-1);
-            for(size_t i=0;i<w3_sz;i++) W3[i]=scale_h*(2*drand48()-1);
-            for(int i=0;i<DIM;i++){rms1_w[i]=1.0f; rms2_w[i]=1.0f;}
-        }
-
-        if (!resuming) {
-            // FLOP accounting: 7 weight matrices, each 2*OC*IC*SEQ for forward
-            double fwd_flops = 4.0*2*DIM*DIM*SEQ + 2.0*2*DIM*HIDDEN*SEQ + 2.0*HIDDEN*DIM*SEQ;
-            double bwd_dx_flops = fwd_flops; // same matmuls transposed
-            double bwd_dw_flops = fwd_flops; // dW = dy^T @ x, same FLOPs
-            double sdpa_flops = 2.0*HEADS*5*SEQ*SEQ*HD; // 5 SEQ×SEQ matmuls in backward
-            double total_flops = fwd_flops + bwd_dx_flops + bwd_dw_flops + sdpa_flops;
-            double ane_flops_step = fwd_flops + bwd_dx_flops + sdpa_flops;
-            printf("=== ANE Training: Fully-ANE Pipeline ===\n");
-            printf("dim=%d hidden=%d heads=%d seq=%d\n", DIM, HIDDEN, HEADS, SEQ);
-            printf("Params: %.2fM | Weights: %.1fMB FP16\n", total_params/1e6, total_params*2.0/1e6);
-            printf("Kernels: %d (fwdAttn+fwdFFN+ffnBwd+sdpaBwd1+sdpaBwd2+qkvBwd)\n", NUM_KERNELS);
-            printf("Accum %d steps per recompile | Adam LR=%.1e b1=%.1f b2=%.3f\n\n", ACCUM_STEPS, lr, adam_b1, adam_b2);
+            size_t tp = (size_t)NLAYERS*LAYER_PARAMS + DIM + (size_t)VOCAB*DIM;
+            double xfmr_params = (double)NLAYERS*LAYER_PARAMS;
+            double embed_params = (double)VOCAB*DIM;
+            printf("Params: %.2fM (transformer %.2fM + embed %.2fM)\n", tp/1e6, xfmr_params/1e6, embed_params/1e6);
+            printf("Kernels: %d (%d weight-bearing + %d static sdpaBwd2)\n",
+                   TOTAL_WEIGHT_KERNELS+NLAYERS, TOTAL_WEIGHT_KERNELS, NLAYERS);
+            printf("Accum %d steps per recompile | Adam LR=%.1e b1=%.1f b2=%.3f\n", ACCUM_STEPS, lr, adam_b1, adam_b2);
+            double fwd_f = NLAYERS*(4.0*2*DIM*DIM*SEQ + 2.0*2*DIM*HIDDEN*SEQ + 2.0*HIDDEN*DIM*SEQ);
+            double bwd_dx_f = fwd_f, bwd_dw_f = fwd_f;
+            double sdpa_f = NLAYERS*2.0*HEADS*5*SEQ*SEQ*HD;
+            double cls_f = 2.0*VOCAB*DIM*SEQ;
+            double total_f = fwd_f + bwd_dx_f + bwd_dw_f + sdpa_f + cls_f*3;
+            double ane_f = fwd_f + bwd_dx_f + sdpa_f;
             printf("FLOPs/step: fwd=%.0fM bwd_dx=%.0fM bwd_dW=%.0fM sdpa_bwd=%.0fM total=%.0fM\n",
-                   fwd_flops/1e6, bwd_dx_flops/1e6, bwd_dw_flops/1e6, sdpa_flops/1e6, total_flops/1e6);
-            printf("ANE FLOPs/step: %.0fM (fwd+bwd_dx+sdpa_bwd) | CPU: dW (cblas)\n\n", ane_flops_step/1e6);
+                   fwd_f/1e6, bwd_dx_f/1e6, bwd_dw_f/1e6, sdpa_f/1e6, total_f/1e6);
+            printf("ANE FLOPs/step: %.0fM (fwd+bwd_dx+sdpa_bwd) | CPU: dW+cls (cblas)\n\n", ane_f/1e6);
         }
 
-        // Training data
-        float *x_in=malloc(SEQ*DIM*4), *y_tgt=malloc(SEQ*DIM*4);
-        // Training data in channel-first [C,S] layout
-        if (!resuming) srand48(42);
-        for(int c=0;c<DIM;c++) for(int t=0;t<SEQ;t++) {
-            int idx = c*SEQ+t;
-            x_in[idx]=0.1f*(2*drand48()-1);
-            y_tgt[idx]=0.1f*sinf(idx*0.03f+1.0f);
+        // mmap token data
+        int data_fd = open(DATA_PATH, O_RDONLY);
+        if (data_fd < 0) { printf("Cannot open %s\n", DATA_PATH); return 1; }
+        struct stat st; fstat(data_fd, &st);
+        size_t data_len = st.st_size;
+        uint16_t *token_data = (uint16_t*)mmap(NULL, data_len, PROT_READ, MAP_PRIVATE, data_fd, 0);
+        if (token_data == MAP_FAILED) { printf("mmap failed\n"); return 1; }
+        size_t n_tokens = data_len / 2;
+        printf("Token data: %zu tokens (%.1f MB)\n", n_tokens, data_len/1e6);
+
+        // Gradient buffers shared across layers (reused each step)
+        float *dy = (float*)malloc(SEQ*DIM*4);            // gradient flowing backward
+        float *dffn = (float*)malloc(SEQ*DIM*4);
+        float *dh1 = (float*)malloc(SEQ*HIDDEN*4);
+        float *dh3 = (float*)malloc(SEQ*HIDDEN*4);
+        float *dx_ffn = (float*)malloc(SEQ*DIM*4);
+        float *dx2 = (float*)malloc(SEQ*DIM*4);
+        float *do_out_buf = (float*)malloc(SEQ*DIM*4);
+        float *dq = (float*)malloc(SEQ*DIM*4);
+        float *dk = (float*)malloc(SEQ*DIM*4);
+        float *dv = (float*)malloc(SEQ*DIM*4);
+        float *dx_attn = (float*)malloc(SEQ*DIM*4);
+
+        // x buffer for input to each layer (channel-first [DIM, SEQ])
+        float *x_cur = (float*)malloc(SEQ*DIM*4);
+        float *x_final = (float*)malloc(SEQ*DIM*4);     // after final rmsnorm
+        float *logits = (float*)malloc(SEQ*VOCAB*4);     // [VOCAB, SEQ] for cross-entropy
+        float *dlogits = (float*)malloc(SEQ*VOCAB*4);
+
+        // Compile static sdpaBwd2 kernels (no weights, one per layer)
+        Kern *sdpaBwd2[NLAYERS];
+        for (int L=0; L<NLAYERS; L++) {
+            sdpaBwd2[L] = compile_sdpa_bwd2();
+            if (!sdpaBwd2[L]) { printf("sdpaBwd2 compile failed\n"); return 1; }
         }
 
-        // Activation buffers (saved from forward for backward)
-        float *xnorm=malloc(SEQ*DIM*4);
-        float *Q=malloc(SEQ*DIM*4), *K=malloc(SEQ*DIM*4), *V=malloc(SEQ*DIM*4);
-        float *attn_out=malloc(SEQ*DIM*4), *o_out=malloc(SEQ*DIM*4);
-        float *x2=malloc(SEQ*DIM*4), *x2norm=malloc(SEQ*DIM*4);
-        float *h1=malloc(SEQ*HIDDEN*4), *h3=malloc(SEQ*HIDDEN*4), *silu_out=malloc(SEQ*HIDDEN*4);
-        float *ffn_out=malloc(SEQ*DIM*4), *y_out=malloc(SEQ*DIM*4);
-
-        // Gradient buffers
-        float *dy=malloc(SEQ*DIM*4), *dffn=malloc(SEQ*DIM*4);
-        float *dh1=malloc(SEQ*HIDDEN*4), *dh3=malloc(SEQ*HIDDEN*4);
-        float *dx_ffn=malloc(SEQ*DIM*4), *dx2=malloc(SEQ*DIM*4);
-        float *do_out_buf=malloc(SEQ*DIM*4), *dattn=malloc(SEQ*DIM*4);
-        float *dq=malloc(SEQ*DIM*4), *dk=malloc(SEQ*DIM*4), *dv=malloc(SEQ*DIM*4);
-        float *dx_attn=malloc(SEQ*DIM*4);
-        // SDPA bwd intermediates
-        float *probs_flat=malloc(SEQ*DIM*4), *dp_flat=malloc(SEQ*DIM*4);
-
-        // Gradient accumulators
-        float *gWq=calloc(wq_sz,4), *gWk=calloc(wq_sz,4), *gWv=calloc(wq_sz,4), *gWo=calloc(wo_sz,4);
-        float *gW1=calloc(w1_sz,4), *gW2=calloc(w2_sz,4), *gW3=calloc(w3_sz,4);
-        float *grms1=calloc(DIM,4), *grms2=calloc(DIM,4);
-
-        // 7 ANE kernels
-        Kern *kFwdAttn=NULL, *kFwdFFN=NULL, *kFFNBwd=NULL;
-        Kern *kSdpaBwd1=NULL, *kSdpaBwd2=NULL, *kQKVb=NULL;
-
-        // Compile static (weight-free) kernels ONCE
-        kSdpaBwd2 = compile_kern_mil_w(gen_sdpa_bwd2(), @{},
-            (2*SCORE_CH+2*DIM)*SEQ*2, 2*DIM*SEQ*2);
-        if (!kSdpaBwd2) { printf("Static kernel compile failed\n"); return 1; }
-
-        // GCD queue for async dW cblas (overlaps with ANE evals)
         dispatch_queue_t dw_q = dispatch_queue_create("dw_cblas", DISPATCH_QUEUE_SERIAL);
         dispatch_group_t dw_grp = dispatch_group_create();
 
@@ -720,286 +314,351 @@ int main(int argc, char *argv[]) {
         int total_steps_done=0, total_batches=0;
         uint64_t t_wall_start = mach_absolute_time();
 
+        srand48(42 + start_step);
+
         int step = start_step;
         while (step < total_steps) {
-            // Check compile budget — 5 weight-bearing kernels per batch
-            if (g_compile_count + 5 > MAX_COMPILES) {
-                free_kern(kFwdAttn);free_kern(kFwdFFN);free_kern(kFFNBwd);
-                free_kern(kSdpaBwd1);free_kern(kQKVb);
-                free_kern(kSdpaBwd2);
+            // Check compile budget
+            if (g_compile_count + TOTAL_WEIGHT_KERNELS > MAX_COMPILES) {
+                for (int L=0; L<NLAYERS; L++) { free_layer_kernels(&kern[L]); free_kern(sdpaBwd2[L]); }
                 double wall = tb_ms(mach_absolute_time() - t_wall_start);
-                FILE *f = fopen(CKPT_PATH, "wb");
-                CkptHdr h = {step,total_steps,lr,last_loss,
+                save_checkpoint(CKPT_PATH, step, total_steps, lr, last_loss,
                     total_compile_ms+cum_compile, total_train_ms+cum_train, wall+cum_wall,
-                    total_steps_done+cum_steps, total_batches+cum_batches, adam_t};
-                fwrite(&h,sizeof(h),1,f);
-                fwrite(Wq,4,wq_sz,f);fwrite(Wk,4,wq_sz,f);fwrite(Wv,4,wq_sz,f);fwrite(Wo,4,wo_sz,f);
-                fwrite(W1,4,w1_sz,f);fwrite(W2,4,w2_sz,f);fwrite(W3,4,w3_sz,f);
-                fwrite(rms1_w,4,DIM,f);fwrite(rms2_w,4,DIM,f);
-                // Adam state
-                fwrite(aWq.m,4,wq_sz,f);fwrite(aWq.v,4,wq_sz,f);
-                fwrite(aWk.m,4,wq_sz,f);fwrite(aWk.v,4,wq_sz,f);
-                fwrite(aWv.m,4,wq_sz,f);fwrite(aWv.v,4,wq_sz,f);
-                fwrite(aWo.m,4,wo_sz,f);fwrite(aWo.v,4,wo_sz,f);
-                fwrite(aW1.m,4,w1_sz,f);fwrite(aW1.v,4,w1_sz,f);
-                fwrite(aW2.m,4,w2_sz,f);fwrite(aW2.v,4,w2_sz,f);
-                fwrite(aW3.m,4,w3_sz,f);fwrite(aW3.v,4,w3_sz,f);
-                fwrite(arms1.m,4,DIM,f);fwrite(arms1.v,4,DIM,f);
-                fwrite(arms2.m,4,DIM,f);fwrite(arms2.v,4,DIM,f);
-                fclose(f);
-                printf("[exec() restart step %d, %d compiles, loss=%.6f]\n", step, g_compile_count, last_loss);
+                    total_steps_done+cum_steps, total_batches+cum_batches, adam_t,
+                    lw, la, rms_final, &arms_final, embed, &aembed);
+                printf("[exec() restart step %d, %d compiles, loss=%.4f]\n", step, g_compile_count, last_loss);
                 fflush(stdout);
                 execl(argv[0], argv[0], "--resume", NULL);
                 perror("execl"); return 1;
             }
 
-            // Compile 5 weight-bearing kernels (sdpaBwd2 compiled once above)
+            // Compile all layers' weight-bearing kernels
             uint64_t tc = mach_absolute_time();
-            free_kern(kFwdAttn);free_kern(kFwdFFN);free_kern(kFFNBwd);
-            free_kern(kSdpaBwd1);free_kern(kQKVb);
+            for (int L=0; L<NLAYERS; L++) free_layer_kernels(&kern[L]);
 
-            kFwdAttn = compile_kern_mil_w(gen_sdpa_fwd_taps(), (@{
-                @"@model_path/weights/rms1.bin": @{@"offset":@0, @"data":build_blob(rms1_w,1,DIM)},
-                @"@model_path/weights/wq.bin": @{@"offset":@0, @"data":build_blob(Wq,DIM,DIM)},
-                @"@model_path/weights/wk.bin": @{@"offset":@0, @"data":build_blob(Wk,DIM,DIM)},
-                @"@model_path/weights/wv.bin": @{@"offset":@0, @"data":build_blob(Wv,DIM,DIM)},
-                @"@model_path/weights/wo.bin": @{@"offset":@0, @"data":build_blob(Wo,DIM,DIM)},
-                @"@model_path/weights/mask.bin": @{@"offset":@0, @"data":get_mask_blob()},
-            }), DIM*SEQ*2, 6*DIM*SEQ*2);
+            bool compile_ok = true;
+            for (int L=0; L<NLAYERS; L++) {
+                printf("  Compiling layer %d/%d... (%d compiles)\r", L+1, NLAYERS, g_compile_count);
+                fflush(stdout);
+                if (!compile_layer_kernels(&kern[L], &lw[L])) {
+                    printf("\nCompile failed at layer %d, restart\n", L);
+                    compile_ok = false; break;
+                }
+            }
+            if (!compile_ok) { g_compile_count = MAX_COMPILES; continue; }
 
-            kFwdFFN = compile_kern_mil_w(gen_ffn_fwd_taps(), (@{
-                @"@model_path/weights/rms2.bin": @{@"offset":@0, @"data":build_blob(rms2_w,1,DIM)},
-                @"@model_path/weights/w1.bin": @{@"offset":@0, @"data":build_blob(W1,HIDDEN,DIM)},
-                @"@model_path/weights/w3.bin": @{@"offset":@0, @"data":build_blob(W3,HIDDEN,DIM)},
-                @"@model_path/weights/w2.bin": @{@"offset":@0, @"data":build_blob(W2,DIM,HIDDEN)},
-            }), DIM*SEQ*2, (2*DIM+3*HIDDEN)*SEQ*2);
-
-            kFFNBwd = compile_kern_mil_w(gen_ffn_bwd(), (@{
-                @"@model_path/weights/w2t.bin": @{@"offset":@0, @"data":build_blob_t(W2,DIM,HIDDEN)},
-                @"@model_path/weights/w1t.bin": @{@"offset":@0, @"data":build_blob_t(W1,HIDDEN,DIM)},
-                @"@model_path/weights/w3t.bin": @{@"offset":@0, @"data":build_blob_t(W3,HIDDEN,DIM)},
-            }), (DIM+2*HIDDEN)*SEQ*2, (DIM+2*HIDDEN)*SEQ*2);
-
-            kSdpaBwd1 = compile_kern_mil_w(gen_sdpa_bwd1(), (@{
-                @"@model_path/weights/mask.bin": @{@"offset":@0, @"data":get_mask_blob()},
-                @"@model_path/weights/wot.bin": @{@"offset":@0, @"data":build_blob_t(Wo,DIM,DIM)},
-            }), 4*DIM*SEQ*2, (DIM+2*SCORE_CH)*SEQ*2);
-
-            kQKVb = compile_kern_mil_w(gen_qkvb(), (@{
-                @"@model_path/weights/wqt.bin": @{@"offset":@0, @"data":build_blob_t(Wq,DIM,DIM)},
-                @"@model_path/weights/wkt.bin": @{@"offset":@0, @"data":build_blob_t(Wk,DIM,DIM)},
-                @"@model_path/weights/wvt.bin": @{@"offset":@0, @"data":build_blob_t(Wv,DIM,DIM)},
-            }), 3*DIM*SEQ*2, DIM*SEQ*2);
+            // Re-compile sdpaBwd2 if needed (after exec restart)
+            for (int L=0; L<NLAYERS; L++) {
+                if (!sdpaBwd2[L]) {
+                    sdpaBwd2[L] = compile_sdpa_bwd2();
+                    if (!sdpaBwd2[L]) { printf("sdpaBwd2 recompile failed\n"); return 1; }
+                }
+            }
 
             double cms = tb_ms(mach_absolute_time() - tc);
             total_compile_ms += cms;
-            if (!kFwdAttn||!kFwdFFN||!kFFNBwd||!kSdpaBwd1||!kQKVb) {
-                printf("Compile failed at step %d, restart\n", step);
-                g_compile_count = MAX_COMPILES; continue;
-            }
+            printf("  Compiled %d kernels in %.0fms                    \n", TOTAL_WEIGHT_KERNELS, cms);
+
+            // Zero gradient accumulators
+            for (int L=0; L<NLAYERS; L++) layer_grads_zero(&grads[L]);
+            memset(grms_final, 0, DIM*4);
+            memset(gembed, 0, (size_t)VOCAB*DIM*4);
 
-            // === Training loop ===
-            memset(gWq,0,wq_sz*4);memset(gWk,0,wq_sz*4);memset(gWv,0,wq_sz*4);memset(gWo,0,wo_sz*4);
-            memset(gW1,0,w1_sz*4);memset(gW2,0,w2_sz*4);memset(gW3,0,w3_sz*4);
-            memset(grms1,0,DIM*4);memset(grms2,0,DIM*4);
             int steps_batch = 0;
             uint64_t tt = mach_absolute_time();
+            double t_ane=0,t_io=0,t_elem=0,t_rms=0,t_cblas_wait=0,t_cls=0;
 
-            double t_ane=0,t_io=0,t_elem=0,t_rms=0,t_cblas_wait=0;
             for (int a=0; a<ACCUM_STEPS && step<total_steps; a++, step++) {
                 uint64_t t0,t1;
-                // ===== FORWARD =====
-                // Attention fwd (ANE does rmsnorm internally): x_in → o_out,Q,K,V,attn_out,xnorm
+                // Sample random position in token data
+                size_t max_pos = n_tokens - SEQ - 1;
+                size_t pos = (size_t)(drand48() * max_pos);
+                uint16_t *input_tokens = token_data + pos;
+                uint16_t *target_tokens = token_data + pos + 1;
+
+                // Embedding lookup → x_cur [DIM, SEQ] channel-first
                 t0=mach_absolute_time();
-                io_write_fp16(kFwdAttn->ioIn, x_in, DIM, SEQ);
-                t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
-                ane_eval(kFwdAttn);
-                t1=mach_absolute_time(); t_ane+=tb_ms(t1-t0); t0=t1;
-                // Wait for prev step's dW cblas before reading attn_out/xnorm
-                dispatch_group_wait(dw_grp, DISPATCH_TIME_FOREVER);
-                t1=mach_absolute_time(); t_cblas_wait+=tb_ms(t1-t0); t0=t1;
-                io_read_fp16(kFwdAttn->ioOut, o_out,    0,     DIM, SEQ);
-                io_read_fp16(kFwdAttn->ioOut, attn_out, 4*DIM, DIM, SEQ);
-                io_read_fp16(kFwdAttn->ioOut, xnorm,    5*DIM, DIM, SEQ);
-                t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
+                embed_lookup(x_cur, embed, input_tokens, DIM, SEQ);
+                t1=mach_absolute_time(); t_elem+=tb_ms(t1-t0);
 
-                for(int i=0;i<SEQ*DIM;i++) x2[i] = x_in[i] + o_out[i];
-                t1=mach_absolute_time(); t_elem+=tb_ms(t1-t0); t0=t1;
+                // ===== FORWARD (12 layers) =====
+                for (int L=0; L<NLAYERS; L++) {
+                    LayerActs *ac = &acts[L];
 
-                // FFN fwd (ANE does rmsnorm internally): x2 → ffn_out,h1,h3,silu_out,x2norm
-                io_write_fp16(kFwdFFN->ioIn, x2, DIM, SEQ);
-                t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
-                ane_eval(kFwdFFN);
-                t1=mach_absolute_time(); t_ane+=tb_ms(t1-t0); t0=t1;
-                io_read_fp16(kFwdFFN->ioOut, ffn_out,  0,              DIM,    SEQ);
-                io_read_fp16(kFwdFFN->ioOut, h1,       DIM,            HIDDEN, SEQ);
-                io_read_fp16(kFwdFFN->ioOut, h3,       DIM+HIDDEN,     HIDDEN, SEQ);
-                io_read_fp16(kFwdFFN->ioOut, silu_out, DIM+2*HIDDEN,   HIDDEN, SEQ);
-                io_read_fp16(kFwdFFN->ioOut, x2norm,   DIM+3*HIDDEN,   DIM,    SEQ);
-                t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
+                    // Save layer input for rmsnorm1 backward
+                    memcpy(ac->layer_in, x_cur, SEQ*DIM*4);
+                    // Attention forward: x_cur → o_out,Q,K,V,attn_out,xnorm
+                    t0=mach_absolute_time();
+                    dispatch_group_wait(dw_grp, DISPATCH_TIME_FOREVER);
+                    t1=mach_absolute_time(); t_cblas_wait+=tb_ms(t1-t0); t0=t1;
+                    io_write_fp16(kern[L].fwdAttn->ioIn, x_cur, DIM, SEQ);
+                    t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
+                    ane_eval(kern[L].fwdAttn);
+                    t1=mach_absolute_time(); t_ane+=tb_ms(t1-t0); t0=t1;
+                    io_read_fp16(kern[L].fwdAttn->ioOut, ac->o_out,    0,     DIM, SEQ);
+                    io_read_fp16(kern[L].fwdAttn->ioOut, ac->attn_out, 4*DIM, DIM, SEQ);
+                    io_read_fp16(kern[L].fwdAttn->ioOut, ac->xnorm,    5*DIM, DIM, SEQ);
+                    t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
 
-                // Residual + Loss
-                for(int i=0;i<SEQ*DIM;i++) y_out[i] = x2[i] + ffn_out[i];
-                float loss = 0;
-                for(int i=0;i<SEQ*DIM;i++){
-                    float d = y_out[i]-y_tgt[i]; loss += d*d;
-                    dy[i] = 2.0f*d/(SEQ*DIM);
+                    vDSP_vadd(x_cur, 1, ac->o_out, 1, ac->x2, 1, (vDSP_Length)(SEQ*DIM));
+                    t1=mach_absolute_time(); t_elem+=tb_ms(t1-t0); t0=t1;
+
+                    // FFN forward
+                    io_write_fp16(kern[L].fwdFFN->ioIn, ac->x2, DIM, SEQ);
+                    t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
+                    ane_eval(kern[L].fwdFFN);
+                    t1=mach_absolute_time(); t_ane+=tb_ms(t1-t0); t0=t1;
+                    io_read_fp16(kern[L].fwdFFN->ioOut, ac->ffn_out,  0,              DIM,    SEQ);
+                    io_read_fp16(kern[L].fwdFFN->ioOut, ac->h1,       DIM,            HIDDEN, SEQ);
+                    io_read_fp16(kern[L].fwdFFN->ioOut, ac->h3,       DIM+HIDDEN,     HIDDEN, SEQ);
+                    io_read_fp16(kern[L].fwdFFN->ioOut, ac->silu_out, DIM+2*HIDDEN,   HIDDEN, SEQ);
+                    io_read_fp16(kern[L].fwdFFN->ioOut, ac->x2norm,   DIM+3*HIDDEN,   DIM,    SEQ);
+                    t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
+
+                    vDSP_vadd(ac->x2, 1, ac->ffn_out, 1, x_cur, 1, (vDSP_Length)(SEQ*DIM));
+                    t1=mach_absolute_time(); t_elem+=tb_ms(t1-t0);
                 }
-                loss /= (SEQ*DIM);
+
+                // Final RMSNorm (CPU)
+                t0=mach_absolute_time();
+                rmsnorm(x_final, x_cur, rms_final, DIM, SEQ);
+                t1=mach_absolute_time(); t_rms+=tb_ms(t1-t0); t0=t1;
+
+                // Classifier: logits[VOCAB,SEQ] = embed[VOCAB,DIM] @ x_final[DIM,SEQ]
+                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                            VOCAB, SEQ, DIM, 1.0f,
+                            embed, DIM, x_final, SEQ, 0.0f, logits, SEQ);
+                t1=mach_absolute_time(); t_cls+=tb_ms(t1-t0); t0=t1;
+
+                // Cross-entropy loss
+                float loss = cross_entropy_loss(dlogits, logits, target_tokens, VOCAB, SEQ);
                 last_loss = loss;
-                memcpy(dffn, dy, SEQ*DIM*4);
                 t1=mach_absolute_time(); t_elem+=tb_ms(t1-t0); t0=t1;
 
                 // ===== BACKWARD =====
-                // FFN backward (ANE)
-                io_write_fp16_at(kFFNBwd->ioIn, 0, dffn, DIM, SEQ);
-                io_copy(kFFNBwd->ioIn, DIM, kFwdFFN->ioOut, DIM, 2*HIDDEN, SEQ);
-                t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
-                ane_eval(kFFNBwd);
-                t1=mach_absolute_time(); t_ane+=tb_ms(t1-t0); t0=t1;
-                io_read_fp16(kFFNBwd->ioOut, dx_ffn, 0,           DIM,    SEQ);
-                io_read_fp16(kFFNBwd->ioOut, dh1,    DIM,         HIDDEN, SEQ);
-                io_read_fp16(kFFNBwd->ioOut, dh3,    DIM+HIDDEN,  HIDDEN, SEQ);
-                t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
+                // dlogits already computed by cross_entropy_loss
 
-                // dW FFN async (overlaps with rmsnorm2_bwd + SDPA)
+                // Classifier backward: dx_final = embed^T @ dlogits, dembed += dlogits @ x_final^T
+                // dx_final[DIM,SEQ] = embed^T[DIM,VOCAB] @ dlogits[VOCAB,SEQ]
+                cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
+                            DIM, SEQ, VOCAB, 1.0f,
+                            embed, DIM, dlogits, SEQ, 0.0f, dy, SEQ);
+
+                // dembed[VOCAB,DIM] += dlogits[VOCAB,SEQ] @ x_final^T[SEQ,DIM]
                 dispatch_group_async(dw_grp, dw_q, ^{
-                    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, HIDDEN, SEQ,
-                                1.0f, dffn, SEQ, silu_out, SEQ, 1.0f, gW2, HIDDEN);
-                    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, HIDDEN, DIM, SEQ,
-                                1.0f, dh1, SEQ, x2norm, SEQ, 1.0f, gW1, DIM);
-                    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, HIDDEN, DIM, SEQ,
-                                1.0f, dh3, SEQ, x2norm, SEQ, 1.0f, gW3, DIM);
+                    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
+                                VOCAB, DIM, SEQ, 1.0f,
+                                dlogits, SEQ, x_final, SEQ, 1.0f, gembed, DIM);
                 });
 
-                // RMSNorm2 backward — runs in parallel with dW FFN
-                memset(dx2, 0, SEQ*DIM*4);
-                rmsnorm_bwd(dx2, grms2, dx_ffn, x2, rms2_w, DIM, SEQ);
-                for(int i=0;i<SEQ*DIM;i++) dx2[i] += dy[i];
-                t1=mach_absolute_time(); t_rms+=tb_ms(t1-t0); t0=t1;
+                // Final RMSNorm backward
+                float *dx_rms_final = (float*)calloc(SEQ*DIM, 4);
+                rmsnorm_bwd(dx_rms_final, grms_final, dy, x_cur, rms_final, DIM, SEQ);
+                memcpy(dy, dx_rms_final, SEQ*DIM*4);
+                free(dx_rms_final);
 
-                // dWo async (overlaps with SDPA backward)
-                memcpy(do_out_buf, dx2, SEQ*DIM*4);
-                dispatch_group_async(dw_grp, dw_q, ^{
-                    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, DIM, SEQ,
-                                1.0f, do_out_buf, SEQ, attn_out, SEQ, 1.0f, gWo, DIM);
-                });
+                // ===== BACKWARD (12 layers, reverse) =====
+                for (int L=NLAYERS-1; L>=0; L--) {
+                    LayerActs *ac = &acts[L];
+                    LayerGrads *gr = &grads[L];
 
-                // SDPA backward (ANE) — includes Wo^T conv
-                io_copy(kSdpaBwd1->ioIn, 0, kFwdAttn->ioOut, DIM, 3*DIM, SEQ);
-                io_write_fp16_at(kSdpaBwd1->ioIn, 3*DIM, dx2, DIM, SEQ);
-                t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
-                ane_eval(kSdpaBwd1);
-                t1=mach_absolute_time(); t_ane+=tb_ms(t1-t0); t0=t1;
-                io_copy(kSdpaBwd2->ioIn, 0, kSdpaBwd1->ioOut, DIM, 2*SCORE_CH, SEQ);
-                io_copy(kSdpaBwd2->ioIn, 2*SCORE_CH, kFwdAttn->ioOut, DIM, 2*DIM, SEQ);
-                t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
-                ane_eval(kSdpaBwd2);
-                t1=mach_absolute_time(); t_ane+=tb_ms(t1-t0); t0=t1;
+                    // dy is the gradient at the output of this layer
+                    // dffn = dy (residual connection: d(x2 + ffn) = dy for both)
+                    memcpy(dffn, dy, SEQ*DIM*4);
 
-                // Read dq,dk,dv — dW FFN+dWo still running async on serial queue
-                io_read_fp16(kSdpaBwd2->ioOut, dq, 0,   DIM, SEQ);
-                io_read_fp16(kSdpaBwd2->ioOut, dk, DIM,  DIM, SEQ);
-                io_read_fp16(kSdpaBwd1->ioOut, dv, 0,    DIM, SEQ);
-                t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
-                // dWq/dWk/dWv queues after dWo on serial queue — no wait needed
-                dispatch_group_async(dw_grp, dw_q, ^{
-                    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, DIM, SEQ,
-                                1.0f, dq, SEQ, xnorm, SEQ, 1.0f, gWq, DIM);
-                    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, DIM, SEQ,
-                                1.0f, dk, SEQ, xnorm, SEQ, 1.0f, gWk, DIM);
-                    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, DIM, SEQ,
-                                1.0f, dv, SEQ, xnorm, SEQ, 1.0f, gWv, DIM);
-                });
+                    // FFN backward (ANE)
+                    io_write_fp16_at(kern[L].ffnBwd->ioIn, 0, dffn, DIM, SEQ);
+                    io_copy(kern[L].ffnBwd->ioIn, DIM, kern[L].fwdFFN->ioOut, DIM, 2*HIDDEN, SEQ);
+                    ane_eval(kern[L].ffnBwd);
+                    io_read_fp16(kern[L].ffnBwd->ioOut, dx_ffn, 0,           DIM,    SEQ);
+                    io_read_fp16(kern[L].ffnBwd->ioOut, dh1,    DIM,         HIDDEN, SEQ);
+                    io_read_fp16(kern[L].ffnBwd->ioOut, dh3,    DIM+HIDDEN,  HIDDEN, SEQ);
 
-                // QKV backward (ANE) — dWq/dWk/dWv runs async
-                io_copy(kQKVb->ioIn, 0, kSdpaBwd2->ioOut, 0, 2*DIM, SEQ);
-                io_copy(kQKVb->ioIn, 2*DIM, kSdpaBwd1->ioOut, 0, DIM, SEQ);
-                t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
-                ane_eval(kQKVb);
-                t1=mach_absolute_time(); t_ane+=tb_ms(t1-t0); t0=t1;
-                io_read_fp16(kQKVb->ioOut, dx_attn, 0, DIM, SEQ);
-                t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
+                    // dW FFN async
+                    float *capt_dffn = (float*)malloc(SEQ*DIM*4); memcpy(capt_dffn, dffn, SEQ*DIM*4);
+                    float *capt_silu = (float*)malloc(SEQ*HIDDEN*4); memcpy(capt_silu, ac->silu_out, SEQ*HIDDEN*4);
+                    float *capt_dh1 = (float*)malloc(SEQ*HIDDEN*4); memcpy(capt_dh1, dh1, SEQ*HIDDEN*4);
+                    float *capt_dh3 = (float*)malloc(SEQ*HIDDEN*4); memcpy(capt_dh3, dh3, SEQ*HIDDEN*4);
+                    float *capt_x2n = (float*)malloc(SEQ*DIM*4); memcpy(capt_x2n, ac->x2norm, SEQ*DIM*4);
+                    dispatch_group_async(dw_grp, dw_q, ^{
+                        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, HIDDEN, SEQ,
+                                    1.0f, capt_dffn, SEQ, capt_silu, SEQ, 1.0f, gr->W2, HIDDEN);
+                        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, HIDDEN, DIM, SEQ,
+                                    1.0f, capt_dh1, SEQ, capt_x2n, SEQ, 1.0f, gr->W1, DIM);
+                        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, HIDDEN, DIM, SEQ,
+                                    1.0f, capt_dh3, SEQ, capt_x2n, SEQ, 1.0f, gr->W3, DIM);
+                        free(capt_dffn); free(capt_silu); free(capt_dh1); free(capt_dh3); free(capt_x2n);
+                    });
 
-                // RMSNorm1 backward (CPU) — doesn't touch cblas buffers
-                float *dx_rms = calloc(SEQ*DIM, 4);
-                rmsnorm_bwd(dx_rms, grms1, dx_attn, x_in, rms1_w, DIM, SEQ);
-                free(dx_rms);
-                t1=mach_absolute_time(); t_rms+=tb_ms(t1-t0);
+                    // RMSNorm2 backward
+                    memset(dx2, 0, SEQ*DIM*4);
+                    rmsnorm_bwd(dx2, gr->rms_ffn, dx_ffn, ac->x2, lw[L].rms_ffn, DIM, SEQ);
+                    // Add residual: dx2 += dy (from skip connection)
+                    for(int i=0;i<SEQ*DIM;i++) dx2[i] += dy[i];
+
+                    // dWo async
+                    memcpy(do_out_buf, dx2, SEQ*DIM*4);
+                    float *capt_do = (float*)malloc(SEQ*DIM*4); memcpy(capt_do, do_out_buf, SEQ*DIM*4);
+                    float *capt_attn = (float*)malloc(SEQ*DIM*4); memcpy(capt_attn, ac->attn_out, SEQ*DIM*4);
+                    dispatch_group_async(dw_grp, dw_q, ^{
+                        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, DIM, SEQ,
+                                    1.0f, capt_do, SEQ, capt_attn, SEQ, 1.0f, gr->Wo, DIM);
+                        free(capt_do); free(capt_attn);
+                    });
+
+                    // SDPA backward (ANE)
+                    io_copy(kern[L].sdpaBwd1->ioIn, 0, kern[L].fwdAttn->ioOut, DIM, 3*DIM, SEQ);
+                    io_write_fp16_at(kern[L].sdpaBwd1->ioIn, 3*DIM, dx2, DIM, SEQ);
+                    ane_eval(kern[L].sdpaBwd1);
+                    io_copy(sdpaBwd2[L]->ioIn, 0, kern[L].sdpaBwd1->ioOut, DIM, 2*SCORE_CH, SEQ);
+                    io_copy(sdpaBwd2[L]->ioIn, 2*SCORE_CH, kern[L].fwdAttn->ioOut, DIM, 2*DIM, SEQ);
+                    ane_eval(sdpaBwd2[L]);
+
+                    io_read_fp16(sdpaBwd2[L]->ioOut, dq, 0,   DIM, SEQ);
+                    io_read_fp16(sdpaBwd2[L]->ioOut, dk, DIM,  DIM, SEQ);
+                    io_read_fp16(kern[L].sdpaBwd1->ioOut, dv, 0, DIM, SEQ);
+
+                    // dWq/dWk/dWv async
+                    float *capt_dq = (float*)malloc(SEQ*DIM*4); memcpy(capt_dq, dq, SEQ*DIM*4);
+                    float *capt_dk = (float*)malloc(SEQ*DIM*4); memcpy(capt_dk, dk, SEQ*DIM*4);
+                    float *capt_dv = (float*)malloc(SEQ*DIM*4); memcpy(capt_dv, dv, SEQ*DIM*4);
+                    float *capt_xn = (float*)malloc(SEQ*DIM*4); memcpy(capt_xn, ac->xnorm, SEQ*DIM*4);
+                    dispatch_group_async(dw_grp, dw_q, ^{
+                        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, DIM, SEQ,
+                                    1.0f, capt_dq, SEQ, capt_xn, SEQ, 1.0f, gr->Wq, DIM);
+                        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, DIM, SEQ,
+                                    1.0f, capt_dk, SEQ, capt_xn, SEQ, 1.0f, gr->Wk, DIM);
+                        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, DIM, SEQ,
+                                    1.0f, capt_dv, SEQ, capt_xn, SEQ, 1.0f, gr->Wv, DIM);
+                        free(capt_dq); free(capt_dk); free(capt_dv); free(capt_xn);
+                    });
+
+                    // QKV backward (ANE)
+                    io_copy(kern[L].qkvBwd->ioIn, 0, sdpaBwd2[L]->ioOut, 0, 2*DIM, SEQ);
+                    io_copy(kern[L].qkvBwd->ioIn, 2*DIM, kern[L].sdpaBwd1->ioOut, 0, DIM, SEQ);
+                    ane_eval(kern[L].qkvBwd);
+                    io_read_fp16(kern[L].qkvBwd->ioOut, dx_attn, 0, DIM, SEQ);
+
+                    // RMSNorm1 backward (using saved layer input)
+                    float *dx_rms1 = (float*)calloc(SEQ*DIM, 4);
+                    rmsnorm_bwd(dx_rms1, gr->rms_att, dx_attn, ac->layer_in, lw[L].rms_att, DIM, SEQ);
+
+                    // Gradient handed to the previous layer (the next backward iteration).
+                    //
+                    // Forward structure of one layer, as computed above:
+                    //   x2  = layer_in + o_out      (attention residual)
+                    //   out = x2 + ffn_out          (FFN residual)
+                    //
+                    // Backward flow in this iteration:
+                    //   dy                          gradient at the layer output
+                    //   dffn = dy                   FFN branch of the second residual
+                    //   dffn → ffnBwd → dx_ffn, dh1, dh3
+                    //   dx_ffn → rmsnorm_bwd (rms_ffn) → dx2
+                    //   dx2 += dy                   skip branch of the second residual,
+                    //                               so dx2 = d(loss)/d(x2)
+                    //   dx2 → sdpaBwd1 (→ dv) → sdpaBwd2 (→ dq, dk)
+                    //   dq, dk, dv → qkvBwd → dx_attn
+                    //   dx_attn → rmsnorm_bwd (rms_att) → dx_rms1
+                    //
+                    // layer_in reaches the loss along two paths:
+                    //   1. through rmsnorm1 and attention, contributing dx_rms1;
+                    //   2. through the skip connection into x2, contributing dx2
+                    //      unchanged, because d(x2)/d(layer_in) = 1.
+                    //
+                    // Feeding dx2 into the SDPA backward kernels does not consume it:
+                    // that is the o_out branch of x2, whereas the term added below is
+                    // the skip branch of the same residual sum.
+                    //
+                    // So: dy for previous layer = dx_rms1 + dx2
+                    for(int i=0;i<SEQ*DIM;i++) dy[i] = dx_rms1[i] + dx2[i];
+                    free(dx_rms1);
+                }
+
+                // Embedding backward
+                dispatch_group_wait(dw_grp, DISPATCH_TIME_FOREVER);
+                embed_backward(gembed, dy, input_tokens, DIM, SEQ);
 
                 steps_batch++;
                 if (step % 10 == 0 || step == start_step)
-                    printf("step %-4d loss=%.6f\n", step, loss);
+                    printf("step %-4d loss=%.4f\n", step, loss);
             }
             double tms = tb_ms(mach_absolute_time() - tt);
             total_train_ms += tms;
             total_steps_done += steps_batch;
             total_batches++;
 
-            // Ensure all async dW finished before Adam
+            // Ensure all async dW finished
             dispatch_group_wait(dw_grp, DISPATCH_TIME_FOREVER);
-            // Adam update (scale gradients by 1/steps_batch for averaging)
+
+            // Adam update (scale gradients by 1/steps_batch)
             float gsc = 1.0f / steps_batch;
-            for(size_t i=0;i<wq_sz;i++){gWq[i]*=gsc;gWk[i]*=gsc;gWv[i]*=gsc;gWo[i]*=gsc;}
-            for(size_t i=0;i<w1_sz;i++) gW1[i]*=gsc;
-            for(size_t i=0;i<w2_sz;i++) gW2[i]*=gsc;
-            for(size_t i=0;i<w3_sz;i++) gW3[i]*=gsc;
-            for(int i=0;i<DIM;i++){grms1[i]*=gsc; grms2[i]*=gsc;}
             adam_t++;
-            adam_update(Wq, gWq, &aWq, adam_t, lr, adam_b1, adam_b2, adam_eps);
-            adam_update(Wk, gWk, &aWk, adam_t, lr, adam_b1, adam_b2, adam_eps);
-            adam_update(Wv, gWv, &aWv, adam_t, lr, adam_b1, adam_b2, adam_eps);
-            adam_update(Wo, gWo, &aWo, adam_t, lr, adam_b1, adam_b2, adam_eps);
-            adam_update(W1, gW1, &aW1, adam_t, lr, adam_b1, adam_b2, adam_eps);
-            adam_update(W2, gW2, &aW2, adam_t, lr, adam_b1, adam_b2, adam_eps);
-            adam_update(W3, gW3, &aW3, adam_t, lr, adam_b1, adam_b2, adam_eps);
-            adam_update(rms1_w, grms1, &arms1, adam_t, lr, adam_b1, adam_b2, adam_eps);
-            adam_update(rms2_w, grms2, &arms2, adam_t, lr, adam_b1, adam_b2, adam_eps);
+            for (int L=0; L<NLAYERS; L++) {
+                LayerGrads *g = &grads[L];
+                for(size_t i=0;i<WQ_SZ;i++){g->Wq[i]*=gsc;g->Wk[i]*=gsc;g->Wv[i]*=gsc;g->Wo[i]*=gsc;}
+                for(size_t i=0;i<W1_SZ;i++) g->W1[i]*=gsc;
+                for(size_t i=0;i<W2_SZ;i++) g->W2[i]*=gsc;
+                for(size_t i=0;i<W3_SZ;i++) g->W3[i]*=gsc;
+                for(int i=0;i<DIM;i++){g->rms_att[i]*=gsc; g->rms_ffn[i]*=gsc;}
+
+                adam_update(lw[L].Wq, g->Wq, &la[L].Wq, adam_t, lr, adam_b1, adam_b2, adam_eps);
+                adam_update(lw[L].Wk, g->Wk, &la[L].Wk, adam_t, lr, adam_b1, adam_b2, adam_eps);
+                adam_update(lw[L].Wv, g->Wv, &la[L].Wv, adam_t, lr, adam_b1, adam_b2, adam_eps);
+                adam_update(lw[L].Wo, g->Wo, &la[L].Wo, adam_t, lr, adam_b1, adam_b2, adam_eps);
+                adam_update(lw[L].W1, g->W1, &la[L].W1, adam_t, lr, adam_b1, adam_b2, adam_eps);
+                adam_update(lw[L].W2, g->W2, &la[L].W2, adam_t, lr, adam_b1, adam_b2, adam_eps);
+                adam_update(lw[L].W3, g->W3, &la[L].W3, adam_t, lr, adam_b1, adam_b2, adam_eps);
+                adam_update(lw[L].rms_att, g->rms_att, &la[L].rms_att, adam_t, lr, adam_b1, adam_b2, adam_eps);
+                adam_update(lw[L].rms_ffn, g->rms_ffn, &la[L].rms_ffn, adam_t, lr, adam_b1, adam_b2, adam_eps);
+            }
+            for(int i=0;i<DIM;i++) grms_final[i]*=gsc;
+            adam_update(rms_final, grms_final, &arms_final, adam_t, lr, adam_b1, adam_b2, adam_eps);
+            // Scale and update embed
+            for(size_t i=0;i<(size_t)VOCAB*DIM;i++) gembed[i]*=gsc;
+            adam_update(embed, gembed, &aembed, adam_t, lr, adam_b1, adam_b2, adam_eps);
 
             printf("  [batch %d: compile=%.0fms train=%.1fms (%.1fms/step) compiles=%d]\n",
                    steps_batch, cms, tms, tms/steps_batch, g_compile_count);
-            printf("    ane=%.1f io=%.1f elem=%.1f rms=%.1f cblas_wait=%.1f ms/step\n",
-                   t_ane/steps_batch, t_io/steps_batch, t_elem/steps_batch,
+            printf("    ane=%.1f io=%.1f cls=%.1f elem=%.1f rms=%.1f cblas_wait=%.1f ms/step\n",
+                   t_ane/steps_batch, t_io/steps_batch, t_cls/steps_batch, t_elem/steps_batch,
                    t_rms/steps_batch, t_cblas_wait/steps_batch);
         }
 
-        // === Efficiency Report ===
+        // Efficiency report
         double wall = tb_ms(mach_absolute_time() - t_wall_start);
         total_compile_ms += cum_compile; total_train_ms += cum_train;
         wall += cum_wall; total_steps_done += cum_steps; total_batches += cum_batches;
-        double fwd_flops = 4.0*2*DIM*DIM*SEQ + 2.0*2*DIM*HIDDEN*SEQ + 2.0*HIDDEN*DIM*SEQ;
-        double bwd_dx_flops = fwd_flops;
-        double sdpa_flops = 2.0*HEADS*5*SEQ*SEQ*HD;
-        double ane_flops = (fwd_flops + bwd_dx_flops + sdpa_flops) * total_steps_done;
-        double total_flops = (fwd_flops*3 + sdpa_flops) * total_steps_done; // fwd+bwd_dx+bwd_dw+sdpa
-
+        double fwd_flops = NLAYERS * (4.0*2*DIM*DIM*SEQ + 2.0*2*DIM*HIDDEN*SEQ + 2.0*HIDDEN*DIM*SEQ);
+        double sdpa_flops = NLAYERS * 2.0*HEADS*5*SEQ*SEQ*HD;
+        double cls_flops = 2.0*VOCAB*DIM*SEQ;
+        double total_flops = (fwd_flops*3 + sdpa_flops + cls_flops*3) * total_steps_done;
+        double ane_flops = (fwd_flops*2 + sdpa_flops) * total_steps_done;
         printf("\n=== Efficiency Report ===\n");
         printf("Total steps:     %d\n", total_steps_done);
         printf("Wall time:       %.0f ms (%.1f s)\n", wall, wall/1000);
         printf("Compile time:    %.0f ms (%.1f%%)\n", total_compile_ms, 100*total_compile_ms/wall);
         printf("Train time:      %.0f ms (%.1f%%)\n", total_train_ms, 100*total_train_ms/wall);
-        printf("Avg compile:     %.0f ms per batch (5 kernels)\n", total_compile_ms/total_batches);
         printf("Avg train:       %.1f ms/step\n", total_train_ms/total_steps_done);
         printf("ANE TFLOPS:      %.2f sustained\n", ane_flops / (total_train_ms * 1e9));
         printf("Total TFLOPS:    %.2f (ANE+CPU)\n", total_flops / (total_train_ms * 1e9));
         printf("ANE utilization: %.1f%% of 15.8 TFLOPS\n", 100*ane_flops/(total_train_ms*1e9)/15.8);
-        printf("Params:          %.2fM  Weights: %.1fMB FP16\n", total_params/1e6, total_params*2.0/1e6);
 
         // Cleanup
-        free_kern(kFwdAttn);free_kern(kFwdFFN);free_kern(kFFNBwd);
-        free_kern(kSdpaBwd1);free_kern(kSdpaBwd2);free_kern(kQKVb);
-        free(Wq);free(Wk);free(Wv);free(Wo);free(W1);free(W2);free(W3);
-        free(rms1_w);free(rms2_w);free(x_in);free(y_tgt);
-        free(xnorm);free(Q);free(K);free(V);free(attn_out);free(o_out);
-        free(x2);free(x2norm);free(h1);free(h3);free(silu_out);free(ffn_out);free(y_out);
-        free(dy);free(dffn);free(dh1);free(dh3);free(dx_ffn);free(dx2);
-        free(do_out_buf);free(dattn);free(dq);free(dk);free(dv);free(dx_attn);
-        free(probs_flat);free(dp_flat);
-        free(gWq);free(gWk);free(gWv);free(gWo);free(gW1);free(gW2);free(gW3);
-        free(grms1);free(grms2);
-        adam_free(&aWq);adam_free(&aWk);adam_free(&aWv);adam_free(&aWo);
-        adam_free(&aW1);adam_free(&aW2);adam_free(&aW3);adam_free(&arms1);adam_free(&arms2);
-        unlink(CKPT_PATH);
+        for (int L=0; L<NLAYERS; L++) {
+            free_layer_kernels(&kern[L]);
+            free_kern(sdpaBwd2[L]);
+            layer_weights_free(&lw[L]);
+            layer_adam_free(&la[L]);
+            layer_acts_free(&acts[L]);
+            layer_grads_free(&grads[L]);
+        }
+        munmap(token_data, data_len);
+        close(data_fd);
+        free(rms_final); free(embed); free(grms_final); free(gembed);
+        adam_free(&arms_final); adam_free(&aembed);
+        free(dy); free(dffn); free(dh1); free(dh3); free(dx_ffn); free(dx2);
+        free(do_out_buf); free(dq); free(dk); free(dv); free(dx_attn);
+        free(x_cur); free(x_final); free(logits); free(dlogits);
     }
     return 0;
 }