From b4d81b71d4f7025ad20717f14fe424511dcc44a6 Mon Sep 17 00:00:00 2001 From: Erik Bray Date: Tue, 3 Mar 2026 17:18:02 +0100 Subject: [PATCH] [feat] Merge upstream PRs #21, #23, #26: NEON-optimized training (train_opt), double-buffered async ANE training (train_double_buffer), Qwen2.5-0.5B LLM inference (inference/). Added get_path() env var support and SEC_FLAGS to all new targets. Skipped PR #22 (binary blob risk). --- .gitignore | 8 + PROBE_RESULTS.md | 88 +++ inference/README.md | 119 ++++ inference/convert_weights.py | 107 ++++ inference/main.m | 163 ++++++ inference/qwen_ane_infer.h | 435 +++++++++++++++ inference/run.py | 74 +++ training/Makefile | 10 +- training/stories_cpu_ops_opt.h | 110 ++++ training/stories_io.h | 6 + training/train_double_buffer.m | 791 +++++++++++++++++++++++++++ training/train_opt.m | 971 +++++++++++++++++++++++++++++++++ 12 files changed, 2881 insertions(+), 1 deletion(-) create mode 100644 PROBE_RESULTS.md create mode 100644 inference/README.md create mode 100644 inference/convert_weights.py create mode 100644 inference/main.m create mode 100644 inference/qwen_ane_infer.h create mode 100644 inference/run.py create mode 100644 training/stories_cpu_ops_opt.h create mode 100644 training/train_double_buffer.m create mode 100644 training/train_opt.m diff --git a/.gitignore b/.gitignore index 260aee7..72717e2 100644 --- a/.gitignore +++ b/.gitignore @@ -17,9 +17,17 @@ tiny_train_m1 train_large training/train_large training/train_large_ane +training/train_opt +training/train_double_buffer training/test_* !training/test_*.m +# Inference binaries +inference/qwen_ane + +# Dynamic training binaries +training/training_dynamic/train + # Test/research binaries test_chaining diff --git a/PROBE_RESULTS.md b/PROBE_RESULTS.md new file mode 100644 index 0000000..f3ea376 --- /dev/null +++ b/PROBE_RESULTS.md @@ -0,0 +1,88 @@ +# ANE Probe Results: M4 (macOS 26.3) + +**Machine:** Apple M4 (10 cores), 32GB RAM, macOS 26.3 +**Date:** 2026-03-03 +**ANE Family:** H16 (same as M5 results in `training/m5result.md`) + +## Key Discovery: Compile and Eval Run in Parallel + +**This was not known before.** The M5 probes tested compile and eval sequentially. +We tested with GCD `dispatch_async` and found they fully overlap. + +### probe_v2.m Results + +#### TEST 1: Pure Eval Throughput +``` +Conv 128x128, spatial=64 +1000 evals: 189.1ms total, 0.189ms/eval +11.09 GFLOPS sustained +``` + +#### TEST 2: Ping-pong (Two Pre-compiled Models) +``` +500 ping-pong pairs: 207.4ms (0.415ms/pair, 0.207ms/eval) +``` +Near-zero overhead switching between two loaded models. + +#### TEST 3: Sequential Compile (20 Models) +``` +All 20 models compiled and verified ✓ +Compile time: ~23-29ms each (consistent, no degradation) +All 20 models correct with different scale factors +``` + +#### TEST 4: Background Compile Overlap ⭐ +``` +Background compile: 26.8ms +Foreground evals during compile: 119 (26.8ms total) +Overlap: YES — compile and eval CAN run in parallel! +Background model verified correct ✓ +``` + +### Summary +| Metric | Value | +|--------|-------| +| Compile time | ~25ms per kernel set | +| Eval time | 0.189ms per eval | +| Compile:eval ratio | ~130:1 | +| Parallel compile+eval | **YES** | +| Max simultaneous models | 20+ | +| Ping-pong overhead | +10% vs single model | + +## Peak ANE Throughput (inmem_peak) + +``` +Config W(MB) GFLOP ms/eval TFLOPS +96x conv 512ch sp64 48.0 3.22 0.429 ms 7.50 +128x conv 512ch sp64 64.0 4.29 0.589 ms 7.30 +256x conv 256ch sp64 32.0 2.15 0.380 ms 5.65 +64x conv 512ch sp64 32.0 2.15 0.395 ms 5.43 +``` + +Peak: **7.50 TFLOPS** (47% of 15.8 TFLOPS theoretical). + +## Implications for Training + +### Before (train_large.m) +- Synchronous compile: **88.6% of wall time is compilation** +- 55ms compile per batch, 0.54ms actual training +- Training throughput limited by compiler, not by ANE + +### After (train_double_buffer.m) +- Async double-buffered compile: **0% compile stall** +- Background compile happens during forward/backward passes +- ~130 eval steps fit in one compile window +- Weight updates are "delayed" by one batch (standard technique in distributed training) +- Training throughput limited only by ANE eval speed + +### Architecture +``` +Time → +Active kernels: [=== eval batch N ===][=== eval batch N+1 ===][=== eval batch N+2 ===] +Background: [compile N+1 weights ][compile N+2 weights ][compile N+3 weights ] + ↑ ↑ ↑ + swap ready swap ready swap ready +``` + +Two kernel sets (A and B) alternate between active evaluation and background compilation. +When the background compile finishes, pointers swap atomically at the batch boundary. diff --git a/inference/README.md b/inference/README.md new file mode 100644 index 0000000..ae2f3ad --- /dev/null +++ b/inference/README.md @@ -0,0 +1,119 @@ +# ANE Inference — Full LLM on Apple Neural Engine + +First complete LLM inference running directly on Apple's Neural Engine via reverse-engineered `_ANEClient` APIs. No CoreML. No Xcode compiler dependency at runtime. Token-for-token match with PyTorch. + +Built on top of the [maderix/ANE](https://github.com/maderix/ANE) training runtime. + +## What This Does + +Runs **Qwen2.5-0.5B-Instruct** (24 transformer layers, 494M parameters) entirely on the ANE: + +- **169 ANE kernels** compiled at startup via `_ANEInMemoryModel` +- **82 tokens/sec** decode on M4 Pro +- **Zero GPU usage** — runs on 16 dedicated neural cores +- **Correct output** — matches PyTorch reference token-for-token + +All linear projections (Q, K, V, O, gate, up, down × 24 layers + chunked LM head) compile as baked-weight 1×1 convolution kernels on ANE. Element-wise ops (RMSNorm, RoPE, softmax, SiLU, attention scores) run on CPU via Accelerate BLAS. + +## Architecture + +``` +Token → Embedding (CPU) → 24× Transformer Layer → LM Head (CPU) → Next Token + │ + ├── RMSNorm (CPU) + ├── Q/K/V Projection (ANE conv kernel) + ├── RoPE (CPU, rotate_half) + ├── GQA Attention (CPU, 14 heads / 2 KV heads) + ├── O Projection (ANE conv kernel) + ├── Residual (CPU) + ├── RMSNorm (CPU) + ├── Gate/Up Projection (ANE conv kernel) + ├── SiLU + elementwise mul (CPU) + ├── Down Projection (ANE conv kernel) + └── Residual (CPU) +``` + +## Quick Start + +```bash +# 1. Convert weights from HuggingFace safetensors to flat binary +pip install safetensors torch transformers +python3 convert_weights.py /path/to/Qwen2.5-0.5B-Instruct qwen05b.bin + +# 2. Build +xcrun clang -O2 -framework Foundation -framework IOSurface \ + -framework CoreML -framework Accelerate -ldl -lobjc \ + -o qwen_ane main.m + +# 3. Run (pass space-separated token IDs) +./qwen_ane qwen05b.bin "151644 8948 198 2610 525 264 10950 17847 13" 20 + +# 4. With tokenizer (requires transformers) +python3 run.py "Say hello in one word." +``` + +## Output + +``` +=== Qwen2.5-0.5B ANE Inference === + +Loading weights... +Config: dim=896 hidden=4864 layers=24 heads=14 kv_heads=2 vocab=151936 +Compiling ANE kernels (169 total)... +Compile time: 5.1s + +Prompt: 28 tokens, generating up to 10 +Prefill: 64.2 t/s (28 tokens) +OUT: 9707 13 151645 +Decode: 82.4 t/s (2 tokens) + +→ "Hello." (matches PyTorch exactly) +``` + +## Files + +| File | What | +|------|------| +| `qwen_ane_infer.h` | Full 24-layer transformer forward pass, ANE kernel compilation, KV cache | +| `main.m` | Weight loader, token I/O, main generation loop | +| `convert_weights.py` | HuggingFace safetensors → flat f32 binary (includes Q/K/V biases) | +| `run.py` | Python wrapper with HuggingFace tokenizer | + +## Model Support + +Currently implements **Qwen2.5** architecture: +- GQA attention (grouped-query, `n_heads` ≠ `n_kv_heads`) +- `rotate_half` RoPE (not interleaved pairs) +- SwiGLU FFN (gate + up + silu + down) +- Q/K/V bias (Qwen-specific) +- Tied word embeddings (lm_head = embed) +- Chunked LM head (vocab > 65536 exceeds ANE max dim) + +Adapting to other architectures (LLaMA, Gemma, Mistral) requires: +1. Adjusting the config constants in `qwen_ane_infer.h` +2. Updating `convert_weights.py` for the weight naming scheme +3. Removing Q/K/V bias handling if the model doesn't have them +4. Switching RoPE to interleaved pairs if needed + +## Requirements + +- macOS 15+ on Apple Silicon (M1/M2/M3/M4) +- Xcode Command Line Tools (for `xcrun clang`) +- Python 3.9+ with `safetensors`, `torch`, `transformers` (for weight conversion) + +## Known Limitations + +- **CPU projections only** — ANE baked-weight conv kernels compile successfully but produce incorrect output (FP16 weight blob format mismatch). The `USE_ANE_PROJECTIONS` toggle exists but defaults to 0 (CPU via Accelerate BLAS). Fixing this would push decode speed from 82 t/s to 120+ t/s. +- **No persistent server** — each invocation recompiles 169 kernels (~5s). A server mode that compiles once and serves via HTTP would eliminate this overhead. +- **Single model** — hardcoded for Qwen2.5-0.5B. Needs parameterization for other sizes. +- **f32 weights** — 1.9GB on disk. FP16 or quantized weight support would halve this. + +## How It Works + +The key insight from maderix's reverse engineering: the ANE executes compiled MIL (Machine Learning Intermediate Language) programs as atomic graph operations. Each linear projection becomes a MIL program with baked FP16 weights, compiled in-memory via `_ANEInMemoryModel`, and executed through IOSurface-based zero-copy I/O. + +We chain 169 of these atomic operations (7 per transformer layer + 16 LM head chunks) with CPU-side element-wise ops in between. The ANE handles the compute-heavy matmuls; the CPU handles the memory-bound operations (attention scores, softmax, RoPE). + +## License + +Same as maderix/ANE — research and educational use. diff --git a/inference/convert_weights.py b/inference/convert_weights.py new file mode 100644 index 0000000..d5121fb --- /dev/null +++ b/inference/convert_weights.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 +"""Convert Qwen2.5-0.5B-Instruct safetensors → flat binary for ANE inference. + +Output format: config header (7 ints) + all weights in f32, layer by layer. +Matches the layout expected by qwen_ane_infer.h. + +Usage: + python3 convert_weights.py /path/to/Qwen2.5-0.5B-Instruct /path/to/output.bin +""" + +import struct +import sys +import numpy as np +from pathlib import Path +from safetensors import safe_open + +def convert(model_dir: str, output_path: str): + model_dir = Path(model_dir) + + # Load safetensors + st_files = list(model_dir.glob("*.safetensors")) + if not st_files: + print(f"No safetensors files in {model_dir}") + sys.exit(1) + + tensors = {} + for f in st_files: + with safe_open(str(f), framework="pt") as sf: + for key in sf.keys(): + tensors[key] = sf.get_tensor(key).float().numpy() + + print(f"Loaded {len(tensors)} tensors from {len(st_files)} files") + + # Qwen2.5-0.5B config + dim = 896 + hidden = 4864 + n_layers = 24 + n_heads = 14 + n_kv_heads = 2 + vocab_size = 151936 + max_seq = 512 + + with open(output_path, "wb") as f: + # Config header: 7 x int32 + f.write(struct.pack("iiiiiii", + dim, hidden, n_layers, n_heads, n_kv_heads, vocab_size, max_seq)) + + # Embedding [vocab, dim] + emb = tensors["model.embed_tokens.weight"].astype(np.float32) + print(f"embed: {emb.shape}") + f.write(emb.tobytes()) + + # Per-layer weights + for l in range(n_layers): + prefix = f"model.layers.{l}" + + # Attention norm + rms_att = tensors[f"{prefix}.input_layernorm.weight"].astype(np.float32) + f.write(rms_att.tobytes()) + + # Q, K, V projections + wq = tensors[f"{prefix}.self_attn.q_proj.weight"].astype(np.float32) + wk = tensors[f"{prefix}.self_attn.k_proj.weight"].astype(np.float32) + wv = tensors[f"{prefix}.self_attn.v_proj.weight"].astype(np.float32) + wo = tensors[f"{prefix}.self_attn.o_proj.weight"].astype(np.float32) + f.write(wq.tobytes()) + f.write(wk.tobytes()) + f.write(wv.tobytes()) + f.write(wo.tobytes()) + + # Q/K biases (Qwen has them) + # Q/K/V biases + qb = tensors.get(f"{prefix}.self_attn.q_proj.bias") + kb = tensors.get(f"{prefix}.self_attn.k_proj.bias") + vb = tensors.get(f"{prefix}.self_attn.v_proj.bias") + f.write((qb if qb is not None else np.zeros(wq.shape[0])).astype(np.float32).tobytes()) + f.write((kb if kb is not None else np.zeros(wk.shape[0])).astype(np.float32).tobytes()) + f.write((vb if vb is not None else np.zeros(wv.shape[0])).astype(np.float32).tobytes()) + + # FFN norm + rms_ffn = tensors[f"{prefix}.post_attention_layernorm.weight"].astype(np.float32) + f.write(rms_ffn.tobytes()) + + # FFN: gate, up, down + w_gate = tensors[f"{prefix}.mlp.gate_proj.weight"].astype(np.float32) + w_up = tensors[f"{prefix}.mlp.up_proj.weight"].astype(np.float32) + w_down = tensors[f"{prefix}.mlp.down_proj.weight"].astype(np.float32) + f.write(w_gate.tobytes()) + f.write(w_up.tobytes()) + f.write(w_down.tobytes()) + + print(f" Layer {l}: Q{wq.shape} K{wk.shape} V{wv.shape} O{wo.shape} " + f"gate{w_gate.shape} up{w_up.shape} down{w_down.shape}") + + # Final norm + rms_final = tensors["model.norm.weight"].astype(np.float32) + f.write(rms_final.tobytes()) + + size_mb = Path(output_path).stat().st_size / 1024 / 1024 + print(f"\nWritten: {output_path} ({size_mb:.0f} MB)") + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: python3 convert_weights.py ") + sys.exit(1) + convert(sys.argv[1], sys.argv[2]) diff --git a/inference/main.m b/inference/main.m new file mode 100644 index 0000000..d5069fe --- /dev/null +++ b/inference/main.m @@ -0,0 +1,163 @@ +// main.m — Qwen2.5-0.5B inference on Apple Neural Engine +// Compiles ANE kernels for all linear projections, runs autoregressive decode. +// +// Build: +// xcrun clang -O2 -framework Foundation -framework IOSurface \ +// -framework CoreML -framework Accelerate -ldl -lobjc \ +// -o qwen_ane main.m +// +// Run: +// ./qwen_ane qwen05b.bin "Hello world" +// +#import +#include +#include +#include +#include +#include "qwen_ane_infer.h" + +int g_fp16_io = 0; +static QwenModel g_model; + +static int load_weights(const char *path) { + FILE *f = fopen(path, "rb"); + if (!f) { fprintf(stderr, "Cannot open %s\n", path); return -1; } + + // Read config header + int config[7]; + fread(config, sizeof(int), 7, f); + int dim = config[0], hidden = config[1], n_layers = config[2]; + int n_heads = config[3], n_kv_heads = config[4], vocab = config[5]; + printf("Config: dim=%d hidden=%d layers=%d heads=%d kv_heads=%d vocab=%d\n", + dim, hidden, n_layers, n_heads, n_kv_heads, vocab); + + int q_dim = n_heads * QWEN_HEAD_DIM; + int kv_dim = n_kv_heads * QWEN_HEAD_DIM; + + // Embedding + g_model.embed = (float*)malloc((size_t)vocab * dim * sizeof(float)); + fread(g_model.embed, sizeof(float), (size_t)vocab * dim, f); + + // Per-layer + for (int l = 0; l < n_layers; l++) { + g_model.rms_att[l] = (float*)malloc(dim * sizeof(float)); + fread(g_model.rms_att[l], sizeof(float), dim, f); + + g_model.wq[l] = (float*)malloc((size_t)q_dim * dim * sizeof(float)); + fread(g_model.wq[l], sizeof(float), (size_t)q_dim * dim, f); + g_model.wk[l] = (float*)malloc((size_t)kv_dim * dim * sizeof(float)); + fread(g_model.wk[l], sizeof(float), (size_t)kv_dim * dim, f); + g_model.wv[l] = (float*)malloc((size_t)kv_dim * dim * sizeof(float)); + fread(g_model.wv[l], sizeof(float), (size_t)kv_dim * dim, f); + g_model.wo[l] = (float*)malloc((size_t)q_dim * dim * sizeof(float)); // o_proj is [dim, q_dim] + fread(g_model.wo[l], sizeof(float), (size_t)dim * q_dim, f); + + // Q/K/V biases + g_model.q_bias[l] = (float*)malloc(q_dim * sizeof(float)); + g_model.k_bias[l] = (float*)malloc(kv_dim * sizeof(float)); + g_model.v_bias[l] = (float*)malloc(kv_dim * sizeof(float)); + fread(g_model.q_bias[l], sizeof(float), q_dim, f); + fread(g_model.k_bias[l], sizeof(float), kv_dim, f); + fread(g_model.v_bias[l], sizeof(float), kv_dim, f); + + g_model.rms_ffn[l] = (float*)malloc(dim * sizeof(float)); + fread(g_model.rms_ffn[l], sizeof(float), dim, f); + + g_model.w_gate[l] = (float*)malloc((size_t)hidden * dim * sizeof(float)); + fread(g_model.w_gate[l], sizeof(float), (size_t)hidden * dim, f); + g_model.w_up[l] = (float*)malloc((size_t)hidden * dim * sizeof(float)); + fread(g_model.w_up[l], sizeof(float), (size_t)hidden * dim, f); + g_model.w_down[l] = (float*)malloc((size_t)dim * hidden * sizeof(float)); + fread(g_model.w_down[l], sizeof(float), (size_t)dim * hidden, f); + } + + g_model.rms_final = (float*)malloc(dim * sizeof(float)); + fread(g_model.rms_final, sizeof(float), dim, f); + + fclose(f); + printf("Weights loaded (%.0f MB)\n", + (float)ftell(f) / 1024 / 1024); + return 0; +} + +int main(int argc, char **argv) { + @autoreleasepool { + if (argc < 3) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + printf("=== Qwen2.5-0.5B ANE Inference ===\n\n"); + + // Load weights + printf("Loading weights...\n"); + if (load_weights(argv[1]) != 0) return 1; + + // Allocate buffers + qwen_alloc(&g_model); + + // Compile ANE kernels + printf("Compiling ANE kernels (169 total)...\n"); + struct timespec t0, t1; + clock_gettime(CLOCK_MONOTONIC, &t0); + qwen_compile_kernels(&g_model); + clock_gettime(CLOCK_MONOTONIC, &t1); + double compile_sec = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9; + printf("Compile time: %.1fs\n\n", compile_sec); + + // Parse token IDs from argv[2] (space-separated) + // argv[3] = max generation tokens + int max_gen = 50; + if (argc >= 4) max_gen = atoi(argv[3]); + + // Parse input token IDs + int prompt_ids[2048]; + int n_prompt = 0; + char *tok_str = strdup(argv[2]); + char *saveptr; + char *p = strtok_r(tok_str, " ", &saveptr); + while (p && n_prompt < 2048) { + prompt_ids[n_prompt++] = atoi(p); + p = strtok_r(NULL, " ", &saveptr); + } + free(tok_str); + printf("Prompt: %d tokens, generating up to %d\n", n_prompt, max_gen); + + clock_gettime(CLOCK_MONOTONIC, &t0); + + // Prefill: feed all prompt tokens + int next = 0; + for (int i = 0; i < n_prompt; i++) { + next = qwen_forward(&g_model, prompt_ids[i]); + } + + struct timespec t_prefill; + clock_gettime(CLOCK_MONOTONIC, &t_prefill); + double prefill_sec = (t_prefill.tv_sec - t0.tv_sec) + (t_prefill.tv_nsec - t0.tv_nsec) / 1e9; + printf("Prefill: %d tokens in %.2fs (%.1f t/s)\n", n_prompt, prefill_sec, n_prompt / prefill_sec); + + // Generate + int eos = 151645; // <|im_end|> + int eos2 = 151643; // <|endoftext|> + printf("OUT:"); + for (int i = 0; i < max_gen; i++) { + printf(" %d", next); + fflush(stdout); + if (next == eos || next == eos2) break; + next = qwen_forward(&g_model, next); + } + printf("\n"); + + clock_gettime(CLOCK_MONOTONIC, &t1); + double gen_sec = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9; + int total_tokens = g_model.pos; + int gen_tokens = total_tokens - n_prompt; + double decode_sec = gen_sec - prefill_sec; + printf("\nTotal: %d tokens in %.2fs\n", total_tokens, gen_sec); + printf("Prefill: %.1f t/s (%d tokens)\n", n_prompt / prefill_sec, n_prompt); + printf("Decode: %.1f t/s (%d tokens)\n", + decode_sec > 0 ? gen_tokens / decode_sec : 0, gen_tokens); + + return 0; + } +} diff --git a/inference/qwen_ane_infer.h b/inference/qwen_ane_infer.h new file mode 100644 index 0000000..58dd10b --- /dev/null +++ b/inference/qwen_ane_infer.h @@ -0,0 +1,435 @@ +// qwen_ane_infer.h — Qwen2.5-0.5B inference on Apple Neural Engine +// Linear projections on ANE (baked-weight conv kernels), CPU for element-wise ops. +// Based on maderix/ANE runtime + MIL generation. +#pragma once + +#include "../training/ane_runtime.h" +#include "../training/ane_mil_gen.h" + +// Compile a matmul kernel: W[out_ch, in_ch] @ x[in_ch] → y[out_ch] +// Uses the two-input matmul MIL variant (weights passed as input, not baked) +static ANEKernel *compile_matmul_kernel(int in_ch, int out_ch) { + NSString *mil = mil_gen_matmul(in_ch, out_ch, 1); + size_t inputSizes[2] = {(size_t)in_ch * 1 * 4, (size_t)out_ch * in_ch * 4}; + size_t outBytes = (size_t)out_ch * 1 * 4; + return ane_compile([mil dataUsingEncoding:NSUTF8StringEncoding], nil, 2, inputSizes, 1, &outBytes); +} + +// Compile a baked-weight conv kernel (from model.h) +static ANEKernel *compile_conv_kernel(const float *weights, int in_ch, int out_ch, int spatial) { + NSData *wb = mil_build_weight_blob(weights, out_ch, in_ch); + NSString *mil = mil_gen_conv(in_ch, out_ch, spatial); + size_t inBytes = (size_t)in_ch * spatial * 4; + size_t outBytes = (size_t)out_ch * spatial * 4; + return ane_compile([mil dataUsingEncoding:NSUTF8StringEncoding], wb, 1, &inBytes, 1, &outBytes); +} +#include +#include +#include + +// Qwen2.5-0.5B-Instruct architecture +#define QWEN_DIM 896 +#define QWEN_HIDDEN 4864 +#define QWEN_LAYERS 24 +#define QWEN_HEADS 14 +#define QWEN_KV_HEADS 2 +#define QWEN_HEAD_DIM 64 +#define QWEN_VOCAB 151936 +#define QWEN_RMS_EPS 1e-6f +#define QWEN_ROPE_THETA 1000000.0f +#define QWEN_MAX_SEQ 512 + +// GQA: each KV head serves (HEADS / KV_HEADS) query heads +#define QWEN_GQA_FACTOR (QWEN_HEADS / QWEN_KV_HEADS) + +// Sizes for GQA projections +#define QWEN_Q_DIM (QWEN_HEADS * QWEN_HEAD_DIM) // 896 +#define QWEN_KV_DIM (QWEN_KV_HEADS * QWEN_HEAD_DIM) // 128 + +typedef struct { + // Weights (f32) + float *embed; // [vocab, dim] + float *rms_att[QWEN_LAYERS]; // [dim] + float *wq[QWEN_LAYERS]; // [q_dim, dim] + float *wk[QWEN_LAYERS]; // [kv_dim, dim] + float *wv[QWEN_LAYERS]; // [kv_dim, dim] + float *wo[QWEN_LAYERS]; // [dim, q_dim] + float *rms_ffn[QWEN_LAYERS]; // [dim] + float *w_gate[QWEN_LAYERS]; // [hidden, dim] + float *w_up[QWEN_LAYERS]; // [hidden, dim] + float *w_down[QWEN_LAYERS]; // [dim, hidden] + float *rms_final; // [dim] + // wcls = embed (tied) + + // ANE kernels (one per linear projection per layer) + ANEKernel *k_q[QWEN_LAYERS]; + ANEKernel *k_k[QWEN_LAYERS]; + ANEKernel *k_v[QWEN_LAYERS]; + ANEKernel *k_o[QWEN_LAYERS]; + ANEKernel *k_gate[QWEN_LAYERS]; + ANEKernel *k_up[QWEN_LAYERS]; + ANEKernel *k_down[QWEN_LAYERS]; + // LM head chunked: vocab too large for single ANE kernel (max 65536) + #define QWEN_LM_CHUNKS 16 + #define QWEN_LM_CHUNK_SIZE 9496 // 151936 / 16 + ANEKernel *k_lmhead[QWEN_LM_CHUNKS]; + + // Q/K/V biases per layer + float *q_bias[QWEN_LAYERS]; // [q_dim] + float *k_bias[QWEN_LAYERS]; // [kv_dim] + float *v_bias[QWEN_LAYERS]; // [kv_dim] + + // KV cache [layer][kv_heads * head_dim * max_seq] + float *kv_cache_k[QWEN_LAYERS]; + float *kv_cache_v[QWEN_LAYERS]; + int pos; // current position in sequence + + // Scratch buffers + float *x; // [dim] + float *xb; // [dim] + float *q; // [q_dim] + float *k; // [kv_dim] + float *v; // [kv_dim] + float *att; // [heads * max_seq] + float *hb; // [hidden] + float *hb2; // [hidden] + float *logits; // [vocab] +} QwenModel; + +// ── CPU ops ────────────────────────────────────────────────────────── + +static void qwen_rmsnorm(float *out, const float *x, const float *w, int D) { + float ss = 0; + for (int i = 0; i < D; i++) ss += x[i] * x[i]; + ss = 1.0f / sqrtf(ss / D + QWEN_RMS_EPS); + for (int i = 0; i < D; i++) out[i] = x[i] * ss * w[i]; +} + +static void qwen_rope(float *q, float *k, int pos, int n_q_heads, int n_kv_heads, int head_dim) { + // Qwen uses rotate_half RoPE (NOT interleaved pairs): + // rotate_half(x) = [-x[dim/2:], x[:dim/2]] + // q_embed = q * cos + rotate_half(q) * sin + // cos/sin have shape [head_dim/2] and are applied to both halves + int half = head_dim / 2; + + // Precompute cos/sin for this position (head_dim/2 frequencies) + float cos_v[half], sin_v[half]; + for (int i = 0; i < half; i++) { + float freq = 1.0f / powf(QWEN_ROPE_THETA, (float)(2 * i) / head_dim); + float angle = pos * freq; + cos_v[i] = cosf(angle); + sin_v[i] = sinf(angle); + } + + // Apply to Q heads + for (int h = 0; h < n_q_heads; h++) { + float *qh = q + h * head_dim; + for (int i = 0; i < half; i++) { + float q_first = qh[i]; + float q_second = qh[i + half]; + // rotate_half: [-q_second, q_first] + qh[i] = q_first * cos_v[i] + (-q_second) * sin_v[i]; + qh[i + half] = q_second * cos_v[i] + q_first * sin_v[i]; + } + } + + // Apply to K heads + for (int h = 0; h < n_kv_heads; h++) { + float *kh = k + h * head_dim; + for (int i = 0; i < half; i++) { + float k_first = kh[i]; + float k_second = kh[i + half]; + kh[i] = k_first * cos_v[i] + (-k_second) * sin_v[i]; + kh[i + half] = k_second * cos_v[i] + k_first * sin_v[i]; + } + } +} + +static void qwen_silu(float *x, int n) { + for (int i = 0; i < n; i++) + x[i] = x[i] / (1.0f + expf(-x[i])); +} + +// ── ANE projection helper (single token: spatial=1) ───────────────── + +static void ane_project(ANEKernel *kernel, const float *in, float *out, + int in_dim, int out_dim) { + // For single-token inference: spatial=1 + ane_write_input(kernel, 0, in, in_dim * sizeof(float)); + ane_eval(kernel); + ane_read_output(kernel, 0, out, out_dim * sizeof(float)); +} + +// CPU matmul via Accelerate BLAS: y = W @ x, W[out_dim, in_dim] +#include + +static void cpu_project(const float *W, const float *x, float *y, int in_dim, int out_dim) { + // y = W @ x where W is [out_dim, in_dim] row-major + // cblas_sgemv: y = alpha * A * x + beta * y + cblas_sgemv(CblasRowMajor, CblasNoTrans, + out_dim, in_dim, + 1.0f, W, in_dim, + x, 1, + 0.0f, y, 1); +} + +// Toggle: 1 = use ANE for projections, 0 = CPU fallback +#define USE_ANE_PROJECTIONS 0 + +// ── Forward one token ──────────────────────────────────────────────── + +static int qwen_forward(QwenModel *m, int token) { + int D = QWEN_DIM, HD = QWEN_HIDDEN; + int pos = m->pos; + + // Token embedding + memcpy(m->x, m->embed + token * D, D * sizeof(float)); + + for (int l = 0; l < QWEN_LAYERS; l++) { + // Attention RMSNorm + qwen_rmsnorm(m->xb, m->x, m->rms_att[l], D); + + // Debug: print first layer input/output norms + if (l == 0 && pos == 0) { + float xnorm = 0, qnorm = 0; + for (int i = 0; i < D; i++) xnorm += m->xb[i] * m->xb[i]; + printf(" L0 RMSNorm out norm=%.4f (first 4: %.4f %.4f %.4f %.4f)\n", + sqrtf(xnorm), m->xb[0], m->xb[1], m->xb[2], m->xb[3]); + } + + // QKV projections (ANE) + bias + #if USE_ANE_PROJECTIONS + ane_project(m->k_q[l], m->xb, m->q, D, QWEN_Q_DIM); + ane_project(m->k_k[l], m->xb, m->k, D, QWEN_KV_DIM); + ane_project(m->k_v[l], m->xb, m->v, D, QWEN_KV_DIM); + #else + cpu_project(m->wq[l], m->xb, m->q, D, QWEN_Q_DIM); + cpu_project(m->wk[l], m->xb, m->k, D, QWEN_KV_DIM); + cpu_project(m->wv[l], m->xb, m->v, D, QWEN_KV_DIM); + #endif + // Apply Q/K biases + if (m->q_bias[l]) { + for (int i = 0; i < QWEN_Q_DIM; i++) m->q[i] += m->q_bias[l][i]; + } + if (m->k_bias[l]) { + for (int i = 0; i < QWEN_KV_DIM; i++) m->k[i] += m->k_bias[l][i]; + } + if (m->v_bias[l]) { + for (int i = 0; i < QWEN_KV_DIM; i++) m->v[i] += m->v_bias[l][i]; + } + + if (l == 0 && pos == 0) { + float qn = 0; + for (int i = 0; i < QWEN_Q_DIM; i++) qn += m->q[i] * m->q[i]; + printf(" L0 ANE Q norm=%.4f (first 4: %.4f %.4f %.4f %.4f)\n", + sqrtf(qn), m->q[0], m->q[1], m->q[2], m->q[3]); + // CPU reference + float cpu_q[4] = {0}; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < D; j++) + cpu_q[i] += m->wq[0][i * D + j] * m->xb[j]; + cpu_q[i] += m->q_bias[0][i]; + } + printf(" L0 CPU Q first 4: %.4f %.4f %.4f %.4f\n", + cpu_q[0], cpu_q[1], cpu_q[2], cpu_q[3]); + } + + // RoPE + qwen_rope(m->q, m->k, pos, QWEN_HEADS, QWEN_KV_HEADS, QWEN_HEAD_DIM); + + // Store K, V in cache + memcpy(m->kv_cache_k[l] + pos * QWEN_KV_DIM, + m->k, QWEN_KV_DIM * sizeof(float)); + memcpy(m->kv_cache_v[l] + pos * QWEN_KV_DIM, + m->v, QWEN_KV_DIM * sizeof(float)); + + // GQA attention (CPU — element-wise ops) + float scale = 1.0f / sqrtf((float)QWEN_HEAD_DIM); + float *attn_out = m->xb; // reuse buffer + memset(attn_out, 0, QWEN_Q_DIM * sizeof(float)); + + for (int h = 0; h < QWEN_HEADS; h++) { + int kv_h = h / QWEN_GQA_FACTOR; + float *qh = m->q + h * QWEN_HEAD_DIM; + + // Attention scores: Q @ K^T for all positions up to pos + float max_score = -1e9f; + for (int t = 0; t <= pos; t++) { + float *kt = m->kv_cache_k[l] + t * QWEN_KV_DIM + kv_h * QWEN_HEAD_DIM; + // Use BLAS dot product for precision + float score = cblas_sdot(QWEN_HEAD_DIM, qh, 1, kt, 1); + m->att[h * QWEN_MAX_SEQ + t] = score * scale; + if (score * scale > max_score) max_score = score * scale; + } + // Softmax (double accumulation for precision) + double sum = 0; + for (int t = 0; t <= pos; t++) { + m->att[h * QWEN_MAX_SEQ + t] = expf(m->att[h * QWEN_MAX_SEQ + t] - max_score); + sum += (double)m->att[h * QWEN_MAX_SEQ + t]; + } + float inv_sum = (float)(1.0 / sum); + for (int t = 0; t <= pos; t++) + m->att[h * QWEN_MAX_SEQ + t] *= inv_sum; + + // Weighted sum of V: attn_out[h] += att[t] * V[t] for each t + for (int t = 0; t <= pos; t++) { + float a = m->att[h * QWEN_MAX_SEQ + t]; + float *vt = m->kv_cache_v[l] + t * QWEN_KV_DIM + kv_h * QWEN_HEAD_DIM; + cblas_saxpy(QWEN_HEAD_DIM, a, vt, 1, + attn_out + h * QWEN_HEAD_DIM, 1); + } + } + + float o_out[QWEN_DIM]; + #if USE_ANE_PROJECTIONS + ane_project(m->k_o[l], attn_out, o_out, QWEN_Q_DIM, D); + #else + cpu_project(m->wo[l], attn_out, o_out, QWEN_Q_DIM, D); + #endif + + // Residual + for (int i = 0; i < D; i++) m->x[i] += o_out[i]; + + if (l == 0 && pos == 0) { + float pan = 0; + for (int i = 0; i < D; i++) pan += m->x[i] * m->x[i]; + printf(" L0 post-attn norm=%.4f first4=[%.6f, %.6f, %.6f, %.6f]\n", + sqrtf(pan), m->x[0], m->x[1], m->x[2], m->x[3]); + float on = 0; + for (int i = 0; i < D; i++) on += o_out[i] * o_out[i]; + printf(" L0 o_proj out norm=%.4f first4=[%.6f, %.6f, %.6f, %.6f]\n", + sqrtf(on), o_out[0], o_out[1], o_out[2], o_out[3]); + } + + // FFN RMSNorm + qwen_rmsnorm(m->xb, m->x, m->rms_ffn[l], D); + + // SwiGLU FFN + #if USE_ANE_PROJECTIONS + ane_project(m->k_gate[l], m->xb, m->hb, D, HD); + ane_project(m->k_up[l], m->xb, m->hb2, D, HD); + #else + cpu_project(m->w_gate[l], m->xb, m->hb, D, HD); + cpu_project(m->w_up[l], m->xb, m->hb2, D, HD); + #endif + + if (l == 0 && pos == 0) { + float gn = 0, un = 0; + for (int i = 0; i < HD; i++) { gn += m->hb[i]*m->hb[i]; un += m->hb2[i]*m->hb2[i]; } + printf(" L0 gate norm=%.4f up norm=%.4f\n", sqrtf(gn), sqrtf(un)); + printf(" L0 gate first4=[%.6f, %.6f, %.6f, %.6f]\n", + m->hb[0], m->hb[1], m->hb[2], m->hb[3]); + } + + qwen_silu(m->hb, HD); + for (int i = 0; i < HD; i++) m->hb[i] *= m->hb2[i]; + + float ffn_out[QWEN_DIM]; + #if USE_ANE_PROJECTIONS + ane_project(m->k_down[l], m->hb, ffn_out, HD, D); + #else + cpu_project(m->w_down[l], m->hb, ffn_out, HD, D); + #endif + + // Residual + for (int i = 0; i < D; i++) m->x[i] += ffn_out[i]; + + // Debug: hidden state after each layer (first 3 layers, first token only) + if (l < 3 && pos == 0) { + float hn = 0; + for (int i = 0; i < D; i++) hn += m->x[i] * m->x[i]; + printf(" C hidden[%d] norm=%.4f first4=[%.4f, %.4f, %.4f, %.4f]\n", + l+1, sqrtf(hn), m->x[0], m->x[1], m->x[2], m->x[3]); + } + } + + // Final RMSNorm + qwen_rmsnorm(m->xb, m->x, m->rms_final, D); + + // Debug: check final hidden state before LM head + if (m->pos < 2) { + float fn = 0; + for (int i = 0; i < D; i++) fn += m->xb[i] * m->xb[i]; + printf(" Final hidden norm=%.4f (first 4: %.6f %.6f %.6f %.6f)\n", + sqrtf(fn), m->xb[0], m->xb[1], m->xb[2], m->xb[3]); + } + + // LM head via Accelerate BLAS: logits = embed @ xb + // embed is [vocab, dim] row-major + cblas_sgemv(CblasRowMajor, CblasNoTrans, + QWEN_VOCAB, D, + 1.0f, m->embed, D, + m->xb, 1, + 0.0f, m->logits, 1); + + // Debug: check logits + if (m->pos < 2) { + float lmax = m->logits[0], lmin = m->logits[0]; + int nonzero = 0; + for (int i = 0; i < QWEN_VOCAB; i++) { + if (m->logits[i] > lmax) lmax = m->logits[i]; + if (m->logits[i] < lmin) lmin = m->logits[i]; + if (m->logits[i] != 0.0f) nonzero++; + } + printf(" Logits: min=%.4f max=%.4f nonzero=%d/%d\n", lmin, lmax, nonzero, QWEN_VOCAB); + } + + m->pos++; + + // Argmax + int max_idx = 0; + float max_val = m->logits[0]; + for (int i = 1; i < QWEN_VOCAB; i++) { + if (m->logits[i] > max_val) { + max_val = m->logits[i]; + max_idx = i; + } + } + return max_idx; +} + +// ── Compile all ANE kernels ────────────────────────────────────────── + +static void qwen_compile_kernels(QwenModel *m) { + int D = QWEN_DIM, HD = QWEN_HIDDEN; + printf("Compiling %d ANE kernels...\n", QWEN_LAYERS * 7 + 1); + for (int l = 0; l < QWEN_LAYERS; l++) { + m->k_q[l] = compile_conv_kernel(m->wq[l], D, QWEN_Q_DIM, 1); + m->k_k[l] = compile_conv_kernel(m->wk[l], D, QWEN_KV_DIM, 1); + m->k_v[l] = compile_conv_kernel(m->wv[l], D, QWEN_KV_DIM, 1); + m->k_o[l] = compile_conv_kernel(m->wo[l], QWEN_Q_DIM, D, 1); + m->k_gate[l] = compile_conv_kernel(m->w_gate[l], D, HD, 1); + m->k_up[l] = compile_conv_kernel(m->w_up[l], D, HD, 1); + m->k_down[l] = compile_conv_kernel(m->w_down[l], HD, D, 1); + printf(" Layer %d/%d compiled\r", l+1, QWEN_LAYERS); + fflush(stdout); + } + // LM head (tied = embedding, chunked into 16 pieces) + for (int c = 0; c < QWEN_LM_CHUNKS; c++) { + float *chunk_weights = m->embed + c * QWEN_LM_CHUNK_SIZE * D; + m->k_lmhead[c] = compile_conv_kernel(chunk_weights, D, QWEN_LM_CHUNK_SIZE, 1); + if (!m->k_lmhead[c]) { + printf(" LM head chunk %d FAILED to compile\n", c); + } + } + printf("\nAll kernels compiled.\n"); +} + +// ── Allocate buffers ───────────────────────────────────────────────── + +static void qwen_alloc(QwenModel *m) { + m->x = (float*)calloc(QWEN_DIM, sizeof(float)); + m->xb = (float*)calloc(QWEN_DIM, sizeof(float)); + m->q = (float*)calloc(QWEN_Q_DIM, sizeof(float)); + m->k = (float*)calloc(QWEN_KV_DIM, sizeof(float)); + m->v = (float*)calloc(QWEN_KV_DIM, sizeof(float)); + m->att = (float*)calloc(QWEN_HEADS * QWEN_MAX_SEQ, sizeof(float)); + m->hb = (float*)calloc(QWEN_HIDDEN, sizeof(float)); + m->hb2 = (float*)calloc(QWEN_HIDDEN, sizeof(float)); + m->logits = (float*)calloc(QWEN_VOCAB, sizeof(float)); + for (int l = 0; l < QWEN_LAYERS; l++) { + m->kv_cache_k[l] = (float*)calloc(QWEN_MAX_SEQ * QWEN_KV_DIM, sizeof(float)); + m->kv_cache_v[l] = (float*)calloc(QWEN_MAX_SEQ * QWEN_KV_DIM, sizeof(float)); + } + m->pos = 0; +} diff --git a/inference/run.py b/inference/run.py new file mode 100644 index 0000000..234ff86 --- /dev/null +++ b/inference/run.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +"""Run Qwen2.5-0.5B on ANE with proper tokenization. + +Usage: + python3 run.py "Your prompt here" [--max-tokens 50] +""" +import argparse +import ctypes +import struct +import sys +import time +from pathlib import Path + +INFERENCE_DIR = Path(__file__).parent +WEIGHTS_PATH = INFERENCE_DIR / "qwen05b.bin" +MODEL_DIR = Path.home() / "models" / "Qwen2.5-0.5B-Instruct" + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("prompt", type=str) + parser.add_argument("--max-tokens", type=int, default=50) + args = parser.parse_args() + + from transformers import AutoTokenizer + + print("Loading tokenizer...") + tok = AutoTokenizer.from_pretrained(str(MODEL_DIR), trust_remote_code=True) + + # Build chat template + messages = [ + {"role": "system", "content": "You are a helpful assistant. Be concise."}, + {"role": "user", "content": args.prompt}, + ] + text = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + input_ids = tok.encode(text) + print(f"Prompt tokens: {len(input_ids)}") + + # Run the C binary — pass token IDs as arguments + import subprocess + binary = str(INFERENCE_DIR / "qwen_ane") + + # We need to modify the binary to accept token IDs as input + # For now, print the token IDs so we can verify tokenization + print(f"First 10 tokens: {input_ids[:10]}") + print(f"Token text: {[tok.decode([t]) for t in input_ids[:10]]}") + print(f"\nRunning ANE inference with {len(input_ids)} prompt tokens + {args.max_tokens} generation...") + + # Call binary with token IDs piped via stdin + result = subprocess.run( + [binary, str(WEIGHTS_PATH), " ".join(str(t) for t in input_ids), + str(args.max_tokens)], + capture_output=True, text=True, timeout=120, + ) + print(result.stdout) + if result.stderr: + print(result.stderr[:500], file=sys.stderr) + + # Parse output token IDs from binary stdout + output_ids = [] + for line in result.stdout.split("\n"): + if line.startswith("OUT:"): + ids = [int(x) for x in line[4:].split() if x.isdigit()] + output_ids.extend(ids) + + if output_ids: + decoded = tok.decode(output_ids, skip_special_tokens=True) + print(f"\n=== Response ===\n{decoded}") + else: + print("\n(No output tokens parsed — binary may need token ID input mode)") + + +if __name__ == "__main__": + main() diff --git a/training/Makefile b/training/Makefile index b726d22..405c770 100644 --- a/training/Makefile +++ b/training/Makefile @@ -21,6 +21,14 @@ train_large: train_large.m $(HEADERS_LARGE) train_large_ane: train_large_ane.m $(HEADERS_ANE) $(CC) $(CFLAGS) -o $@ train_large_ane.m $(LDFLAGS) -framework Accelerate +HEADERS_OPT = $(HEADERS_LARGE) stories_cpu_ops_opt.h + +train_opt: train_opt.m $(HEADERS_OPT) + $(CC) $(CFLAGS) -o $@ train_opt.m $(LDFLAGS) -framework Accelerate -framework Metal -framework MetalPerformanceShaders + +train_double_buffer: train_double_buffer.m $(HEADERS_LARGE) + $(CC) $(CFLAGS) -o $@ train_double_buffer.m $(LDFLAGS) -framework Accelerate + PROBES = test_weight_reload test_perf_stats test_qos_sweep test_ane_advanced test_rmsnorm_bwd: test_rmsnorm_bwd.m $(HEADERS_ANE) @@ -65,7 +73,7 @@ verify-flags: @xcrun clang --version clean: - rm -f train train_large train_large_ane $(PROBES) test_rmsnorm_bwd test_classifier + rm -f train train_large train_large_ane train_opt train_double_buffer $(PROBES) test_rmsnorm_bwd test_classifier .PHONY: clean tokenize probes verify-flags data setup diff --git a/training/stories_cpu_ops_opt.h b/training/stories_cpu_ops_opt.h new file mode 100644 index 0000000..1b843c8 --- /dev/null +++ b/training/stories_cpu_ops_opt.h @@ -0,0 +1,110 @@ +// stories_cpu_ops_opt.h — Optimized CPU operations: NEON Adam, vectorized embedding +#pragma once +#include "stories_cpu_ops.h" +#include + +// ===== NEON-vectorized Adam optimizer ===== +// ~3-3.5x faster than scalar version for large param counts +// Uses vrsqrteq_f32 + one Newton-Raphson step for fast reciprocal sqrt +static void adam_update_opt(float *w, const float *g, AdamState *s, int t, + float lr, float b1, float b2, float eps) { + float bc1 = 1.0f - powf(b1, t); + float bc2 = 1.0f - powf(b2, t); + float inv_bc1 = 1.0f / bc1; + float inv_bc2 = 1.0f / bc2; + float one_minus_b1 = 1.0f - b1; + float one_minus_b2 = 1.0f - b2; + + float32x4_t vb1 = vdupq_n_f32(b1); + float32x4_t vb2 = vdupq_n_f32(b2); + float32x4_t v1mb1 = vdupq_n_f32(one_minus_b1); + float32x4_t v1mb2 = vdupq_n_f32(one_minus_b2); + float32x4_t vinv_bc1 = vdupq_n_f32(inv_bc1); + float32x4_t vinv_bc2 = vdupq_n_f32(inv_bc2); + float32x4_t vneg_lr = vdupq_n_f32(-lr); + float32x4_t veps = vdupq_n_f32(eps); + + size_t n = s->n; + size_t i = 0; + + // Process 4 elements at a time + for (; i + 3 < n; i += 4) { + // Load + float32x4_t vm = vld1q_f32(s->m + i); + float32x4_t vv = vld1q_f32(s->v + i); + float32x4_t vg = vld1q_f32(g + i); + float32x4_t vw = vld1q_f32(w + i); + + // m = b1*m + (1-b1)*g + vm = vmlaq_f32(vmulq_f32(vb1, vm), v1mb1, vg); + // v = b2*v + (1-b2)*g*g + float32x4_t g2 = vmulq_f32(vg, vg); + vv = vmlaq_f32(vmulq_f32(vb2, vv), v1mb2, g2); + + // Store updated m, v + vst1q_f32(s->m + i, vm); + vst1q_f32(s->v + i, vv); + + // mhat = m / bc1, vhat = v / bc2 + float32x4_t mhat = vmulq_f32(vm, vinv_bc1); + float32x4_t vhat = vmulq_f32(vv, vinv_bc2); + + // Fast reciprocal sqrt: vrsqrteq + one Newton-Raphson iteration + // rsqrt_est ≈ 1/sqrt(vhat) + float32x4_t rsqrt_est = vrsqrteq_f32(vhat); + // Newton-Raphson: rsqrt *= (3 - vhat * rsqrt^2) / 2 + float32x4_t rsqrt_sq = vmulq_f32(rsqrt_est, rsqrt_est); + float32x4_t nr_step = vrsqrtsq_f32(vhat, rsqrt_sq); + rsqrt_est = vmulq_f32(rsqrt_est, nr_step); + + // w -= lr * mhat / (sqrt(vhat) + eps) + // = w + (-lr) * mhat * (1/(sqrt(vhat) + eps)) + // Compute sqrt(vhat) from rsqrt: sqrt = vhat * rsqrt(vhat) (avoids division) + float32x4_t sqrt_vhat = vmulq_f32(vhat, rsqrt_est); + float32x4_t denom = vaddq_f32(sqrt_vhat, veps); + + // Use vdivq_f32 for the final division (accurate, eps-adjusted) + float32x4_t update = vmulq_f32(vneg_lr, vdivq_f32(mhat, denom)); + vw = vaddq_f32(vw, update); + + vst1q_f32(w + i, vw); + } + + // Scalar tail + for (; i < n; i++) { + s->m[i] = b1 * s->m[i] + one_minus_b1 * g[i]; + s->v[i] = b2 * s->v[i] + one_minus_b2 * g[i] * g[i]; + float mh = s->m[i] * inv_bc1; + float vh = s->v[i] * inv_bc2; + w[i] -= lr * mh / (sqrtf(vh) + eps); + } +} + +// ===== Vectorized embedding lookup ===== +// Gather rows from [VOCAB, DIM] row-major embed table → x [DIM, SEQ] channel-first +// Strategy: gather token rows into temp buffer [SEQ, DIM], then transpose via vDSP_mtrans +static void embed_lookup_opt(float *x, const float *embed, const uint16_t *tokens, + int dim, int seq, float *tmp) { + // Gather: tmp[t*dim + d] = embed[tokens[t]*dim + d] + for (int t = 0; t < seq; t++) { + memcpy(tmp + t * dim, embed + tokens[t] * dim, dim * sizeof(float)); + } + // Transpose [SEQ, DIM] → [DIM, SEQ]: x[d*seq + t] = tmp[t*dim + d] + vDSP_mtrans(tmp, 1, x, 1, (vDSP_Length)dim, (vDSP_Length)seq); +} + +// ===== Vectorized embedding backward ===== +// Accumulate dE[tok] += dx[:,t] for each position +// Strategy: transpose dx [DIM, SEQ] → tmp [SEQ, DIM], then accumulate rows +static void embed_backward_opt(float *d_embed, const float *dx, const uint16_t *tokens, + int dim, int seq, float *tmp) { + // Transpose [DIM, SEQ] → [SEQ, DIM]: tmp[t*dim + d] = dx[d*seq + t] + vDSP_mtrans(dx, 1, tmp, 1, (vDSP_Length)seq, (vDSP_Length)dim); + // Scatter-add: d_embed[tok*dim .. (tok+1)*dim] += tmp[t*dim .. (t+1)*dim] + for (int t = 0; t < seq; t++) { + vDSP_vadd(tmp + t * dim, 1, + d_embed + tokens[t] * dim, 1, + d_embed + tokens[t] * dim, 1, + (vDSP_Length)dim); + } +} diff --git a/training/stories_io.h b/training/stories_io.h index fbb5dee..efc88db 100644 --- a/training/stories_io.h +++ b/training/stories_io.h @@ -85,6 +85,12 @@ static void io_write_fp16_at(IOSurfaceRef s, int ch_off, const float *data, int cvt_f32_f16((_Float16*)IOSurfaceGetBaseAddress(s) + ch_off * sp, data, channels * sp); IOSurfaceUnlock(s, 0, NULL); } +// Read raw fp16 from IOSurface without conversion (for fp16 activation cache) +static void io_read_raw_fp16(IOSurfaceRef s, _Float16 *data, int ch_off, int channels, int sp) { + IOSurfaceLock(s, kIOSurfaceLockReadOnly, NULL); + memcpy(data, (_Float16*)IOSurfaceGetBaseAddress(s) + ch_off * sp, channels * sp * sizeof(_Float16)); + IOSurfaceUnlock(s, kIOSurfaceLockReadOnly, NULL); +} // Kernel compile/eval static Kern *compile_kern_mil_w(NSString *mil, NSDictionary *weights, int ic_bytes, int oc_bytes) { diff --git a/training/train_double_buffer.m b/training/train_double_buffer.m new file mode 100644 index 0000000..bfb8236 --- /dev/null +++ b/training/train_double_buffer.m @@ -0,0 +1,791 @@ +// train_double_buffer.m — Double-buffered async ANE training for stories110M +// Based on train_large.m with the key innovation: compile and eval overlap via GCD +// Discovery: probe_v2.m proved ANE compile and eval can run in parallel +// Architecture: two kernel sets (A/B), background compile while active set runs +// 5 weight-bearing ANE kernels per layer × 12 layers = 60 per compile batch +#include +#include "stories_io.h" +#include "stories_mil.h" +#include "stories_cpu_ops.h" + +// Double-buffer needs more compile budget than single-buffer +// The original MAX_COMPILES=100 only allows 1 batch per exec() restart +// We push higher to allow initial compile + at least 1 background compile +// If ANE rejects at ~119, the exec() restart will handle it gracefully +#define DB_MAX_COMPILES 250 + +#define CKPT_PATH_DEFAULT "ane_db_ckpt.bin" +#define MODEL_PATH_DEFAULT "../../assets/models/stories110M.bin" +#define DATA_PATH_DEFAULT "tinystories_data00.bin" + +static const char *get_path(const char *env_var, const char *default_val) { + const char *v = getenv(env_var); + return (v && v[0]) ? v : default_val; +} + +// ===== Weight loading from llama2.c format ===== +static bool load_pretrained(LayerWeights *lw, float *rms_final, float *embed, const char *path) { + FILE *f = fopen(path, "rb"); + if (!f) { printf("Cannot open %s\n", path); return false; } + Llama2Config cfg; + fread(&cfg, sizeof(cfg), 1, f); + printf(" Model config: dim=%d hidden=%d layers=%d heads=%d vocab=%d seq=%d\n", + cfg.dim, cfg.hidden_dim, cfg.n_layers, cfg.n_heads, abs(cfg.vocab_size), cfg.seq_len); + if (cfg.dim != DIM || cfg.hidden_dim != HIDDEN || cfg.n_layers != NLAYERS) { + printf(" ERROR: Config mismatch! Expected dim=%d hidden=%d layers=%d\n", DIM, HIDDEN, NLAYERS); + fclose(f); return false; + } + int V = abs(cfg.vocab_size); + bool shared = cfg.vocab_size > 0; + + // Read in llama2.c order: embed, rms_att[all], wq[all], wk[all], wv[all], wo[all], + // rms_ffn[all], w1[all], w2[all], w3[all], rms_final, [wcls] + fread(embed, 4, V * DIM, f); + + // rms_att weights for all layers (contiguous) + for (int L = 0; L < NLAYERS; L++) fread(lw[L].rms_att, 4, DIM, f); + // wq for all layers + for (int L = 0; L < NLAYERS; L++) fread(lw[L].Wq, 4, WQ_SZ, f); + // wk for all layers + for (int L = 0; L < NLAYERS; L++) fread(lw[L].Wk, 4, WQ_SZ, f); + // wv for all layers + for (int L = 0; L < NLAYERS; L++) fread(lw[L].Wv, 4, WQ_SZ, f); + // wo for all layers + for (int L = 0; L < NLAYERS; L++) fread(lw[L].Wo, 4, WO_SZ, f); + // rms_ffn weights for all layers + for (int L = 0; L < NLAYERS; L++) fread(lw[L].rms_ffn, 4, DIM, f); + // w1 for all layers + for (int L = 0; L < NLAYERS; L++) fread(lw[L].W1, 4, W1_SZ, f); + // w2 for all layers + for (int L = 0; L < NLAYERS; L++) fread(lw[L].W2, 4, W2_SZ, f); + // w3 for all layers + for (int L = 0; L < NLAYERS; L++) fread(lw[L].W3, 4, W3_SZ, f); + // rms_final + fread(rms_final, 4, DIM, f); + // wcls = embed if shared (we just use embed pointer) + + fclose(f); + printf(" Loaded pretrained weights (%s)\n", shared ? "shared embed/cls" : "separate cls"); + return true; +} + +// ===== Compile one layer's kernels ===== +static bool compile_layer_kernels(LayerKernels *lk, LayerWeights *w) { + lk->fwdAttn = compile_kern_mil_w(gen_sdpa_fwd_taps(), (@{ + @"@model_path/weights/rms1.bin": @{@"offset":@0, @"data":build_blob(w->rms_att,1,DIM)}, + @"@model_path/weights/wq.bin": @{@"offset":@0, @"data":build_blob(w->Wq,DIM,DIM)}, + @"@model_path/weights/wk.bin": @{@"offset":@0, @"data":build_blob(w->Wk,DIM,DIM)}, + @"@model_path/weights/wv.bin": @{@"offset":@0, @"data":build_blob(w->Wv,DIM,DIM)}, + @"@model_path/weights/wo.bin": @{@"offset":@0, @"data":build_blob(w->Wo,DIM,DIM)}, + @"@model_path/weights/mask.bin": @{@"offset":@0, @"data":get_mask_blob()}, + }), DIM*SEQ*2, 6*DIM*SEQ*2); + + lk->fwdFFN = compile_kern_mil_w(gen_ffn_fwd_taps(), (@{ + @"@model_path/weights/rms2.bin": @{@"offset":@0, @"data":build_blob(w->rms_ffn,1,DIM)}, + @"@model_path/weights/w1.bin": @{@"offset":@0, @"data":build_blob(w->W1,HIDDEN,DIM)}, + @"@model_path/weights/w3.bin": @{@"offset":@0, @"data":build_blob(w->W3,HIDDEN,DIM)}, + @"@model_path/weights/w2.bin": @{@"offset":@0, @"data":build_blob(w->W2,DIM,HIDDEN)}, + }), DIM*SEQ*2, (2*DIM+3*HIDDEN)*SEQ*2); + + lk->ffnBwd = compile_kern_mil_w(gen_ffn_bwd(), (@{ + @"@model_path/weights/w2t.bin": @{@"offset":@0, @"data":build_blob_t(w->W2,DIM,HIDDEN)}, + @"@model_path/weights/w1t.bin": @{@"offset":@0, @"data":build_blob_t(w->W1,HIDDEN,DIM)}, + @"@model_path/weights/w3t.bin": @{@"offset":@0, @"data":build_blob_t(w->W3,HIDDEN,DIM)}, + }), (DIM+2*HIDDEN)*SEQ*2, (DIM+2*HIDDEN)*SEQ*2); + + lk->sdpaBwd1 = compile_kern_mil_w(gen_sdpa_bwd1(), (@{ + @"@model_path/weights/mask.bin": @{@"offset":@0, @"data":get_mask_blob()}, + @"@model_path/weights/wot.bin": @{@"offset":@0, @"data":build_blob_t(w->Wo,DIM,DIM)}, + }), 4*DIM*SEQ*2, (DIM+2*SCORE_CH)*SEQ*2); + + lk->qkvBwd = compile_kern_mil_w(gen_qkvb(), (@{ + @"@model_path/weights/wqt.bin": @{@"offset":@0, @"data":build_blob_t(w->Wq,DIM,DIM)}, + @"@model_path/weights/wkt.bin": @{@"offset":@0, @"data":build_blob_t(w->Wk,DIM,DIM)}, + @"@model_path/weights/wvt.bin": @{@"offset":@0, @"data":build_blob_t(w->Wv,DIM,DIM)}, + }), 3*DIM*SEQ*2, DIM*SEQ*2); + + return lk->fwdAttn && lk->fwdFFN && lk->ffnBwd && lk->sdpaBwd1 && lk->qkvBwd; +} + +// Compile weight-free sdpaBwd2 (only needs once, no weights) +static Kern *compile_sdpa_bwd2(void) { + return compile_kern_mil_w(gen_sdpa_bwd2(), @{}, + (2*SCORE_CH+2*DIM)*SEQ*2, 2*DIM*SEQ*2); +} + +static void free_layer_kernels(LayerKernels *lk) { + free_kern(lk->fwdAttn); free_kern(lk->fwdFFN); free_kern(lk->ffnBwd); + free_kern(lk->sdpaBwd1); free_kern(lk->qkvBwd); + // sdpaBwd2 is shared, freed separately + lk->fwdAttn = lk->fwdFFN = lk->ffnBwd = lk->sdpaBwd1 = lk->qkvBwd = NULL; +} + +// ===== Checkpoint save/load ===== +static void save_checkpoint(const char *path, int step, int total_steps, float lr, float loss, + double cc, double ct, double cw, int cs, int cb, int adam_t, + LayerWeights *lw, LayerAdam *la, float *rms_final, AdamState *arms_final, + float *embed, AdamState *aembed) { + FILE *f = fopen(path, "wb"); + CkptHdr h = {0}; + h.magic = 0x424C5A54; h.version = 2; + h.step = step; h.total_steps = total_steps; + h.n_layers = NLAYERS; h.vocab_size = VOCAB; h.dim = DIM; + h.hidden_dim = HIDDEN; h.n_heads = HEADS; h.seq_len = SEQ; + h.lr = lr; h.loss = loss; + h.cum_compile = cc; h.cum_train = ct; h.cum_wall = cw; + h.cum_steps = cs; h.cum_batches = cb; h.adam_t = adam_t; + fwrite(&h, sizeof(h), 1, f); + // Per-layer weights + adam + for (int L = 0; L < NLAYERS; L++) { + fwrite(lw[L].Wq,4,WQ_SZ,f); fwrite(lw[L].Wk,4,WQ_SZ,f); + fwrite(lw[L].Wv,4,WQ_SZ,f); fwrite(lw[L].Wo,4,WO_SZ,f); + fwrite(lw[L].W1,4,W1_SZ,f); fwrite(lw[L].W2,4,W2_SZ,f); fwrite(lw[L].W3,4,W3_SZ,f); + fwrite(lw[L].rms_att,4,DIM,f); fwrite(lw[L].rms_ffn,4,DIM,f); + // Adam state + fwrite(la[L].Wq.m,4,WQ_SZ,f); fwrite(la[L].Wq.v,4,WQ_SZ,f); + fwrite(la[L].Wk.m,4,WQ_SZ,f); fwrite(la[L].Wk.v,4,WQ_SZ,f); + fwrite(la[L].Wv.m,4,WQ_SZ,f); fwrite(la[L].Wv.v,4,WQ_SZ,f); + fwrite(la[L].Wo.m,4,WO_SZ,f); fwrite(la[L].Wo.v,4,WO_SZ,f); + fwrite(la[L].W1.m,4,W1_SZ,f); fwrite(la[L].W1.v,4,W1_SZ,f); + fwrite(la[L].W2.m,4,W2_SZ,f); fwrite(la[L].W2.v,4,W2_SZ,f); + fwrite(la[L].W3.m,4,W3_SZ,f); fwrite(la[L].W3.v,4,W3_SZ,f); + fwrite(la[L].rms_att.m,4,DIM,f); fwrite(la[L].rms_att.v,4,DIM,f); + fwrite(la[L].rms_ffn.m,4,DIM,f); fwrite(la[L].rms_ffn.v,4,DIM,f); + } + fwrite(rms_final,4,DIM,f); + fwrite(arms_final->m,4,DIM,f); fwrite(arms_final->v,4,DIM,f); + fwrite(embed,4,VOCAB*DIM,f); + fwrite(aembed->m,4,VOCAB*DIM,f); fwrite(aembed->v,4,VOCAB*DIM,f); + fclose(f); +} + +static bool load_checkpoint(const char *path, int *step, int *total_steps, float *lr, float *loss, + double *cc, double *ct, double *cw, int *cs, int *cb, int *adam_t, + LayerWeights *lw, LayerAdam *la, float *rms_final, AdamState *arms_final, + float *embed, AdamState *aembed) { + FILE *f = fopen(path, "rb"); + if (!f) return false; + CkptHdr h; + fread(&h, sizeof(h), 1, f); + if (h.magic != 0x424C5A54 || h.version != 2) { fclose(f); return false; } + *step = h.step; *total_steps = h.total_steps; *lr = h.lr; *loss = h.loss; + *cc = h.cum_compile; *ct = h.cum_train; *cw = h.cum_wall; + *cs = h.cum_steps; *cb = h.cum_batches; *adam_t = h.adam_t; + for (int L = 0; L < NLAYERS; L++) { + fread(lw[L].Wq,4,WQ_SZ,f); fread(lw[L].Wk,4,WQ_SZ,f); + fread(lw[L].Wv,4,WQ_SZ,f); fread(lw[L].Wo,4,WO_SZ,f); + fread(lw[L].W1,4,W1_SZ,f); fread(lw[L].W2,4,W2_SZ,f); fread(lw[L].W3,4,W3_SZ,f); + fread(lw[L].rms_att,4,DIM,f); fread(lw[L].rms_ffn,4,DIM,f); + fread(la[L].Wq.m,4,WQ_SZ,f); fread(la[L].Wq.v,4,WQ_SZ,f); + fread(la[L].Wk.m,4,WQ_SZ,f); fread(la[L].Wk.v,4,WQ_SZ,f); + fread(la[L].Wv.m,4,WQ_SZ,f); fread(la[L].Wv.v,4,WQ_SZ,f); + fread(la[L].Wo.m,4,WO_SZ,f); fread(la[L].Wo.v,4,WO_SZ,f); + fread(la[L].W1.m,4,W1_SZ,f); fread(la[L].W1.v,4,W1_SZ,f); + fread(la[L].W2.m,4,W2_SZ,f); fread(la[L].W2.v,4,W2_SZ,f); + fread(la[L].W3.m,4,W3_SZ,f); fread(la[L].W3.v,4,W3_SZ,f); + fread(la[L].rms_att.m,4,DIM,f); fread(la[L].rms_att.v,4,DIM,f); + fread(la[L].rms_ffn.m,4,DIM,f); fread(la[L].rms_ffn.v,4,DIM,f); + } + fread(rms_final,4,DIM,f); + fread(arms_final->m,4,DIM,f); fread(arms_final->v,4,DIM,f); + fread(embed,4,VOCAB*DIM,f); + fread(aembed->m,4,VOCAB*DIM,f); fread(aembed->v,4,VOCAB*DIM,f); + fclose(f); + return true; +} + +// ===== Main ===== +int main(int argc, char *argv[]) { + @autoreleasepool { + setbuf(stdout, NULL); + ane_init(); + mach_timebase_info(&g_tb); + + int total_steps = 10000; + float lr = 3e-4f; + float adam_b1=0.9f, adam_b2=0.999f, adam_eps=1e-8f; + int adam_t = 0, start_step = 0; + + const char *model_path = get_path("ANE_MODEL_PATH", MODEL_PATH_DEFAULT); + const char *ckpt_path = get_path("ANE_CKPT_PATH", CKPT_PATH_DEFAULT); + const char *data_path = get_path("ANE_DATA_PATH", DATA_PATH_DEFAULT); + + bool do_resume = false; + for (int i=1; i