def _checked_f32(tensors, key, shape):
    """Return ``tensors[key]`` as contiguous little-endian float32.

    Raises:
        KeyError: the checkpoint has no tensor named *key*.
        ValueError: the tensor's shape is not *shape*.
    """
    if key not in tensors:
        raise KeyError(f"checkpoint is missing tensor {key!r}")
    arr = tensors[key]
    if tuple(arr.shape) != tuple(shape):
        raise ValueError(
            f"{key}: expected shape {tuple(shape)}, got {tuple(arr.shape)}")
    # "<f4" pins the byte order so the file does not depend on the host CPU.
    return np.ascontiguousarray(arr, dtype="<f4")


def _bias_or_zeros(tensors, key, size):
    """Return the bias vector *key*, or ``size`` zeros when it is absent."""
    if key not in tensors:
        return np.zeros(size, dtype="<f4")
    return _checked_f32(tensors, key, (size,))


def convert(model_dir: str, output_path: str) -> None:
    """Flatten a Qwen2.5-0.5B-Instruct checkpoint into one binary file.

    File layout (everything little-endian):
      * 7 x int32 header: dim, hidden, n_layers, n_heads, n_kv_heads,
        vocab_size, max_seq
      * embedding table ``[vocab, dim]``
      * per layer, in this order: attention norm, Wq, Wk, Wv, Wo,
        q/k/v biases, FFN norm, gate, up, down
      * final norm ``[dim]``
    All weights are written as float32.

    Every tensor is checked against the hard-coded architecture *before* the
    output file is opened, so a mismatching checkpoint fails loudly instead
    of leaving a truncated or misaligned file behind.

    Args:
        model_dir: Directory holding the ``*.safetensors`` shard(s).
        output_path: Destination path of the flat binary.

    Raises:
        SystemExit: ``model_dir`` contains no ``*.safetensors`` file.
        KeyError: A required tensor is missing from the checkpoint.
        ValueError: A tensor's shape does not match the architecture.
    """
    model_dir = Path(model_dir)

    # Sorted so the load order (and log output) is reproducible.
    st_files = sorted(model_dir.glob("*.safetensors"))
    if not st_files:
        print(f"No safetensors files in {model_dir}")
        sys.exit(1)

    tensors = {}
    for st_file in st_files:
        with safe_open(str(st_file), framework="pt") as sf:
            for key in sf.keys():
                tensors[key] = sf.get_tensor(key).float().numpy()

    print(f"Loaded {len(tensors)} tensors from {len(st_files)} files")

    # Qwen2.5-0.5B config
    dim = 896
    hidden = 4864
    n_layers = 24
    n_heads = 14
    n_kv_heads = 2
    vocab_size = 151936
    max_seq = 512

    head_dim = dim // n_heads
    q_dim = n_heads * head_dim
    kv_dim = n_kv_heads * head_dim

    # Collect (and validate) every array in file order; nothing is written
    # until the whole checkpoint has been checked.
    blobs = []

    # Embedding [vocab, dim]
    emb = _checked_f32(tensors, "model.embed_tokens.weight", (vocab_size, dim))
    print(f"embed: {emb.shape}")
    blobs.append(emb)

    for layer in range(n_layers):
        prefix = f"model.layers.{layer}"

        rms_att = _checked_f32(tensors, f"{prefix}.input_layernorm.weight", (dim,))

        wq = _checked_f32(tensors, f"{prefix}.self_attn.q_proj.weight", (q_dim, dim))
        wk = _checked_f32(tensors, f"{prefix}.self_attn.k_proj.weight", (kv_dim, dim))
        wv = _checked_f32(tensors, f"{prefix}.self_attn.v_proj.weight", (kv_dim, dim))
        wo = _checked_f32(tensors, f"{prefix}.self_attn.o_proj.weight", (dim, q_dim))

        # Q/K/V biases; zeros are written when the checkpoint has none so
        # the file layout stays fixed.
        qb = _bias_or_zeros(tensors, f"{prefix}.self_attn.q_proj.bias", q_dim)
        kb = _bias_or_zeros(tensors, f"{prefix}.self_attn.k_proj.bias", kv_dim)
        vb = _bias_or_zeros(tensors, f"{prefix}.self_attn.v_proj.bias", kv_dim)

        rms_ffn = _checked_f32(
            tensors, f"{prefix}.post_attention_layernorm.weight", (dim,))

        w_gate = _checked_f32(tensors, f"{prefix}.mlp.gate_proj.weight", (hidden, dim))
        w_up = _checked_f32(tensors, f"{prefix}.mlp.up_proj.weight", (hidden, dim))
        w_down = _checked_f32(tensors, f"{prefix}.mlp.down_proj.weight", (dim, hidden))

        blobs.extend([rms_att, wq, wk, wv, wo, qb, kb, vb,
                      rms_ffn, w_gate, w_up, w_down])

        print(f"  Layer {layer}: Q{wq.shape} K{wk.shape} V{wv.shape} O{wo.shape} "
              f"gate{w_gate.shape} up{w_up.shape} down{w_down.shape}")

    # Final norm
    blobs.append(_checked_f32(tensors, "model.norm.weight", (dim,)))

    with open(output_path, "wb") as out:
        # Config header: 7 x int32, explicit little-endian / standard size.
        out.write(struct.pack(
            "<7i",
            dim, hidden, n_layers, n_heads, n_kv_heads, vocab_size, max_seq))
        for blob in blobs:
            out.write(blob.tobytes())

    size_mb = Path(output_path).stat().st_size / 1024 / 1024
    print(f"\nWritten: {output_path} ({size_mb:.0f} MB)")
// Read `count` floats from `f` into a freshly malloc'd buffer.
// Returns NULL (after reporting on stderr) on allocation failure or when the
// file ends early, so callers never use a partially filled buffer.
static float *read_floats(FILE *f, size_t count, const char *what) {
    float *buf = (float*)malloc(count * sizeof(float));
    if (!buf) {
        fprintf(stderr, "Out of memory allocating %s (%zu floats)\n", what, count);
        return NULL;
    }
    if (fread(buf, sizeof(float), count, f) != count) {
        fprintf(stderr, "Weights file truncated while reading %s\n", what);
        free(buf);
        return NULL;
    }
    return buf;
}

// Load the flat weights file produced by convert_weights.py into g_model.
// Returns 0 on success, -1 on any I/O, allocation or config error.
// On failure, buffers that were already loaded are not freed; the only
// caller exits the process immediately.
static int load_weights(const char *path) {
    FILE *f = fopen(path, "rb");
    if (!f) { fprintf(stderr, "Cannot open %s\n", path); return -1; }

    // Read config header (7 x int32; the 7th value, max_seq, is not used here)
    int config[7];
    if (fread(config, sizeof(int), 7, f) != 7) {
        fprintf(stderr, "%s: truncated config header\n", path);
        fclose(f);
        return -1;
    }
    int dim = config[0], hidden = config[1], n_layers = config[2];
    int n_heads = config[3], n_kv_heads = config[4], vocab = config[5];
    printf("Config: dim=%d hidden=%d layers=%d heads=%d kv_heads=%d vocab=%d\n",
           dim, hidden, n_layers, n_heads, n_kv_heads, vocab);

    // QwenModel has QWEN_LAYERS-sized pointer arrays and qwen_forward indexes
    // every buffer with the compile-time QWEN_* sizes, so a file describing
    // any other shape would overflow or be read out of bounds.
    if (dim != QWEN_DIM || hidden != QWEN_HIDDEN || n_layers != QWEN_LAYERS ||
        n_heads != QWEN_HEADS || n_kv_heads != QWEN_KV_HEADS || vocab != QWEN_VOCAB) {
        fprintf(stderr, "%s: config does not match the compiled-in model "
                        "(dim=%d hidden=%d layers=%d heads=%d kv_heads=%d vocab=%d)\n",
                path, QWEN_DIM, QWEN_HIDDEN, QWEN_LAYERS,
                QWEN_HEADS, QWEN_KV_HEADS, QWEN_VOCAB);
        fclose(f);
        return -1;
    }

    size_t d = (size_t)dim, h = (size_t)hidden;
    size_t q_dim = (size_t)n_heads * QWEN_HEAD_DIM;
    size_t kv_dim = (size_t)n_kv_heads * QWEN_HEAD_DIM;

    // Embedding
    int ok = (g_model.embed = read_floats(f, (size_t)vocab * d, "embedding")) != NULL;

    // Per-layer tensors, in exactly the order convert_weights.py writes them.
    // The && chain stops at the first failed read.
    for (int l = 0; ok && l < n_layers; l++) {
        ok = (g_model.rms_att[l] = read_floats(f, d,          "rms_att")) != NULL
          && (g_model.wq[l]      = read_floats(f, q_dim * d,  "wq"))      != NULL
          && (g_model.wk[l]      = read_floats(f, kv_dim * d, "wk"))      != NULL
          && (g_model.wv[l]      = read_floats(f, kv_dim * d, "wv"))      != NULL
          && (g_model.wo[l]      = read_floats(f, d * q_dim,  "wo"))      != NULL  // o_proj is [dim, q_dim]
          && (g_model.q_bias[l]  = read_floats(f, q_dim,      "q_bias"))  != NULL
          && (g_model.k_bias[l]  = read_floats(f, kv_dim,     "k_bias"))  != NULL
          && (g_model.v_bias[l]  = read_floats(f, kv_dim,     "v_bias"))  != NULL
          && (g_model.rms_ffn[l] = read_floats(f, d,          "rms_ffn")) != NULL
          && (g_model.w_gate[l]  = read_floats(f, h * d,      "w_gate"))  != NULL
          && (g_model.w_up[l]    = read_floats(f, h * d,      "w_up"))    != NULL
          && (g_model.w_down[l]  = read_floats(f, d * h,      "w_down"))  != NULL;
    }

    ok = ok && (g_model.rms_final = read_floats(f, d, "rms_final")) != NULL;

    // Query the offset BEFORE closing: using a FILE* after fclose is undefined.
    long bytes_read = ok ? ftell(f) : -1;
    fclose(f);
    if (!ok) return -1;

    printf("Weights loaded (%.0f MB)\n", (double)bytes_read / 1024 / 1024);
    return 0;
}

// Entry point: qwen_ane <weights.bin> "<token ids>" [max_gen]
// Loads the weights, compiles the kernels, feeds the prompt token ids through
// qwen_forward and then greedily decodes up to max_gen tokens, printing the
// generated ids on a single line prefixed with "OUT:".
int main(int argc, char **argv) {
    @autoreleasepool {
        if (argc < 3) {
            fprintf(stderr,
                    "Usage: %s <weights.bin> \"<space-separated token ids>\" [max_gen]\n",
                    argv[0]);
            return 1;
        }

        // argv[3] = max generation tokens
        int max_gen = 50;
        if (argc >= 4) max_gen = atoi(argv[3]);
        if (max_gen < 0) max_gen = 0;

        // Parse and validate the prompt before the expensive load/compile.
        // The buffer is QWEN_MAX_SEQ long because the KV cache cannot hold
        // more positions than that.
        int prompt_ids[QWEN_MAX_SEQ];
        int n_prompt = 0;
        char *tok_str = strdup(argv[2]);
        if (!tok_str) { fprintf(stderr, "Out of memory\n"); return 1; }
        char *saveptr;
        for (char *p = strtok_r(tok_str, " ", &saveptr); p;
             p = strtok_r(NULL, " ", &saveptr)) {
            char *end;
            long id = strtol(p, &end, 10);
            // Out-of-range ids would index past the embedding table.
            if (end == p || *end != '\0' || id < 0 || id >= QWEN_VOCAB) {
                fprintf(stderr, "Invalid token id '%s' (expected 0..%d)\n", p, QWEN_VOCAB - 1);
                free(tok_str);
                return 1;
            }
            if (n_prompt == QWEN_MAX_SEQ) {
                fprintf(stderr, "Prompt exceeds the %d-token context\n", QWEN_MAX_SEQ);
                free(tok_str);
                return 1;
            }
            prompt_ids[n_prompt++] = (int)id;
        }
        free(tok_str);
        if (n_prompt == 0) {
            fprintf(stderr, "Prompt contains no token ids\n");
            return 1;
        }
        // Every qwen_forward call consumes one KV-cache slot.
        if (max_gen > QWEN_MAX_SEQ - n_prompt) {
            max_gen = QWEN_MAX_SEQ - n_prompt;
            fprintf(stderr, "Clamping generation to %d tokens to fit the %d-token context\n",
                    max_gen, QWEN_MAX_SEQ);
        }

        printf("=== Qwen2.5-0.5B ANE Inference ===\n\n");

        // Load weights
        printf("Loading weights...\n");
        if (load_weights(argv[1]) != 0) return 1;

        // Allocate buffers
        qwen_alloc(&g_model);

        // Compile ANE kernels: 7 projections per layer plus the LM-head chunks
        printf("Compiling ANE kernels (%d total)...\n", QWEN_LAYERS * 7 + QWEN_LM_CHUNKS);
        struct timespec t0, t1;
        clock_gettime(CLOCK_MONOTONIC, &t0);
        qwen_compile_kernels(&g_model);
        clock_gettime(CLOCK_MONOTONIC, &t1);
        double compile_sec = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
        printf("Compile time: %.1fs\n\n", compile_sec);

        printf("Prompt: %d tokens, generating up to %d\n", n_prompt, max_gen);

        clock_gettime(CLOCK_MONOTONIC, &t0);

        // Prefill: feed all prompt tokens; only the last prediction is kept
        int next = 0;
        for (int i = 0; i < n_prompt; i++) {
            next = qwen_forward(&g_model, prompt_ids[i]);
        }

        struct timespec t_prefill;
        clock_gettime(CLOCK_MONOTONIC, &t_prefill);
        double prefill_sec = (t_prefill.tv_sec - t0.tv_sec) + (t_prefill.tv_nsec - t0.tv_nsec) / 1e9;
        double prefill_tps = prefill_sec > 0 ? n_prompt / prefill_sec : 0;
        printf("Prefill: %d tokens in %.2fs (%.1f t/s)\n", n_prompt, prefill_sec, prefill_tps);

        // Generate
        int eos = 151645;   // <|im_end|>
        int eos2 = 151643;  // <|endoftext|>
        printf("OUT:");
        for (int i = 0; i < max_gen; i++) {
            // A negative id can never be fed back as a token; stop cleanly.
            if (next < 0) break;
            printf(" %d", next);
            fflush(stdout);
            if (next == eos || next == eos2) break;
            next = qwen_forward(&g_model, next);
        }
        printf("\n");

        clock_gettime(CLOCK_MONOTONIC, &t1);
        double gen_sec = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
        int total_tokens = g_model.pos;
        int gen_tokens = total_tokens - n_prompt;
        double decode_sec = gen_sec - prefill_sec;
        printf("\nTotal: %d tokens in %.2fs\n", total_tokens, gen_sec);
        printf("Prefill: %.1f t/s (%d tokens)\n", prefill_tps, n_prompt);
        printf("Decode: %.1f t/s (%d tokens)\n",
               decode_sec > 0 ? gen_tokens / decode_sec : 0, gen_tokens);

        return 0;
    }
}
// Compile a baked-weight conv kernel (from model.h).
// The weight blob is built from `weights` with (out_ch, in_ch) and handed to
// ane_compile together with the generated conv MIL; the kernel has a single
// input of in_ch * spatial floats and a single output of out_ch * spatial
// floats. NOTE(review): `weights` is presumably row-major [out_ch, in_ch] —
// confirm against mil_build_weight_blob.
static ANEKernel *compile_conv_kernel(const float *weights, int in_ch, int out_ch, int spatial) {
    NSData *weight_blob = mil_build_weight_blob(weights, out_ch, in_ch);
    NSString *mil_text = mil_gen_conv(in_ch, out_ch, spatial);

    // One float per channel per spatial position on each side.
    size_t input_bytes = (size_t)in_ch * spatial * sizeof(float);
    size_t output_bytes = (size_t)out_ch * spatial * sizeof(float);

    NSData *mil_data = [mil_text dataUsingEncoding:NSUTF8StringEncoding];
    return ane_compile(mil_data, weight_blob, 1, &input_bytes, 1, &output_bytes);
}
*w_gate[QWEN_LAYERS]; // [hidden, dim] + float *w_up[QWEN_LAYERS]; // [hidden, dim] + float *w_down[QWEN_LAYERS]; // [dim, hidden] + float *rms_final; // [dim] + // wcls = embed (tied) + + // ANE kernels (one per linear projection per layer) + ANEKernel *k_q[QWEN_LAYERS]; + ANEKernel *k_k[QWEN_LAYERS]; + ANEKernel *k_v[QWEN_LAYERS]; + ANEKernel *k_o[QWEN_LAYERS]; + ANEKernel *k_gate[QWEN_LAYERS]; + ANEKernel *k_up[QWEN_LAYERS]; + ANEKernel *k_down[QWEN_LAYERS]; + // LM head chunked: vocab too large for single ANE kernel (max 65536) + #define QWEN_LM_CHUNKS 16 + #define QWEN_LM_CHUNK_SIZE 9496 // 151936 / 16 + ANEKernel *k_lmhead[QWEN_LM_CHUNKS]; + + // Q/K/V biases per layer + float *q_bias[QWEN_LAYERS]; // [q_dim] + float *k_bias[QWEN_LAYERS]; // [kv_dim] + float *v_bias[QWEN_LAYERS]; // [kv_dim] + + // KV cache [layer][kv_heads * head_dim * max_seq] + float *kv_cache_k[QWEN_LAYERS]; + float *kv_cache_v[QWEN_LAYERS]; + int pos; // current position in sequence + + // Scratch buffers + float *x; // [dim] + float *xb; // [dim] + float *q; // [q_dim] + float *k; // [kv_dim] + float *v; // [kv_dim] + float *att; // [heads * max_seq] + float *hb; // [hidden] + float *hb2; // [hidden] + float *logits; // [vocab] +} QwenModel; + +// ── CPU ops ────────────────────────────────────────────────────────── + +static void qwen_rmsnorm(float *out, const float *x, const float *w, int D) { + float ss = 0; + for (int i = 0; i < D; i++) ss += x[i] * x[i]; + ss = 1.0f / sqrtf(ss / D + QWEN_RMS_EPS); + for (int i = 0; i < D; i++) out[i] = x[i] * ss * w[i]; +} + +static void qwen_rope(float *q, float *k, int pos, int n_q_heads, int n_kv_heads, int head_dim) { + // Qwen uses rotate_half RoPE (NOT interleaved pairs): + // rotate_half(x) = [-x[dim/2:], x[:dim/2]] + // q_embed = q * cos + rotate_half(q) * sin + // cos/sin have shape [head_dim/2] and are applied to both halves + int half = head_dim / 2; + + // Precompute cos/sin for this position (head_dim/2 frequencies) + float 
cos_v[half], sin_v[half]; + for (int i = 0; i < half; i++) { + float freq = 1.0f / powf(QWEN_ROPE_THETA, (float)(2 * i) / head_dim); + float angle = pos * freq; + cos_v[i] = cosf(angle); + sin_v[i] = sinf(angle); + } + + // Apply to Q heads + for (int h = 0; h < n_q_heads; h++) { + float *qh = q + h * head_dim; + for (int i = 0; i < half; i++) { + float q_first = qh[i]; + float q_second = qh[i + half]; + // rotate_half: [-q_second, q_first] + qh[i] = q_first * cos_v[i] + (-q_second) * sin_v[i]; + qh[i + half] = q_second * cos_v[i] + q_first * sin_v[i]; + } + } + + // Apply to K heads + for (int h = 0; h < n_kv_heads; h++) { + float *kh = k + h * head_dim; + for (int i = 0; i < half; i++) { + float k_first = kh[i]; + float k_second = kh[i + half]; + kh[i] = k_first * cos_v[i] + (-k_second) * sin_v[i]; + kh[i + half] = k_second * cos_v[i] + k_first * sin_v[i]; + } + } +} + +static void qwen_silu(float *x, int n) { + for (int i = 0; i < n; i++) + x[i] = x[i] / (1.0f + expf(-x[i])); +} + +// ── ANE projection helper (single token: spatial=1) ───────────────── + +static void ane_project(ANEKernel *kernel, const float *in, float *out, + int in_dim, int out_dim) { + // For single-token inference: spatial=1 + ane_write_input(kernel, 0, in, in_dim * sizeof(float)); + ane_eval(kernel); + ane_read_output(kernel, 0, out, out_dim * sizeof(float)); +} + +// CPU matmul via Accelerate BLAS: y = W @ x, W[out_dim, in_dim] +#include + +static void cpu_project(const float *W, const float *x, float *y, int in_dim, int out_dim) { + // y = W @ x where W is [out_dim, in_dim] row-major + // cblas_sgemv: y = alpha * A * x + beta * y + cblas_sgemv(CblasRowMajor, CblasNoTrans, + out_dim, in_dim, + 1.0f, W, in_dim, + x, 1, + 0.0f, y, 1); +} + +// Toggle: 1 = use ANE for projections, 0 = CPU fallback +#define USE_ANE_PROJECTIONS 0 + +// ── Forward one token ──────────────────────────────────────────────── + +static int qwen_forward(QwenModel *m, int token) { + int D = QWEN_DIM, HD = 
QWEN_HIDDEN; + int pos = m->pos; + + // Token embedding + memcpy(m->x, m->embed + token * D, D * sizeof(float)); + + for (int l = 0; l < QWEN_LAYERS; l++) { + // Attention RMSNorm + qwen_rmsnorm(m->xb, m->x, m->rms_att[l], D); + + // Debug: print first layer input/output norms + if (l == 0 && pos == 0) { + float xnorm = 0, qnorm = 0; + for (int i = 0; i < D; i++) xnorm += m->xb[i] * m->xb[i]; + printf(" L0 RMSNorm out norm=%.4f (first 4: %.4f %.4f %.4f %.4f)\n", + sqrtf(xnorm), m->xb[0], m->xb[1], m->xb[2], m->xb[3]); + } + + // QKV projections (ANE) + bias + #if USE_ANE_PROJECTIONS + ane_project(m->k_q[l], m->xb, m->q, D, QWEN_Q_DIM); + ane_project(m->k_k[l], m->xb, m->k, D, QWEN_KV_DIM); + ane_project(m->k_v[l], m->xb, m->v, D, QWEN_KV_DIM); + #else + cpu_project(m->wq[l], m->xb, m->q, D, QWEN_Q_DIM); + cpu_project(m->wk[l], m->xb, m->k, D, QWEN_KV_DIM); + cpu_project(m->wv[l], m->xb, m->v, D, QWEN_KV_DIM); + #endif + // Apply Q/K biases + if (m->q_bias[l]) { + for (int i = 0; i < QWEN_Q_DIM; i++) m->q[i] += m->q_bias[l][i]; + } + if (m->k_bias[l]) { + for (int i = 0; i < QWEN_KV_DIM; i++) m->k[i] += m->k_bias[l][i]; + } + if (m->v_bias[l]) { + for (int i = 0; i < QWEN_KV_DIM; i++) m->v[i] += m->v_bias[l][i]; + } + + if (l == 0 && pos == 0) { + float qn = 0; + for (int i = 0; i < QWEN_Q_DIM; i++) qn += m->q[i] * m->q[i]; + printf(" L0 ANE Q norm=%.4f (first 4: %.4f %.4f %.4f %.4f)\n", + sqrtf(qn), m->q[0], m->q[1], m->q[2], m->q[3]); + // CPU reference + float cpu_q[4] = {0}; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < D; j++) + cpu_q[i] += m->wq[0][i * D + j] * m->xb[j]; + cpu_q[i] += m->q_bias[0][i]; + } + printf(" L0 CPU Q first 4: %.4f %.4f %.4f %.4f\n", + cpu_q[0], cpu_q[1], cpu_q[2], cpu_q[3]); + } + + // RoPE + qwen_rope(m->q, m->k, pos, QWEN_HEADS, QWEN_KV_HEADS, QWEN_HEAD_DIM); + + // Store K, V in cache + memcpy(m->kv_cache_k[l] + pos * QWEN_KV_DIM, + m->k, QWEN_KV_DIM * sizeof(float)); + memcpy(m->kv_cache_v[l] + pos * QWEN_KV_DIM, + m->v, 
QWEN_KV_DIM * sizeof(float)); + + // GQA attention (CPU — element-wise ops) + float scale = 1.0f / sqrtf((float)QWEN_HEAD_DIM); + float *attn_out = m->xb; // reuse buffer + memset(attn_out, 0, QWEN_Q_DIM * sizeof(float)); + + for (int h = 0; h < QWEN_HEADS; h++) { + int kv_h = h / QWEN_GQA_FACTOR; + float *qh = m->q + h * QWEN_HEAD_DIM; + + // Attention scores: Q @ K^T for all positions up to pos + float max_score = -1e9f; + for (int t = 0; t <= pos; t++) { + float *kt = m->kv_cache_k[l] + t * QWEN_KV_DIM + kv_h * QWEN_HEAD_DIM; + // Use BLAS dot product for precision + float score = cblas_sdot(QWEN_HEAD_DIM, qh, 1, kt, 1); + m->att[h * QWEN_MAX_SEQ + t] = score * scale; + if (score * scale > max_score) max_score = score * scale; + } + // Softmax (double accumulation for precision) + double sum = 0; + for (int t = 0; t <= pos; t++) { + m->att[h * QWEN_MAX_SEQ + t] = expf(m->att[h * QWEN_MAX_SEQ + t] - max_score); + sum += (double)m->att[h * QWEN_MAX_SEQ + t]; + } + float inv_sum = (float)(1.0 / sum); + for (int t = 0; t <= pos; t++) + m->att[h * QWEN_MAX_SEQ + t] *= inv_sum; + + // Weighted sum of V: attn_out[h] += att[t] * V[t] for each t + for (int t = 0; t <= pos; t++) { + float a = m->att[h * QWEN_MAX_SEQ + t]; + float *vt = m->kv_cache_v[l] + t * QWEN_KV_DIM + kv_h * QWEN_HEAD_DIM; + cblas_saxpy(QWEN_HEAD_DIM, a, vt, 1, + attn_out + h * QWEN_HEAD_DIM, 1); + } + } + + float o_out[QWEN_DIM]; + #if USE_ANE_PROJECTIONS + ane_project(m->k_o[l], attn_out, o_out, QWEN_Q_DIM, D); + #else + cpu_project(m->wo[l], attn_out, o_out, QWEN_Q_DIM, D); + #endif + + // Residual + for (int i = 0; i < D; i++) m->x[i] += o_out[i]; + + if (l == 0 && pos == 0) { + float pan = 0; + for (int i = 0; i < D; i++) pan += m->x[i] * m->x[i]; + printf(" L0 post-attn norm=%.4f first4=[%.6f, %.6f, %.6f, %.6f]\n", + sqrtf(pan), m->x[0], m->x[1], m->x[2], m->x[3]); + float on = 0; + for (int i = 0; i < D; i++) on += o_out[i] * o_out[i]; + printf(" L0 o_proj out norm=%.4f first4=[%.6f, %.6f, 
%.6f, %.6f]\n", + sqrtf(on), o_out[0], o_out[1], o_out[2], o_out[3]); + } + + // FFN RMSNorm + qwen_rmsnorm(m->xb, m->x, m->rms_ffn[l], D); + + // SwiGLU FFN + #if USE_ANE_PROJECTIONS + ane_project(m->k_gate[l], m->xb, m->hb, D, HD); + ane_project(m->k_up[l], m->xb, m->hb2, D, HD); + #else + cpu_project(m->w_gate[l], m->xb, m->hb, D, HD); + cpu_project(m->w_up[l], m->xb, m->hb2, D, HD); + #endif + + if (l == 0 && pos == 0) { + float gn = 0, un = 0; + for (int i = 0; i < HD; i++) { gn += m->hb[i]*m->hb[i]; un += m->hb2[i]*m->hb2[i]; } + printf(" L0 gate norm=%.4f up norm=%.4f\n", sqrtf(gn), sqrtf(un)); + printf(" L0 gate first4=[%.6f, %.6f, %.6f, %.6f]\n", + m->hb[0], m->hb[1], m->hb[2], m->hb[3]); + } + + qwen_silu(m->hb, HD); + for (int i = 0; i < HD; i++) m->hb[i] *= m->hb2[i]; + + float ffn_out[QWEN_DIM]; + #if USE_ANE_PROJECTIONS + ane_project(m->k_down[l], m->hb, ffn_out, HD, D); + #else + cpu_project(m->w_down[l], m->hb, ffn_out, HD, D); + #endif + + // Residual + for (int i = 0; i < D; i++) m->x[i] += ffn_out[i]; + + // Debug: hidden state after each layer (first 3 layers, first token only) + if (l < 3 && pos == 0) { + float hn = 0; + for (int i = 0; i < D; i++) hn += m->x[i] * m->x[i]; + printf(" C hidden[%d] norm=%.4f first4=[%.4f, %.4f, %.4f, %.4f]\n", + l+1, sqrtf(hn), m->x[0], m->x[1], m->x[2], m->x[3]); + } + } + + // Final RMSNorm + qwen_rmsnorm(m->xb, m->x, m->rms_final, D); + + // Debug: check final hidden state before LM head + if (m->pos < 2) { + float fn = 0; + for (int i = 0; i < D; i++) fn += m->xb[i] * m->xb[i]; + printf(" Final hidden norm=%.4f (first 4: %.6f %.6f %.6f %.6f)\n", + sqrtf(fn), m->xb[0], m->xb[1], m->xb[2], m->xb[3]); + } + + // LM head via Accelerate BLAS: logits = embed @ xb + // embed is [vocab, dim] row-major + cblas_sgemv(CblasRowMajor, CblasNoTrans, + QWEN_VOCAB, D, + 1.0f, m->embed, D, + m->xb, 1, + 0.0f, m->logits, 1); + + // Debug: check logits + if (m->pos < 2) { + float lmax = m->logits[0], lmin = m->logits[0]; + int 
nonzero = 0; + for (int i = 0; i < QWEN_VOCAB; i++) { + if (m->logits[i] > lmax) lmax = m->logits[i]; + if (m->logits[i] < lmin) lmin = m->logits[i]; + if (m->logits[i] != 0.0f) nonzero++; + } + printf(" Logits: min=%.4f max=%.4f nonzero=%d/%d\n", lmin, lmax, nonzero, QWEN_VOCAB); + } + + m->pos++; + + // Argmax + int max_idx = 0; + float max_val = m->logits[0]; + for (int i = 1; i < QWEN_VOCAB; i++) { + if (m->logits[i] > max_val) { + max_val = m->logits[i]; + max_idx = i; + } + } + return max_idx; +} + +// ── Compile all ANE kernels ────────────────────────────────────────── + +static void qwen_compile_kernels(QwenModel *m) { + int D = QWEN_DIM, HD = QWEN_HIDDEN; + printf("Compiling %d ANE kernels...\n", QWEN_LAYERS * 7 + 1); + for (int l = 0; l < QWEN_LAYERS; l++) { + m->k_q[l] = compile_conv_kernel(m->wq[l], D, QWEN_Q_DIM, 1); + m->k_k[l] = compile_conv_kernel(m->wk[l], D, QWEN_KV_DIM, 1); + m->k_v[l] = compile_conv_kernel(m->wv[l], D, QWEN_KV_DIM, 1); + m->k_o[l] = compile_conv_kernel(m->wo[l], QWEN_Q_DIM, D, 1); + m->k_gate[l] = compile_conv_kernel(m->w_gate[l], D, HD, 1); + m->k_up[l] = compile_conv_kernel(m->w_up[l], D, HD, 1); + m->k_down[l] = compile_conv_kernel(m->w_down[l], HD, D, 1); + printf(" Layer %d/%d compiled\r", l+1, QWEN_LAYERS); + fflush(stdout); + } + // LM head (tied = embedding, chunked into 16 pieces) + for (int c = 0; c < QWEN_LM_CHUNKS; c++) { + float *chunk_weights = m->embed + c * QWEN_LM_CHUNK_SIZE * D; + m->k_lmhead[c] = compile_conv_kernel(chunk_weights, D, QWEN_LM_CHUNK_SIZE, 1); + if (!m->k_lmhead[c]) { + printf(" LM head chunk %d FAILED to compile\n", c); + } + } + printf("\nAll kernels compiled.\n"); +} + +// ── Allocate buffers ───────────────────────────────────────────────── + +static void qwen_alloc(QwenModel *m) { + m->x = (float*)calloc(QWEN_DIM, sizeof(float)); + m->xb = (float*)calloc(QWEN_DIM, sizeof(float)); + m->q = (float*)calloc(QWEN_Q_DIM, sizeof(float)); + m->k = (float*)calloc(QWEN_KV_DIM, sizeof(float)); + m->v 
def main():
    """Tokenize a chat prompt, run the ``qwen_ane`` binary on it, print the reply.

    The prompt is wrapped in the model's chat template and encoded to token
    IDs. Those IDs are passed to the binary as one space-separated
    command-line argument, followed by the generation limit. The binary
    reports generated IDs on stdout lines starting with ``OUT:``; these are
    decoded and printed.

    Exits with an error message when the request cannot fit the binary's
    context window, the binary is missing, it times out, or it returns a
    non-zero status.
    """
    import subprocess

    # Must match QWEN_MAX_SEQ in qwen_ane_infer.h: the binary's KV cache holds
    # this many positions and every prompt/generated token consumes one.
    max_seq = 512

    parser = argparse.ArgumentParser()
    parser.add_argument("prompt", type=str)
    parser.add_argument("--max-tokens", type=int, default=50)
    args = parser.parse_args()
    if args.max_tokens < 0:
        parser.error("--max-tokens must be non-negative")

    from transformers import AutoTokenizer

    print("Loading tokenizer...")
    # NOTE(review): trust_remote_code=True lets code shipped in MODEL_DIR run;
    # drop it if the stock tokenizer classes are sufficient.
    tok = AutoTokenizer.from_pretrained(str(MODEL_DIR), trust_remote_code=True)

    # Build chat template
    messages = [
        {"role": "system", "content": "You are a helpful assistant. Be concise."},
        {"role": "user", "content": args.prompt},
    ]
    text = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    input_ids = tok.encode(text)
    print(f"Prompt tokens: {len(input_ids)}")

    if len(input_ids) + args.max_tokens > max_seq:
        sys.exit(
            f"Prompt ({len(input_ids)} tokens) plus --max-tokens "
            f"({args.max_tokens}) exceeds the {max_seq}-token context of the binary")

    binary = str(INFERENCE_DIR / "qwen_ane")

    # Show the start of the tokenization so it can be verified by eye.
    print(f"First 10 tokens: {input_ids[:10]}")
    print(f"Token text: {[tok.decode([t]) for t in input_ids[:10]]}")
    print(f"\nRunning ANE inference with {len(input_ids)} prompt tokens + {args.max_tokens} generation...")

    # Token IDs travel as a single space-separated argv entry (not stdin).
    command = [
        binary,
        str(WEIGHTS_PATH),
        " ".join(str(t) for t in input_ids),
        str(args.max_tokens),
    ]
    try:
        result = subprocess.run(command, capture_output=True, text=True, timeout=120)
    except FileNotFoundError:
        sys.exit(f"Inference binary not found: {binary} (build it first)")
    except subprocess.TimeoutExpired:
        sys.exit("Inference binary timed out after 120 s")

    print(result.stdout)
    if result.stderr:
        print(result.stderr[:500], file=sys.stderr)
    if result.returncode != 0:
        sys.exit(f"Inference binary exited with status {result.returncode}")

    # Parse output token IDs from binary stdout
    output_ids = []
    for line in result.stdout.split("\n"):
        if not line.startswith("OUT:"):
            continue
        output_ids.extend(int(x) for x in line[4:].split() if x.isdigit())

    if output_ids:
        decoded = tok.decode(output_ids, skip_special_tokens=True)
        print(f"\n=== Response ===\n{decoded}")
    else:
        print("\n(No output tokens parsed — binary may need token ID input mode)")