// qwen_ane_infer.h — Qwen2.5-0.5B inference on Apple Neural Engine // Linear projections on ANE (baked-weight conv kernels), CPU for element-wise ops. // Based on maderix/ANE runtime + MIL generation. #pragma once #include "../training/ane_runtime.h" #include "../training/ane_mil_gen.h" // Compile a matmul kernel: W[out_ch, in_ch] @ x[in_ch] → y[out_ch] // Uses the two-input matmul MIL variant (weights passed as input, not baked) static ANEKernel *compile_matmul_kernel(int in_ch, int out_ch) { NSString *mil = mil_gen_matmul(in_ch, out_ch, 1); size_t inputSizes[2] = {(size_t)in_ch * 1 * 4, (size_t)out_ch * in_ch * 4}; size_t outBytes = (size_t)out_ch * 1 * 4; return ane_compile([mil dataUsingEncoding:NSUTF8StringEncoding], nil, 2, inputSizes, 1, &outBytes); } // Compile a baked-weight conv kernel (from model.h) static ANEKernel *compile_conv_kernel(const float *weights, int in_ch, int out_ch, int spatial) { NSData *wb = mil_build_weight_blob(weights, out_ch, in_ch); NSString *mil = mil_gen_conv(in_ch, out_ch, spatial); size_t inBytes = (size_t)in_ch * spatial * 4; size_t outBytes = (size_t)out_ch * spatial * 4; return ane_compile([mil dataUsingEncoding:NSUTF8StringEncoding], wb, 1, &inBytes, 1, &outBytes); } #include #include #include #include #include #ifndef QWEN_DEBUG #define QWEN_DEBUG 0 #endif // Qwen2.5-0.5B-Instruct architecture #define QWEN_DIM 896 #define QWEN_HIDDEN 4864 #define QWEN_LAYERS 24 #define QWEN_HEADS 14 #define QWEN_KV_HEADS 2 #define QWEN_HEAD_DIM 64 #define QWEN_VOCAB 151936 #define QWEN_RMS_EPS 1e-6f #define QWEN_ROPE_THETA 1000000.0f #define QWEN_MAX_SEQ 512 // GQA: each KV head serves (HEADS / KV_HEADS) query heads #define QWEN_GQA_FACTOR (QWEN_HEADS / QWEN_KV_HEADS) // Sizes for GQA projections #define QWEN_Q_DIM (QWEN_HEADS * QWEN_HEAD_DIM) // 896 #define QWEN_KV_DIM (QWEN_KV_HEADS * QWEN_HEAD_DIM) // 128 typedef struct { // Weights (f32) float *embed; // [vocab, dim] float *rms_att[QWEN_LAYERS]; // [dim] float *wq[QWEN_LAYERS]; // [q_dim, dim] float *wk[QWEN_LAYERS]; // [kv_dim, dim] float *wv[QWEN_LAYERS]; // [kv_dim, dim] float *wo[QWEN_LAYERS]; // [dim, q_dim] float *rms_ffn[QWEN_LAYERS]; // [dim] float *w_gate[QWEN_LAYERS]; // [hidden, dim] float *w_up[QWEN_LAYERS]; // [hidden, dim] float *w_down[QWEN_LAYERS]; // [dim, hidden] float *rms_final; // [dim] // wcls = embed (tied) // ANE kernels (one per linear projection per layer) ANEKernel *k_q[QWEN_LAYERS]; ANEKernel *k_k[QWEN_LAYERS]; ANEKernel *k_v[QWEN_LAYERS]; ANEKernel *k_o[QWEN_LAYERS]; ANEKernel *k_gate[QWEN_LAYERS]; ANEKernel *k_up[QWEN_LAYERS]; ANEKernel *k_down[QWEN_LAYERS]; // LM head chunked: vocab too large for single ANE kernel (max 65536) #define QWEN_LM_CHUNKS 16 #define QWEN_LM_CHUNK_SIZE 9496 // 151936 / 16 ANEKernel *k_lmhead[QWEN_LM_CHUNKS]; // Q/K/V biases per layer float *q_bias[QWEN_LAYERS]; // [q_dim] float *k_bias[QWEN_LAYERS]; // [kv_dim] float *v_bias[QWEN_LAYERS]; // [kv_dim] // KV cache [layer][kv_heads * head_dim * max_seq] float *kv_cache_k[QWEN_LAYERS]; float *kv_cache_v[QWEN_LAYERS]; int pos; // current position in sequence // Scratch buffers float *x; // [dim] float *xb; // [dim] float *q; // [q_dim] float *k; // [kv_dim] float *v; // [kv_dim] float *att; // [heads * max_seq] float *hb; // [hidden] float *hb2; // [hidden] float *logits; // [vocab] } QwenModel; // ── Precomputed RoPE table ─────────────────────────────────────────── static float g_rope_cos[QWEN_MAX_SEQ][QWEN_HEAD_DIM / 2]; static float g_rope_sin[QWEN_MAX_SEQ][QWEN_HEAD_DIM / 2]; static int g_rope_initialized = 0; static void qwen_rope_init(void) { if (g_rope_initialized) return; int half = QWEN_HEAD_DIM / 2; for (int pos = 0; pos < QWEN_MAX_SEQ; pos++) { for (int i = 0; i < half; i++) { float freq = 1.0f / powf(QWEN_ROPE_THETA, (float)(2 * i) / QWEN_HEAD_DIM); float angle = pos * freq; g_rope_cos[pos][i] = cosf(angle); g_rope_sin[pos][i] = sinf(angle); } } g_rope_initialized = 1; } // ── CPU ops (vectorized with NEON + vDSP) ──────────────────────────── static void qwen_rmsnorm(float *out, const float *x, const float *w, int D) { float ss; vDSP_svesq(x, 1, &ss, (vDSP_Length)D); ss = 1.0f / sqrtf(ss / D + QWEN_RMS_EPS); vDSP_vsmul(x, 1, &ss, out, 1, (vDSP_Length)D); vDSP_vmul(out, 1, w, 1, out, 1, (vDSP_Length)D); } static void qwen_rope(float *q, float *k, int pos, int n_q_heads, int n_kv_heads, int head_dim) { int half = head_dim / 2; const float *cv = g_rope_cos[pos]; const float *sv = g_rope_sin[pos]; for (int h = 0; h < n_q_heads; h++) { float *qh = q + h * head_dim; int i = 0; for (; i + 3 < half; i += 4) { float32x4_t first = vld1q_f32(qh + i); float32x4_t second = vld1q_f32(qh + i + half); float32x4_t c = vld1q_f32(cv + i); float32x4_t s = vld1q_f32(sv + i); vst1q_f32(qh + i, vmlsq_f32(vmulq_f32(first, c), second, s)); vst1q_f32(qh + i + half, vmlaq_f32(vmulq_f32(second, c), first, s)); } for (; i < half; i++) { float f = qh[i], se = qh[i + half]; qh[i] = f * cv[i] - se * sv[i]; qh[i + half] = se * cv[i] + f * sv[i]; } } for (int h = 0; h < n_kv_heads; h++) { float *kh = k + h * head_dim; int i = 0; for (; i + 3 < half; i += 4) { float32x4_t first = vld1q_f32(kh + i); float32x4_t second = vld1q_f32(kh + i + half); float32x4_t c = vld1q_f32(cv + i); float32x4_t s = vld1q_f32(sv + i); vst1q_f32(kh + i, vmlsq_f32(vmulq_f32(first, c), second, s)); vst1q_f32(kh + i + half, vmlaq_f32(vmulq_f32(second, c), first, s)); } for (; i < half; i++) { float f = kh[i], se = kh[i + half]; kh[i] = f * cv[i] - se * sv[i]; kh[i + half] = se * cv[i] + f * sv[i]; } } } static void qwen_silu(float *x, int n) { int i = 0; float32x4_t one = vdupq_n_f32(1.0f); for (; i + 3 < n; i += 4) { float32x4_t v = vld1q_f32(x + i); float neg[4]; vst1q_f32(neg, vnegq_f32(v)); float exp_neg[4]; for (int j = 0; j < 4; j++) exp_neg[j] = expf(neg[j]); float32x4_t denom = vaddq_f32(one, vld1q_f32(exp_neg)); vst1q_f32(x + i, vdivq_f32(v, denom)); } for (; i < n; i++) x[i] = x[i] / (1.0f + expf(-x[i])); } // ── ANE projection helper (single token: spatial=1) ───────────────── static inline bool ane_run(ANEKernel *k) { return ane_eval(k); } static void ane_project(ANEKernel *kernel, const float *in, float *out, int in_dim, int out_dim) { ane_write_input(kernel, 0, in, in_dim * sizeof(float)); ane_run(kernel); ane_read_output(kernel, 0, out, out_dim * sizeof(float)); } // CPU matmul via Accelerate BLAS: y = W @ x, W[out_dim, in_dim] static void cpu_project(const float *W, const float *x, float *y, int in_dim, int out_dim) { // y = W @ x where W is [out_dim, in_dim] row-major // cblas_sgemv: y = alpha * A * x + beta * y cblas_sgemv(CblasRowMajor, CblasNoTrans, out_dim, in_dim, 1.0f, W, in_dim, x, 1, 0.0f, y, 1); } // Toggle: 1 = use ANE for projections, 0 = CPU fallback #define USE_ANE_PROJECTIONS 0 // ── Forward one token ──────────────────────────────────────────────── static int qwen_forward(QwenModel *m, int token) { int D = QWEN_DIM, HD = QWEN_HIDDEN; int pos = m->pos; // Token embedding memcpy(m->x, m->embed + token * D, D * sizeof(float)); for (int l = 0; l < QWEN_LAYERS; l++) { // Attention RMSNorm qwen_rmsnorm(m->xb, m->x, m->rms_att[l], D); #if QWEN_DEBUG if (l == 0 && pos == 0) { float xnorm = 0; for (int i = 0; i < D; i++) xnorm += m->xb[i] * m->xb[i]; printf(" L0 RMSNorm out norm=%.4f (first 4: %.4f %.4f %.4f %.4f)\n", sqrtf(xnorm), m->xb[0], m->xb[1], m->xb[2], m->xb[3]); } #endif // QKV projections (ANE) + bias #if USE_ANE_PROJECTIONS ane_project(m->k_q[l], m->xb, m->q, D, QWEN_Q_DIM); ane_project(m->k_k[l], m->xb, m->k, D, QWEN_KV_DIM); ane_project(m->k_v[l], m->xb, m->v, D, QWEN_KV_DIM); #else cpu_project(m->wq[l], m->xb, m->q, D, QWEN_Q_DIM); cpu_project(m->wk[l], m->xb, m->k, D, QWEN_KV_DIM); cpu_project(m->wv[l], m->xb, m->v, D, QWEN_KV_DIM); #endif // Apply Q/K/V biases (vectorized) if (m->q_bias[l]) vDSP_vadd(m->q, 1, m->q_bias[l], 1, m->q, 1, (vDSP_Length)QWEN_Q_DIM); if (m->k_bias[l]) vDSP_vadd(m->k, 1, m->k_bias[l], 1, m->k, 1, (vDSP_Length)QWEN_KV_DIM); if (m->v_bias[l]) vDSP_vadd(m->v, 1, m->v_bias[l], 1, m->v, 1, (vDSP_Length)QWEN_KV_DIM); #if QWEN_DEBUG if (l == 0 && pos == 0) { float qn = 0; for (int i = 0; i < QWEN_Q_DIM; i++) qn += m->q[i] * m->q[i]; printf(" L0 ANE Q norm=%.4f (first 4: %.4f %.4f %.4f %.4f)\n", sqrtf(qn), m->q[0], m->q[1], m->q[2], m->q[3]); float cpu_q[4] = {0}; for (int i = 0; i < 4; i++) { for (int j = 0; j < D; j++) cpu_q[i] += m->wq[0][i * D + j] * m->xb[j]; cpu_q[i] += m->q_bias[0][i]; } printf(" L0 CPU Q first 4: %.4f %.4f %.4f %.4f\n", cpu_q[0], cpu_q[1], cpu_q[2], cpu_q[3]); } #endif // RoPE qwen_rope(m->q, m->k, pos, QWEN_HEADS, QWEN_KV_HEADS, QWEN_HEAD_DIM); // Store K, V in cache memcpy(m->kv_cache_k[l] + pos * QWEN_KV_DIM, m->k, QWEN_KV_DIM * sizeof(float)); memcpy(m->kv_cache_v[l] + pos * QWEN_KV_DIM, m->v, QWEN_KV_DIM * sizeof(float)); // GQA attention (CPU — element-wise ops) float scale = 1.0f / sqrtf((float)QWEN_HEAD_DIM); float *attn_out = m->xb; // reuse buffer memset(attn_out, 0, QWEN_Q_DIM * sizeof(float)); for (int h = 0; h < QWEN_HEADS; h++) { int kv_h = h / QWEN_GQA_FACTOR; float *qh = m->q + h * QWEN_HEAD_DIM; float *att_h = m->att + h * QWEN_MAX_SEQ; int seq_len = pos + 1; // Attention scores: Q @ K^T float max_score = -1e9f; for (int t = 0; t <= pos; t++) { float *kt = m->kv_cache_k[l] + t * QWEN_KV_DIM + kv_h * QWEN_HEAD_DIM; float score = cblas_sdot(QWEN_HEAD_DIM, qh, 1, kt, 1); att_h[t] = score * scale; if (att_h[t] > max_score) max_score = att_h[t]; } // Softmax: subtract max, exp, normalize (vDSP) float neg_max = -max_score; vDSP_vsadd(att_h, 1, &neg_max, att_h, 1, (vDSP_Length)seq_len); int n_exp = seq_len; vvexpf(att_h, att_h, &n_exp); float sum; vDSP_sve(att_h, 1, &sum, (vDSP_Length)seq_len); float inv_sum = 1.0f / sum; vDSP_vsmul(att_h, 1, &inv_sum, att_h, 1, (vDSP_Length)seq_len); // Weighted sum of V for (int t = 0; t <= pos; t++) { float a = att_h[t]; float *vt = m->kv_cache_v[l] + t * QWEN_KV_DIM + kv_h * QWEN_HEAD_DIM; cblas_saxpy(QWEN_HEAD_DIM, a, vt, 1, attn_out + h * QWEN_HEAD_DIM, 1); } } float o_out[QWEN_DIM]; #if USE_ANE_PROJECTIONS ane_project(m->k_o[l], attn_out, o_out, QWEN_Q_DIM, D); #else cpu_project(m->wo[l], attn_out, o_out, QWEN_Q_DIM, D); #endif // Residual (vectorized) vDSP_vadd(m->x, 1, o_out, 1, m->x, 1, (vDSP_Length)D); #if QWEN_DEBUG if (l == 0 && pos == 0) { float pan = 0; for (int i = 0; i < D; i++) pan += m->x[i] * m->x[i]; printf(" L0 post-attn norm=%.4f first4=[%.6f, %.6f, %.6f, %.6f]\n", sqrtf(pan), m->x[0], m->x[1], m->x[2], m->x[3]); float on = 0; for (int i = 0; i < D; i++) on += o_out[i] * o_out[i]; printf(" L0 o_proj out norm=%.4f first4=[%.6f, %.6f, %.6f, %.6f]\n", sqrtf(on), o_out[0], o_out[1], o_out[2], o_out[3]); } #endif // FFN RMSNorm qwen_rmsnorm(m->xb, m->x, m->rms_ffn[l], D); // SwiGLU FFN #if USE_ANE_PROJECTIONS ane_project(m->k_gate[l], m->xb, m->hb, D, HD); ane_project(m->k_up[l], m->xb, m->hb2, D, HD); #else cpu_project(m->w_gate[l], m->xb, m->hb, D, HD); cpu_project(m->w_up[l], m->xb, m->hb2, D, HD); #endif #if QWEN_DEBUG if (l == 0 && pos == 0) { float gn = 0, un = 0; for (int i = 0; i < HD; i++) { gn += m->hb[i]*m->hb[i]; un += m->hb2[i]*m->hb2[i]; } printf(" L0 gate norm=%.4f up norm=%.4f\n", sqrtf(gn), sqrtf(un)); printf(" L0 gate first4=[%.6f, %.6f, %.6f, %.6f]\n", m->hb[0], m->hb[1], m->hb[2], m->hb[3]); } #endif qwen_silu(m->hb, HD); // SiLU(gate) * up (vectorized element-wise multiply) vDSP_vmul(m->hb, 1, m->hb2, 1, m->hb, 1, (vDSP_Length)HD); float ffn_out[QWEN_DIM]; #if USE_ANE_PROJECTIONS ane_project(m->k_down[l], m->hb, ffn_out, HD, D); #else cpu_project(m->w_down[l], m->hb, ffn_out, HD, D); #endif // Residual (vectorized) vDSP_vadd(m->x, 1, ffn_out, 1, m->x, 1, (vDSP_Length)D); #if QWEN_DEBUG if (l < 3 && pos == 0) { float hn = 0; for (int i = 0; i < D; i++) hn += m->x[i] * m->x[i]; printf(" C hidden[%d] norm=%.4f first4=[%.4f, %.4f, %.4f, %.4f]\n", l+1, sqrtf(hn), m->x[0], m->x[1], m->x[2], m->x[3]); } #endif } // Final RMSNorm qwen_rmsnorm(m->xb, m->x, m->rms_final, D); #if QWEN_DEBUG if (m->pos < 2) { float fn = 0; for (int i = 0; i < D; i++) fn += m->xb[i] * m->xb[i]; printf(" Final hidden norm=%.4f (first 4: %.6f %.6f %.6f %.6f)\n", sqrtf(fn), m->xb[0], m->xb[1], m->xb[2], m->xb[3]); } #endif // LM head via Accelerate BLAS: logits = embed @ xb cblas_sgemv(CblasRowMajor, CblasNoTrans, QWEN_VOCAB, D, 1.0f, m->embed, D, m->xb, 1, 0.0f, m->logits, 1); #if QWEN_DEBUG if (m->pos < 2) { float lmax = m->logits[0], lmin = m->logits[0]; int nonzero = 0; for (int i = 0; i < QWEN_VOCAB; i++) { if (m->logits[i] > lmax) lmax = m->logits[i]; if (m->logits[i] < lmin) lmin = m->logits[i]; if (m->logits[i] != 0.0f) nonzero++; } printf(" Logits: min=%.4f max=%.4f nonzero=%d/%d\n", lmin, lmax, nonzero, QWEN_VOCAB); } #endif m->pos++; // Argmax (vDSP, single call over 151936 elements) float max_val; vDSP_Length max_idx_vdsp; vDSP_maxvi(m->logits, 1, &max_val, &max_idx_vdsp, (vDSP_Length)QWEN_VOCAB); return (int)max_idx_vdsp; } // ── Compile all ANE kernels ────────────────────────────────────────── static void qwen_compile_kernels(QwenModel *m) { #if USE_ANE_PROJECTIONS int D = QWEN_DIM, HD = QWEN_HIDDEN; printf("Compiling %d ANE kernels...\n", QWEN_LAYERS * 7 + 1); for (int l = 0; l < QWEN_LAYERS; l++) { m->k_q[l] = compile_conv_kernel(m->wq[l], D, QWEN_Q_DIM, 1); m->k_k[l] = compile_conv_kernel(m->wk[l], D, QWEN_KV_DIM, 1); m->k_v[l] = compile_conv_kernel(m->wv[l], D, QWEN_KV_DIM, 1); m->k_o[l] = compile_conv_kernel(m->wo[l], QWEN_Q_DIM, D, 1); m->k_gate[l] = compile_conv_kernel(m->w_gate[l], D, HD, 1); m->k_up[l] = compile_conv_kernel(m->w_up[l], D, HD, 1); m->k_down[l] = compile_conv_kernel(m->w_down[l], HD, D, 1); printf(" Layer %d/%d compiled\r", l+1, QWEN_LAYERS); fflush(stdout); } for (int c = 0; c < QWEN_LM_CHUNKS; c++) { float *chunk_weights = m->embed + c * QWEN_LM_CHUNK_SIZE * D; m->k_lmhead[c] = compile_conv_kernel(chunk_weights, D, QWEN_LM_CHUNK_SIZE, 1); if (!m->k_lmhead[c]) { printf(" LM head chunk %d FAILED to compile\n", c); } } printf("\nAll kernels compiled.\n"); #else printf("CPU-only mode (ANE kernel compilation skipped).\n"); (void)m; #endif } // ── Allocate buffers ───────────────────────────────────────────────── static void qwen_alloc(QwenModel *m) { m->x = (float*)calloc(QWEN_DIM, sizeof(float)); m->xb = (float*)calloc(QWEN_DIM, sizeof(float)); m->q = (float*)calloc(QWEN_Q_DIM, sizeof(float)); m->k = (float*)calloc(QWEN_KV_DIM, sizeof(float)); m->v = (float*)calloc(QWEN_KV_DIM, sizeof(float)); m->att = (float*)calloc(QWEN_HEADS * QWEN_MAX_SEQ, sizeof(float)); m->hb = (float*)calloc(QWEN_HIDDEN, sizeof(float)); m->hb2 = (float*)calloc(QWEN_HIDDEN, sizeof(float)); m->logits = (float*)calloc(QWEN_VOCAB, sizeof(float)); for (int l = 0; l < QWEN_LAYERS; l++) { m->kv_cache_k[l] = (float*)calloc(QWEN_MAX_SEQ * QWEN_KV_DIM, sizeof(float)); m->kv_cache_v[l] = (float*)calloc(QWEN_MAX_SEQ * QWEN_KV_DIM, sizeof(float)); } m->pos = 0; } static void qwen_reset(QwenModel *m) { for (int l = 0; l < QWEN_LAYERS; l++) { memset(m->kv_cache_k[l], 0, QWEN_MAX_SEQ * QWEN_KV_DIM * sizeof(float)); memset(m->kv_cache_v[l], 0, QWEN_MAX_SEQ * QWEN_KV_DIM * sizeof(float)); } m->pos = 0; }