// ANE/inference/main.m
//
// 163 lines
// 6.1 KiB
// Objective-C

// main.m — Qwen2.5-0.5B inference on Apple Neural Engine
// Compiles ANE kernels for all linear projections, runs autoregressive decode.
//
// Build:
// xcrun clang -O2 -framework Foundation -framework IOSurface \
// -framework CoreML -framework Accelerate -ldl -lobjc \
// -o qwen_ane main.m
//
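// Run (the prompt argument is a space-separated list of token IDs, not raw text;
// an optional third argument caps the number of generated tokens, default 50):
// ./qwen_ane qwen05b.bin "<token ids>" [max_gen_tokens]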
//
#import <Foundation/Foundation.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "qwen_ane_infer.h"
// File-scope model state shared by this translation unit: load_weights() stores
// the weight buffers it reads into it, and main() passes its address to
// qwen_alloc(), qwen_compile_kernels() and qwen_forward().
static QwenModel g_model;
// Allocates a buffer of `count` floats and fills it completely from `f`.
// `what` names the tensor for error messages.
// Returns the buffer, or NULL (after printing to stderr) when the allocation
// fails or the file ends before `count` floats were read.
static float *load_weights_read_tensor(FILE *f, size_t count, const char *what) {
    float *buf = (float *)malloc(count * sizeof(float));
    if (!buf) {
        fprintf(stderr, "Out of memory allocating %s (%zu floats)\n", what, count);
        return NULL;
    }
    size_t got = fread(buf, sizeof(float), count, f);
    if (got != count) {
        fprintf(stderr, "Short read for %s: expected %zu floats, got %zu\n", what, count, got);
        free(buf);
        return NULL;
    }
    return buf;
}

// Parses the config header and every tensor from an already-open weights file
// into g_model. On success stores the stream offset after the last tensor in
// *bytes_read and returns 0; returns -1 on a truncated/invalid file or on
// allocation failure. Does not close `f`.
static int load_weights_from_stream(FILE *f, long *bytes_read) {
    // Header is 7 ints; only the first six are used here.
    int config[7];
    if (fread(config, sizeof(int), 7, f) != 7) {
        fprintf(stderr, "Truncated config header\n");
        return -1;
    }
    int dim = config[0], hidden = config[1], n_layers = config[2];
    int n_heads = config[3], n_kv_heads = config[4], vocab = config[5];
    printf("Config: dim=%d hidden=%d layers=%d heads=%d kv_heads=%d vocab=%d\n",
           dim, hidden, n_layers, n_heads, n_kv_heads, vocab);

    // Reject nonsensical headers before they are turned into allocation sizes.
    if (dim <= 0 || hidden <= 0 || n_layers <= 0 ||
        n_heads <= 0 || n_kv_heads <= 0 || vocab <= 0) {
        fprintf(stderr, "Invalid config header (non-positive dimension)\n");
        return -1;
    }
    // NOTE(review): n_layers is not checked against the capacity of the
    // per-layer arrays in QwenModel (declared in qwen_ane_infer.h, not visible
    // here) — confirm and add an upper bound.

    // Do all size arithmetic in size_t so large products cannot overflow int.
    size_t d = (size_t)dim, h = (size_t)hidden, v = (size_t)vocab;
    size_t q_dim = (size_t)n_heads * QWEN_HEAD_DIM;
    size_t kv_dim = (size_t)n_kv_heads * QWEN_HEAD_DIM;

    // Embedding
    g_model.embed = load_weights_read_tensor(f, v * d, "embed");
    if (!g_model.embed) return -1;

    // Per-layer tensors, in file order. The || chain short-circuits, so reads
    // stay sequential and stop at the first failure.
    for (int l = 0; l < n_layers; l++) {
        if (!(g_model.rms_att[l] = load_weights_read_tensor(f, d, "rms_att")) ||
            !(g_model.wq[l] = load_weights_read_tensor(f, q_dim * d, "wq")) ||
            !(g_model.wk[l] = load_weights_read_tensor(f, kv_dim * d, "wk")) ||
            !(g_model.wv[l] = load_weights_read_tensor(f, kv_dim * d, "wv")) ||
            // o_proj is [dim, q_dim]
            !(g_model.wo[l] = load_weights_read_tensor(f, d * q_dim, "wo")) ||
            // Q/K/V biases
            !(g_model.q_bias[l] = load_weights_read_tensor(f, q_dim, "q_bias")) ||
            !(g_model.k_bias[l] = load_weights_read_tensor(f, kv_dim, "k_bias")) ||
            !(g_model.v_bias[l] = load_weights_read_tensor(f, kv_dim, "v_bias")) ||
            !(g_model.rms_ffn[l] = load_weights_read_tensor(f, d, "rms_ffn")) ||
            !(g_model.w_gate[l] = load_weights_read_tensor(f, h * d, "w_gate")) ||
            !(g_model.w_up[l] = load_weights_read_tensor(f, h * d, "w_up")) ||
            !(g_model.w_down[l] = load_weights_read_tensor(f, d * h, "w_down"))) {
            fprintf(stderr, "Failed loading layer %d\n", l);
            return -1;
        }
    }

    g_model.rms_final = load_weights_read_tensor(f, d, "rms_final");
    if (!g_model.rms_final) return -1;

    // Capture the offset while the stream is still open.
    long pos = ftell(f);
    *bytes_read = pos < 0 ? 0 : pos;
    return 0;
}

// Loads the model weights file at `path` into g_model.
// File layout as read here: 7-int config header, embedding table, then for each
// layer rms_att, wq, wk, wv, wo, q/k/v biases, rms_ffn, w_gate, w_up, w_down,
// and finally rms_final — all as raw floats.
// Returns 0 on success, -1 if the file cannot be opened, is truncated, has an
// invalid header, or memory runs out. On failure, tensors already stored in
// g_model are not freed; the caller in this file exits the process.
static int load_weights(const char *path) {
    FILE *f = fopen(path, "rb");
    if (!f) { fprintf(stderr, "Cannot open %s\n", path); return -1; }

    long bytes = 0;
    int rc = load_weights_from_stream(f, &bytes);
    fclose(f);
    if (rc != 0) return -1;

    printf("Weights loaded (%.0f MB)\n", (double)bytes / 1024 / 1024);
    return 0;
}
// Entry point.
//   argv[1]  path to the weights file
//   argv[2]  prompt as space-separated decimal token IDs (not raw text)
//   argv[3]  optional maximum number of tokens to generate (default 50)
// Loads weights, compiles the kernels, feeds every prompt token through
// qwen_forward(), then decodes token by token, printing generated IDs after
// "OUT:" until an end token or the generation limit. Prints timing statistics.
// Returns 0 on success, 1 on bad arguments or a failed weight load.
int main(int argc, char **argv) {
    @autoreleasepool {
        if (argc < 3) {
            fprintf(stderr, "Usage: %s <weights.bin> \"<token ids>\" [max_gen]\n", argv[0]);
            return 1;
        }
        printf("=== Qwen2.5-0.5B ANE Inference ===\n\n");

        // Validate the command line first so bad input fails before the
        // expensive weight load and kernel compilation.
        int max_gen = 50;
        if (argc >= 4) {
            char *end = NULL;
            long v = strtol(argv[3], &end, 10);
            // The (int) round-trip rejects values that do not fit in an int.
            if (end == argv[3] || *end != '\0' || v < 0 || (long)(int)v != v) {
                fprintf(stderr, "Invalid max generation count: %s\n", argv[3]);
                return 1;
            }
            max_gen = (int)v;
        }

        // Parse input token IDs (space-separated) from argv[2].
        enum { kMaxPromptTokens = 2048 };
        int prompt_ids[kMaxPromptTokens];
        int n_prompt = 0;
        char *tok_str = strdup(argv[2]);
        if (!tok_str) {
            fprintf(stderr, "Out of memory copying prompt\n");
            return 1;
        }
        char *saveptr = NULL;
        char *p = strtok_r(tok_str, " ", &saveptr);
        while (p && n_prompt < kMaxPromptTokens) {
            char *end = NULL;
            long id = strtol(p, &end, 10);
            // NOTE(review): no upper bound against the vocabulary size is
            // applied here because it is not available in this function.
            if (end == p || *end != '\0' || id < 0 || (long)(int)id != id) {
                fprintf(stderr, "Invalid token ID in prompt: %s\n", p);
                free(tok_str);
                return 1;
            }
            prompt_ids[n_prompt++] = (int)id;
            p = strtok_r(NULL, " ", &saveptr);
        }
        if (p) {
            fprintf(stderr, "Warning: prompt truncated to %d tokens\n", kMaxPromptTokens);
        }
        free(tok_str);
        if (n_prompt == 0) {
            // Without a prompt there is no first token to decode from.
            fprintf(stderr, "Prompt contains no token IDs\n");
            return 1;
        }

        // Load weights
        printf("Loading weights...\n");
        if (load_weights(argv[1]) != 0) return 1;

        // Allocate buffers
        qwen_alloc(&g_model);

        // Compile ANE kernels
        printf("Compiling ANE kernels (169 total)...\n");
        struct timespec t0, t1;
        clock_gettime(CLOCK_MONOTONIC, &t0);
        qwen_compile_kernels(&g_model);
        clock_gettime(CLOCK_MONOTONIC, &t1);
        double compile_sec = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
        printf("Compile time: %.1fs\n\n", compile_sec);

        printf("Prompt: %d tokens, generating up to %d\n", n_prompt, max_gen);
        clock_gettime(CLOCK_MONOTONIC, &t0);

        // Prefill: feed all prompt tokens; the result of the last call is the
        // first generated token.
        int next = 0;
        for (int i = 0; i < n_prompt; i++) {
            next = qwen_forward(&g_model, prompt_ids[i]);
        }
        struct timespec t_prefill;
        clock_gettime(CLOCK_MONOTONIC, &t_prefill);
        double prefill_sec = (t_prefill.tv_sec - t0.tv_sec) + (t_prefill.tv_nsec - t0.tv_nsec) / 1e9;
        // Guard the rate against a zero-length interval.
        double prefill_rate = prefill_sec > 0 ? n_prompt / prefill_sec : 0.0;
        printf("Prefill: %d tokens in %.2fs (%.1f t/s)\n", n_prompt, prefill_sec, prefill_rate);

        // Generate
        int eos = 151645;  // <|im_end|>
        int eos2 = 151643; // <|endoftext|>
        printf("OUT:");
        for (int i = 0; i < max_gen; i++) {
            printf(" %d", next);
            fflush(stdout);
            // The end token is printed, then generation stops.
            if (next == eos || next == eos2) break;
            next = qwen_forward(&g_model, next);
        }
        printf("\n");

        clock_gettime(CLOCK_MONOTONIC, &t1);
        // gen_sec spans prefill plus decode (t0 was reset just before prefill).
        double gen_sec = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
        int total_tokens = g_model.pos;
        int gen_tokens = total_tokens - n_prompt;
        double decode_sec = gen_sec - prefill_sec;
        printf("\nTotal: %d tokens in %.2fs\n", total_tokens, gen_sec);
        printf("Prefill: %.1f t/s (%d tokens)\n", prefill_rate, n_prompt);
        printf("Decode: %.1f t/s (%d tokens)\n",
               decode_sec > 0 ? gen_tokens / decode_sec : 0.0, gen_tokens);
        return 0;
    }
}