// main.m — Qwen2.5-0.5B inference on Apple Neural Engine
// Compiles ANE kernels for all linear projections, runs autoregressive decode.
//
// Build:
//   xcrun clang -O2 -framework Foundation -framework IOSurface
//     -framework CoreML -framework Accelerate -ldl -lobjc
//     -o qwen_ane main.m
//
// Run (the prompt is a quoted, space-separated list of token IDs, not text;
// the optional third argument caps the number of generated tokens, default 50):
//   ./qwen_ane qwen05b.bin "<id> <id> ..." [max_gen]
//
// NOTE(review): the system header names below were reconstructed from the
// symbols this file uses; confirm against the original include list.
#import <Foundation/Foundation.h>
#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "qwen_ane_infer.h"

// Upper bound on the number of prompt token IDs accepted on the command line.
enum { MAX_PROMPT_TOKENS = 2048 };

static QwenModel g_model;

// Allocates a buffer of `count` floats and fills it from `f`.
// Returns the buffer, or NULL (after printing a diagnostic naming `what`)
// when the count is unusable, allocation fails, or the file is too short.
static float *read_floats(FILE *f, size_t count, const char *what) {
    if (count == 0 || count > SIZE_MAX / sizeof(float)) {
        fprintf(stderr, "Invalid element count for %s\n", what);
        return NULL;
    }
    float *buf = malloc(count * sizeof *buf);
    if (!buf) {
        fprintf(stderr, "Out of memory allocating %s (%zu floats)\n", what, count);
        return NULL;
    }
    if (fread(buf, sizeof *buf, count, f) != count) {
        fprintf(stderr, "Unexpected end of file while reading %s\n", what);
        free(buf);
        return NULL;
    }
    return buf;
}

// Loads the config header and all weight tensors from `path` into g_model.
//
// File layout, in read order: 7 ints of config, the embedding table, then per
// layer: rms_att, wq, wk, wv, wo, q_bias, k_bias, v_bias, rms_ffn, w_gate,
// w_up, w_down; finally rms_final.
//
// Returns 0 on success, -1 on any open/validation/allocation/read failure.
// On failure, tensors that were already loaded stay allocated; the only
// caller in this file exits the process immediately.
static int load_weights(const char *path) {
    FILE *f = fopen(path, "rb");
    if (!f) {
        fprintf(stderr, "Cannot open %s\n", path);
        return -1;
    }

    // Read config header (the seventh value is read but not used here).
    int config[7];
    if (fread(config, sizeof(int), 7, f) != 7) {
        fprintf(stderr, "Cannot read config header from %s\n", path);
        fclose(f);
        return -1;
    }
    int dim = config[0], hidden = config[1], n_layers = config[2];
    int n_heads = config[3], n_kv_heads = config[4], vocab = config[5];
    printf("Config: dim=%d hidden=%d layers=%d heads=%d kv_heads=%d vocab=%d\n",
           dim, hidden, n_layers, n_heads, n_kv_heads, vocab);

    // Every value feeds an allocation size or a loop bound, so reject
    // non-positive entries before they are converted to size_t.
    if (dim <= 0 || hidden <= 0 || n_layers <= 0 ||
        n_heads <= 0 || n_kv_heads <= 0 || vocab <= 0) {
        fprintf(stderr, "Invalid config header in %s\n", path);
        fclose(f);
        return -1;
    }
    // NOTE(review): n_layers indexes the per-layer arrays of QwenModel, whose
    // capacity is declared in qwen_ane_infer.h and is not visible here —
    // confirm an upper-bound check against that capacity is not also needed.

    size_t d = (size_t)dim;
    size_t h = (size_t)hidden;
    size_t q_dim = (size_t)n_heads * QWEN_HEAD_DIM;
    size_t kv_dim = (size_t)n_kv_heads * QWEN_HEAD_DIM;

    // Embedding
    g_model.embed = read_floats(f, (size_t)vocab * d, "embed");
    if (!g_model.embed) goto fail;

    // Per-layer tensors. The && chain short-circuits, so reads happen
    // strictly in file order and stop at the first failure.
    for (int l = 0; l < n_layers; l++) {
        int ok =
            (g_model.rms_att[l] = read_floats(f, d, "rms_att")) &&
            (g_model.wq[l]      = read_floats(f, q_dim * d, "wq")) &&
            (g_model.wk[l]      = read_floats(f, kv_dim * d, "wk")) &&
            (g_model.wv[l]      = read_floats(f, kv_dim * d, "wv")) &&
            // o_proj is [dim, q_dim]
            (g_model.wo[l]      = read_floats(f, d * q_dim, "wo")) &&
            // Q/K/V biases
            (g_model.q_bias[l]  = read_floats(f, q_dim, "q_bias")) &&
            (g_model.k_bias[l]  = read_floats(f, kv_dim, "k_bias")) &&
            (g_model.v_bias[l]  = read_floats(f, kv_dim, "v_bias")) &&
            (g_model.rms_ffn[l] = read_floats(f, d, "rms_ffn")) &&
            (g_model.w_gate[l]  = read_floats(f, h * d, "w_gate")) &&
            (g_model.w_up[l]    = read_floats(f, h * d, "w_up")) &&
            (g_model.w_down[l]  = read_floats(f, d * h, "w_down"));
        if (!ok) {
            fprintf(stderr, "Failed while loading layer %d\n", l);
            goto fail;
        }
    }

    g_model.rms_final = read_floats(f, d, "rms_final");
    if (!g_model.rms_final) goto fail;

    // Query the offset BEFORE closing: the stream must not be used after fclose.
    long bytes_read = ftell(f);
    fclose(f);
    printf("Weights loaded (%.0f MB)\n",
           (bytes_read > 0 ? (double)bytes_read : 0.0) / 1024 / 1024);
    return 0;

fail:
    fclose(f);
    return -1;
}

// Parses `s` as a base-10 integer in [0, INT_MAX] with no trailing characters.
// Returns 0 and stores the value in *out on success, -1 otherwise.
static int parse_nonneg_int(const char *s, int *out) {
    errno = 0;
    char *end = NULL;
    long v = strtol(s, &end, 10);
    if (end == s || *end != '\0' || errno == ERANGE || v < 0 || v > INT_MAX) {
        return -1;
    }
    *out = (int)v;
    return 0;
}

// Splits `text` on whitespace and parses each piece as a token ID into `ids`.
// At most `cap` IDs are stored; extra tokens are dropped with a warning.
// Returns the number of IDs stored, or -1 (after a diagnostic) on bad input.
static int parse_prompt(const char *text, int *ids, int cap) {
    char *copy = strdup(text);  // strtok_r modifies its input
    if (!copy) {
        fprintf(stderr, "Out of memory parsing prompt\n");
        return -1;
    }

    int n = 0;
    char *saveptr = NULL;
    for (char *p = strtok_r(copy, " \t\r\n", &saveptr); p;
         p = strtok_r(NULL, " \t\r\n", &saveptr)) {
        if (n == cap) {
            fprintf(stderr, "Warning: prompt truncated to %d tokens\n", cap);
            break;
        }
        if (parse_nonneg_int(p, &ids[n]) != 0) {
            fprintf(stderr, "Invalid token ID '%s' (expected a non-negative integer)\n", p);
            free(copy);
            return -1;
        }
        n++;
    }

    free(copy);
    return n;
}

// Seconds elapsed from `start` to `end`.
static double elapsed_sec(const struct timespec *start, const struct timespec *end) {
    return (double)(end->tv_sec - start->tv_sec) +
           (double)(end->tv_nsec - start->tv_nsec) / 1e9;
}

// Tokens per second, or 0 when no measurable time elapsed.
static double tokens_per_sec(int tokens, double sec) {
    return sec > 0 ? tokens / sec : 0.0;
}

// Entry point: argv[1] = weights file, argv[2] = space-separated prompt token
// IDs, optional argv[3] = maximum number of tokens to generate (default 50).
// Prints the generated token IDs after "OUT:" followed by timing statistics.
int main(int argc, char **argv) {
    @autoreleasepool {
        if (argc < 3) {
            fprintf(stderr, "Usage: %s <weights.bin> \"<token ids>\" [max_gen]\n", argv[0]);
            return 1;
        }

        // Validate the command line first so bad input fails before the
        // weight load and kernel compilation.
        int max_gen = 50;
        if (argc >= 4 && parse_nonneg_int(argv[3], &max_gen) != 0) {
            fprintf(stderr, "Invalid max_gen '%s' (expected a non-negative integer)\n", argv[3]);
            return 1;
        }

        int prompt_ids[MAX_PROMPT_TOKENS];
        int n_prompt = parse_prompt(argv[2], prompt_ids, MAX_PROMPT_TOKENS);
        if (n_prompt < 0) return 1;
        if (n_prompt == 0) {
            // Without a prompt there is no model output to start decoding from.
            fprintf(stderr, "Prompt contains no token IDs\n");
            return 1;
        }

        printf("=== Qwen2.5-0.5B ANE Inference ===\n\n");

        // Load weights
        printf("Loading weights...\n");
        if (load_weights(argv[1]) != 0) return 1;

        // Allocate buffers
        qwen_alloc(&g_model);

        // Compile ANE kernels
        printf("Compiling ANE kernels (169 total)...\n");
        struct timespec t0, t1;
        clock_gettime(CLOCK_MONOTONIC, &t0);
        qwen_compile_kernels(&g_model);
        clock_gettime(CLOCK_MONOTONIC, &t1);
        printf("Compile time: %.1fs\n\n", elapsed_sec(&t0, &t1));

        // NOTE(review): n_prompt + max_gen is not checked against the model's
        // maximum sequence length, which is not visible in this file — confirm
        // qwen_forward guards its own position bound.
        printf("Prompt: %d tokens, generating up to %d\n", n_prompt, max_gen);

        clock_gettime(CLOCK_MONOTONIC, &t0);

        // Prefill: feed all prompt tokens; the last call's result is the
        // first generated token.
        int next = 0;
        for (int i = 0; i < n_prompt; i++) {
            next = qwen_forward(&g_model, prompt_ids[i]);
        }

        struct timespec t_prefill;
        clock_gettime(CLOCK_MONOTONIC, &t_prefill);
        double prefill_sec = elapsed_sec(&t0, &t_prefill);
        printf("Prefill: %d tokens in %.2fs (%.1f t/s)\n",
               n_prompt, prefill_sec, tokens_per_sec(n_prompt, prefill_sec));

        // Generate
        const int eos = 151645;   // <|im_end|>
        const int eos2 = 151643;  // <|endoftext|>

        printf("OUT:");
        for (int i = 0; i < max_gen; i++) {
            printf(" %d", next);
            fflush(stdout);  // stream tokens as they are produced
            if (next == eos || next == eos2) break;
            next = qwen_forward(&g_model, next);
        }
        printf("\n");

        clock_gettime(CLOCK_MONOTONIC, &t1);
        double gen_sec = elapsed_sec(&t0, &t1);  // prefill + decode
        int total_tokens = g_model.pos;
        int gen_tokens = total_tokens - n_prompt;
        double decode_sec = gen_sec - prefill_sec;

        printf("\nTotal: %d tokens in %.2fs\n", total_tokens, gen_sec);
        printf("Prefill: %.1f t/s (%d tokens)\n",
               tokens_per_sec(n_prompt, prefill_sec), n_prompt);
        printf("Decode: %.1f t/s (%d tokens)\n",
               tokens_per_sec(gen_tokens, decode_sec), gen_tokens);

        return 0;
    }
}