// main.m -- Qwen2.5-0.5B inference on Apple Neural Engine
// Supports four modes:
//   1. Single-shot:   ./qwen_ane weights.bin "token_ids" [max_tokens]
//   2. Stdin server:  ./qwen_ane weights.bin --server
//   3. Socket server: ./qwen_ane weights.bin --server /tmp/qwen_ane.sock
//   4. HTTP API:      ./qwen_ane weights.bin --http 8000 --model-dir ~/models/Qwen2.5-0.5B-Instruct
//
// Build:
//   xcrun clang -O3 -ffast-math -mcpu=apple-m4 -flto \
//     -framework Foundation -framework IOSurface \
//     -framework CoreML -framework Accelerate -framework Metal \
//     -ldl -lobjc -fobjc-arc -o qwen_ane main.m
//

// NOTE(review): the system header names below were reconstructed from the
// APIs this file uses (stdio, malloc, string, math, clock_gettime, signal,
// unlink/read/write, AF_UNIX sockets) -- confirm against the original file.
#import <Foundation/Foundation.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <signal.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/un.h>
#include "qwen_ane_infer.h"
#include "tokenizer.h"
#include "http_server.h"

int g_fp16_io = 0;

static QwenModel g_model;
static const char *g_sock_path = NULL;
static Tokenizer g_tokenizer;
static int g_tokenizer_loaded = 0;

// Remove the Unix socket file, if socket-server mode registered one.
static void cleanup_socket(void) {
    if (g_sock_path) unlink(g_sock_path);
}

// SIGINT/SIGTERM handler: remove the socket file and terminate immediately.
// Uses _exit() so no atexit handlers or stdio flushing run in signal context.
static void handle_signal(int sig) {
    (void)sig;
    cleanup_socket();
    _exit(0);
}

// malloc() that never returns NULL: prints a diagnostic naming `desc` and
// exits the process on failure.
static void *safe_malloc(size_t size, const char *desc) {
    void *p = malloc(size);
    if (!p) {
        fprintf(stderr, "FATAL: malloc failed for %s (%.1f MB)\n",
                desc, (double)size / (1024*1024));
        exit(1);
    }
    return p;
}

// calloc() counterpart of safe_malloc(). The size in the message is computed
// in floating point so a huge count*size cannot wrap around.
static void *safe_calloc(size_t count, size_t size, const char *desc) {
    void *p = calloc(count, size);
    if (!p) {
        fprintf(stderr, "FATAL: calloc failed for %s (%.1f MB)\n",
                desc, (double)count * (double)size / (1024*1024));
        exit(1);
    }
    return p;
}

// Read exactly `count` items of `elem_size` bytes into dst.
// Returns 0 on success, -1 (after printing a diagnostic) on a short read.
static int read_exact(FILE *f, void *dst, size_t elem_size, size_t count, const char *desc) {
    size_t got = fread(dst, elem_size, count, f);
    if (got == count) return 0;
    fprintf(stderr, "Short read for %s: expected %zu items, got %zu "
                    "(truncated or mismatched weight file?)\n", desc, count, got);
    return -1;
}

// Allocate and read `n` float32 values. Returns NULL on a short read.
static float *load_f32_tensor(FILE *f, size_t n, const char *desc) {
    float *buf = (float*)safe_malloc(n * sizeof(float), desc);
    if (read_exact(f, buf, sizeof(float), n, desc) != 0) {
        free(buf);
        return NULL;
    }
    return buf;
}

// Allocate and read `n` float16 values. Returns NULL on a short read.
static _Float16 *load_f16_tensor(FILE *f, size_t n, const char *desc) {
    _Float16 *buf = (_Float16*)safe_malloc(n * sizeof(_Float16), desc);
    if (read_exact(f, buf, sizeof(_Float16), n, desc) != 0) {
        free(buf);
        return NULL;
    }
    return buf;
}

// Allocate a float32 buffer of `n` values and fill it from `src` via
// convert_f16_to_f32().
static float *widen_f16_tensor(_Float16 *src, size_t n, const char *desc) {
    float *dst = (float*)safe_malloc(n * sizeof(float), desc);
    convert_f16_to_f32(src, dst, n);
    return dst;
}

// Allocate and read a block-quantized matrix with out_d rows, each made of
// (in_d / block_size) blocks of block_bytes bytes. Returns NULL on a short read.
static uint8_t *load_quant_tensor(FILE *f, size_t out_d, size_t in_d,
                                  size_t block_size, size_t block_bytes,
                                  const char *desc) {
    size_t n_blocks = in_d / block_size;
    size_t n_bytes = out_d * n_blocks * block_bytes;
    uint8_t *buf = (uint8_t*)safe_malloc(n_bytes, desc);
    if (read_exact(f, buf, 1, n_bytes, desc) != 0) {
        free(buf);
        return NULL;
    }
    return buf;
}

// Load model weights from `path` into g_model.
//
// File layout as read here: an int header (8 ints in the new format, 7 in the
// legacy one: dim, hidden, n_layers, n_heads, n_kv_heads, vocab, <unused here>,
// [format flag]), the F32 embedding table, then per layer: attention RMSNorm,
// Q/K/V/O projections, Q/K/V biases, FFN RMSNorm, gate/up/down projections;
// finally the last RMSNorm. Projections are stored as F32 (flag 0), F16 (1),
// Q8 (2) or Q4 (3); everything else is always F32.
//
// Returns 0 on success, -1 if the file cannot be opened, the header is
// implausible, or any read comes up short. Allocation failure exits the process.
// Buffers already attached to g_model are not released on failure (the caller
// terminates in that case).
//
// NOTE(review): n_layers from the file indexes the per-layer arrays in g_model;
// their capacity is declared in qwen_ane_infer.h and is not checked here.
static int load_weights(const char *path) {
    FILE *f = fopen(path, "rb");
    if (!f) {
        fprintf(stderr, "Cannot open %s\n", path);
        return -1;
    }

    // Try 8-int header first (new format), fall back to 7-int (legacy)
    int config[8] = {0};
    size_t hdr_read = fread(config, sizeof(int), 8, f);
    if (hdr_read < 7) {
        fprintf(stderr, "%s: truncated header (%zu of 7 ints)\n", path, hdr_read);
        fclose(f);
        return -1;
    }

    int dim = config[0], hidden = config[1], n_layers = config[2];
    int n_heads = config[3], n_kv_heads = config[4], vocab = config[5];

    int fmt_flag = 0;
    if (hdr_read == 8 && config[7] >= 0 && config[7] <= 3) {
        fmt_flag = config[7];
    } else if (fseek(f, 7 * (long)sizeof(int), SEEK_SET) != 0) {
        // Legacy header: the 8th int already belongs to the embedding data.
        perror("fseek");
        fclose(f);
        return -1;
    }

    // Every header field below is used as an allocation size or loop bound.
    if (dim <= 0 || hidden <= 0 || n_layers <= 0 ||
        n_heads <= 0 || n_kv_heads <= 0 || vocab <= 0) {
        fprintf(stderr, "%s: invalid header (dim=%d hidden=%d layers=%d heads=%d kv_heads=%d vocab=%d)\n",
                path, dim, hidden, n_layers, n_heads, n_kv_heads, vocab);
        fclose(f);
        return -1;
    }

    g_model.weight_fmt = fmt_flag;
    int is_f16 = (fmt_flag == 1);
    int is_q8 = (fmt_flag == 2);
    int is_q4 = (fmt_flag == 3);
    const char *fmt_str = is_q4 ? "Q4" : (is_q8 ? "Q8" : (is_f16 ? "F16" : "F32"));

    printf("Config: dim=%d hidden=%d layers=%d heads=%d kv_heads=%d vocab=%d fmt=%s\n",
           dim, hidden, n_layers, n_heads, n_kv_heads, vocab, fmt_str);

    int q_dim = n_heads * QWEN_HEAD_DIM;
    int kv_dim = n_kv_heads * QWEN_HEAD_DIM;
    long file_size = 0;

// Assign the result of a loader to `dst`; bail out if the read came up short.
#define LOAD_OR_FAIL(dst, expr) do { \
        (dst) = (expr); \
        if (!(dst)) goto fail; \
    } while (0)

// Load one (out_d x in_d) projection in the file's storage format.
// Q4 and Q8 both land in the *_q8 slot; F16 fills the f16 slot and a widened
// F32 copy; F32 fills the f32 slot only.
#define LOAD_PROJ(f32slot, f16slot, qslot, out_d, in_d) do { \
        size_t _out = (size_t)(out_d), _in = (size_t)(in_d); \
        if (is_q4) { \
            LOAD_OR_FAIL(qslot, load_quant_tensor(f, _out, _in, Q4_BLOCK_SIZE, Q4_BLOCK_BYTES, #qslot)); \
        } else if (is_q8) { \
            LOAD_OR_FAIL(qslot, load_quant_tensor(f, _out, _in, Q8_BLOCK_SIZE, Q8_BLOCK_BYTES, #qslot)); \
        } else if (is_f16) { \
            LOAD_OR_FAIL(f16slot, load_f16_tensor(f, _out * _in, #f16slot)); \
            f32slot = widen_f16_tensor(f16slot, _out * _in, #f32slot); \
        } else { \
            LOAD_OR_FAIL(f32slot, load_f32_tensor(f, _out * _in, #f32slot)); \
        } \
    } while (0)

    // Embeddings always F32
    LOAD_OR_FAIL(g_model.embed, load_f32_tensor(f, (size_t)vocab * (size_t)dim, "embed"));

    for (int l = 0; l < n_layers; l++) {
        // RMSNorm always F32
        LOAD_OR_FAIL(g_model.rms_att[l], load_f32_tensor(f, (size_t)dim, "rms_att"));

        // Attention projections
        LOAD_PROJ(g_model.wq[l], g_model.wq_f16[l], g_model.wq_q8[l], q_dim, dim);
        LOAD_PROJ(g_model.wk[l], g_model.wk_f16[l], g_model.wk_q8[l], kv_dim, dim);
        LOAD_PROJ(g_model.wv[l], g_model.wv_f16[l], g_model.wv_q8[l], kv_dim, dim);
        LOAD_PROJ(g_model.wo[l], g_model.wo_f16[l], g_model.wo_q8[l], dim, q_dim);

        // Biases always F32
        LOAD_OR_FAIL(g_model.q_bias[l], load_f32_tensor(f, (size_t)q_dim, "q_bias"));
        LOAD_OR_FAIL(g_model.k_bias[l], load_f32_tensor(f, (size_t)kv_dim, "k_bias"));
        LOAD_OR_FAIL(g_model.v_bias[l], load_f32_tensor(f, (size_t)kv_dim, "v_bias"));

        // FFN RMSNorm always F32
        LOAD_OR_FAIL(g_model.rms_ffn[l], load_f32_tensor(f, (size_t)dim, "rms_ffn"));

        // FFN projections
        LOAD_PROJ(g_model.w_gate[l], g_model.wgate_f16[l], g_model.wgate_q8[l], hidden, dim);
        LOAD_PROJ(g_model.w_up[l], g_model.wup_f16[l], g_model.wup_q8[l], hidden, dim);
        LOAD_PROJ(g_model.w_down[l], g_model.wdown_f16[l], g_model.wdown_q8[l], dim, hidden);
    }

    LOAD_OR_FAIL(g_model.rms_final, load_f32_tensor(f, (size_t)dim, "rms_final"));

#undef LOAD_PROJ
#undef LOAD_OR_FAIL

    // Position after the last read == number of bytes consumed.
    file_size = ftell(f);
    fclose(f);

    printf("Weights loaded (%.0f MB, %s projections)\n",
           (float)file_size / 1024 / 1024, fmt_str);
    return 0;

fail:
    fclose(f);
    return -1;
}

// Parse whitespace-separated token IDs from `str` into ids[0..max_ids).
// Returns the number of IDs stored; 0 if `str` holds none or the scratch copy
// cannot be allocated. Non-numeric words parse as 0.
static int parse_tokens(const char *str, int *ids, int max_ids) {
    static const char *delims = " \t\n\r";

    // strtok_r modifies its input, so work on a private copy.
    char *buf = strdup(str);
    if (!buf) return 0;

    int n = 0;
    char *saveptr = NULL;
    for (char *tok = strtok_r(buf, delims, &saveptr);
         tok && n < max_ids;
         tok = strtok_r(NULL, delims, &saveptr)) {
        ids[n++] = (int)strtol(tok, NULL, 10);
    }

    free(buf);
    return n;
}

// Elapsed seconds from *a to *b.
static double timespec_diff(const struct timespec *a, const struct timespec *b) {
    return (b->tv_sec - a->tv_sec) + (b->tv_nsec - a->tv_nsec) / 1e9;
}

// Run one generation pass. Writes output token IDs to out_ids, returns count.
// Uses batched prefill (sgemm) for prompt, sequential decode (sgemv) for generation.
static int generate(int *prompt_ids, int n_prompt, int max_gen,
                    int *out_ids, int max_out,
                    double *prefill_tps, double *decode_tps) {
    // Contract: feeds the n_prompt prompt tokens through the model, then emits
    // up to min(max_gen, max_out) tokens into out_ids, stopping after the first
    // end-of-sequence token (which is included in the output).
    // *prefill_tps / *decode_tps receive tokens-per-second for each phase.
    //
    // Without a prompt there is no first token to emit; the original code read
    // an uninitialized `next` in that case.
    if (n_prompt <= 0) {
        *prefill_tps = 0;
        *decode_tps = 0;
        return 0;
    }

    struct timespec t0, t1, t_pre;
    clock_gettime(CLOCK_MONOTONIC, &t0);

    int next = 0;
    if (g_model.use_ane) {
        for (int i = 0; i < n_prompt; i++)
            next = qwen_forward_ane(&g_model, prompt_ids[i]);
    } else if (n_prompt > 1 && g_model.weight_fmt == 3) {
        next = qwen_prefill_q4(&g_model, prompt_ids, n_prompt);
    } else if (n_prompt > 1 && g_model.weight_fmt != 2) {
        next = qwen_prefill(&g_model, prompt_ids, n_prompt);
    } else {
        // Single-token prompts and Q8 weights go token by token.
        for (int i = 0; i < n_prompt; i++)
            next = qwen_forward(&g_model, prompt_ids[i]);
    }

    clock_gettime(CLOCK_MONOTONIC, &t_pre);
    double ps = timespec_diff(&t0, &t_pre);
    *prefill_tps = ps > 0 ? n_prompt / ps : 0;

    // Token IDs that terminate generation.
    const int eos = 151645, eos2 = 151643;
    int n_out = 0;
    for (int i = 0; i < max_gen && n_out < max_out; i++) {
        out_ids[n_out++] = next;
        if (next == eos || next == eos2) break;
        next = g_model.use_ane ? qwen_forward_ane(&g_model, next)
                               : qwen_forward(&g_model, next);
    }

    clock_gettime(CLOCK_MONOTONIC, &t1);
    double ds = timespec_diff(&t_pre, &t1);
    // The first output token came out of prefill, so it is not a decode step.
    int gen_tokens = n_out > 1 ? n_out - 1 : 0;
    *decode_tps = ds > 0 ? gen_tokens / ds : 0;
    return n_out;
}

// Buffer capacities and defaults shared by the front ends below.
enum {
    kMaxPromptIds  = 2048,
    kMaxOutputIds  = 4096,
    kDefaultMaxGen = 50
};

// --- Stdin server mode ---

// Line-oriented server on stdin/stdout. Prints "READY", then for every input
// line runs one generation and answers with an "OUT:" line and a "PERF:" line
// (or "ERR: empty prompt"). Model state is reset after each request.
static void run_stdin_server(void) {
    printf("READY\n");
    fflush(stdout);

    char line[65536];
    while (fgets(line, sizeof(line), stdin)) {
        // Format: "token_id token_id ... [|max_tokens]"
        int max_gen = kDefaultMaxGen;
        char *pipe = strchr(line, '|');
        if (pipe) {
            max_gen = atoi(pipe + 1);
            *pipe = '\0';
        }

        int prompt_ids[kMaxPromptIds];
        int n_prompt = parse_tokens(line, prompt_ids, kMaxPromptIds);
        if (n_prompt == 0) {
            printf("ERR: empty prompt\n");
            fflush(stdout);
            continue;
        }

        int out_ids[kMaxOutputIds];
        double p_tps, d_tps;
        int n_out = generate(prompt_ids, n_prompt, max_gen, out_ids, kMaxOutputIds,
                             &p_tps, &d_tps);

        printf("OUT:");
        for (int i = 0; i < n_out; i++) printf(" %d", out_ids[i]);
        printf("\n");
        printf("PERF: prefill=%.1f decode=%.1f prompt=%d gen=%d\n",
               p_tps, d_tps, n_prompt, n_out);
        fflush(stdout);
        qwen_reset(&g_model);
    }
}

// --- Socket server mode ---

// Write the whole buffer to fd, retrying on partial writes. Gives up silently
// if the peer has gone away; there is nobody left to report the error to.
static void write_all(int fd, const char *data, size_t len) {
    while (len > 0) {
        ssize_t n = write(fd, data, len);
        if (n <= 0) return;
        data += n;
        len -= (size_t)n;
    }
}

// Extract the integer list following "tokens" in a request such as
// {"tokens": [1,2,3], "max_tokens": 50}.
// Returns the number of IDs stored (0 if the key or '[' is missing), or -1 if
// the array contains something that is not an integer.
static int parse_json_tokens(const char *json, int *ids, int max_ids) {
    const char *key = strstr(json, "\"tokens\"");
    if (!key) return 0;
    const char *bracket = strchr(key, '[');
    if (!bracket) return 0;

    int n = 0;
    const char *p = bracket + 1;
    while (n < max_ids) {
        while (*p == ' ' || *p == ',' || *p == '\t' || *p == '\n' || *p == '\r') p++;
        if (*p == '\0' || *p == ']') break;

        char *end;
        long v = strtol(p, &end, 10);
        // No progress means a non-numeric element; the original loop kept
        // appending 0 here until the buffer was full.
        if (end == p) return -1;
        ids[n++] = (int)v;
        p = end;
    }
    return n;
}

// Unix-domain socket server. Each connection carries one JSON request
// {"tokens": [...], "max_tokens": N}; the reply is one JSON line with the
// output token IDs and timing, or an {"error": ...} object.
// Returns only if the socket cannot be set up.
static void run_socket_server(const char *sock_path) {
    struct sockaddr_un addr;
    memset(&addr, 0, sizeof(addr));
    addr.sun_family = AF_UNIX;

    // Refuse paths that would be truncated: bind() and unlink() would
    // otherwise operate on different files.
    if (strlen(sock_path) >= sizeof(addr.sun_path)) {
        fprintf(stderr, "Socket path too long (max %zu bytes): %s\n",
                sizeof(addr.sun_path) - 1, sock_path);
        return;
    }
    memcpy(addr.sun_path, sock_path, strlen(sock_path) + 1);

    g_sock_path = sock_path;
    signal(SIGINT, handle_signal);
    signal(SIGTERM, handle_signal);
    // A client that disconnects before reading its reply must not kill us.
    signal(SIGPIPE, SIG_IGN);
    atexit(cleanup_socket);

    unlink(sock_path);

    int srv = socket(AF_UNIX, SOCK_STREAM, 0);
    if (srv < 0) { perror("socket"); return; }

    if (bind(srv, (struct sockaddr*)&addr, sizeof(addr)) < 0) {
        perror("bind");
        close(srv);
        return;
    }
    if (listen(srv, 4) < 0) {
        perror("listen");
        close(srv);
        return;
    }

    printf("Listening on %s\n", sock_path);
    printf("READY\n");
    fflush(stdout);

    while (1) {
        int client = accept(srv, NULL, NULL);
        if (client < 0) { perror("accept"); continue; }

        // Read request: {"tokens": [1,2,3], "max_tokens": 50}
        // Stop at the first newline or closing brace, or when the buffer is full.
        char buf[131072];
        ssize_t total = 0;
        while (total < (ssize_t)sizeof(buf) - 1) {
            ssize_t n = read(client, buf + total, sizeof(buf) - 1 - total);
            if (n <= 0) break;
            total += n;
            if (memchr(buf, '\n', total) || memchr(buf, '}', total)) break;
        }
        buf[total] = '\0';

        // Minimal JSON parsing for {"tokens": [...], "max_tokens": N}
        int prompt_ids[kMaxPromptIds];
        int n_prompt = parse_json_tokens(buf, prompt_ids, kMaxPromptIds);

        int max_gen = kDefaultMaxGen;
        char *mt = strstr(buf, "\"max_tokens\"");
        if (mt) {
            char *colon = strchr(mt, ':');
            if (colon) {
                long v = strtol(colon + 1, NULL, 10);
                // generate() never emits more than the output buffer holds,
                // so clamping here only keeps the long->int conversion safe.
                if (v < 0) v = 0;
                if (v > kMaxOutputIds) v = kMaxOutputIds;
                max_gen = (int)v;
            }
        }

        if (n_prompt <= 0) {
            const char *err = n_prompt < 0
                ? "{\"error\": \"malformed tokens array\"}\n"
                : "{\"error\": \"no tokens\"}\n";
            write_all(client, err, strlen(err));
            close(client);
            continue;
        }

        int out_ids[kMaxOutputIds];
        double p_tps, d_tps;
        int n_out = generate(prompt_ids, n_prompt, max_gen, out_ids, kMaxOutputIds,
                             &p_tps, &d_tps);

        // Build JSON response
        char resp[131072];
        int off = snprintf(resp, sizeof(resp), "{\"output\": [");
        for (int i = 0; i < n_out; i++)
            off += snprintf(resp + off, sizeof(resp) - off, "%s%d", i ? ", " : "", out_ids[i]);
        off += snprintf(resp + off, sizeof(resp) - off,
                        "], \"prefill_tps\": %.1f, \"decode_tps\": %.1f, "
                        "\"prompt_tokens\": %d, \"gen_tokens\": %d}\n",
                        p_tps, d_tps, n_prompt, n_out);
        // snprintf reports the untruncated length; never send past the buffer.
        if (off >= (int)sizeof(resp)) off = (int)sizeof(resp) - 1;

        write_all(client, resp, (size_t)off);
        close(client);

        printf("[socket] prompt=%d gen=%d prefill=%.1f decode=%.1f t/s\n",
               n_prompt, n_out, p_tps, d_tps);
        fflush(stdout);
        qwen_reset(&g_model);
    }
}

// --- HTTP API handler ---

// Copy `src` into `dst` as the contents of a JSON string literal: quotes,
// backslashes and control characters are escaped, everything else is copied
// through. Stops early (leaving valid, NUL-terminated output) when fewer than
// six bytes of room remain, since one input byte can expand to "\uXXXX".
static void json_escape(const char *src, char *dst, size_t dst_size) {
    size_t out = 0;
    for (size_t i = 0; src[i] && out + 6 < dst_size; i++) {
        unsigned char c = (unsigned char)src[i];
        switch (c) {
            case '"':  dst[out++] = '\\'; dst[out++] = '"';  break;
            case '\\': dst[out++] = '\\'; dst[out++] = '\\'; break;
            case '\n': dst[out++] = '\\'; dst[out++] = 'n';  break;
            case '\r': dst[out++] = '\\'; dst[out++] = 'r';  break;
            case '\t': dst[out++] = '\\'; dst[out++] = 't';  break;
            default:
                if (c < 0x20) {
                    out += (size_t)snprintf(dst + out, 7, "\\u%04x", c);
                } else {
                    dst[out++] = (char)c;
                }
        }
    }
    dst[out] = '\0';
}

// Request handler passed to http_serve().
//   GET  /health          -> {"status":"ok","mode":"http"}
//   POST /v1/completions  -> body fields "prompt" (required), "max_tokens"
//                            (default 50, clamped to 1..512) and "system"
//                            (optional). Replies with the generated text plus
//                            a timing breakdown.
// Anything else gets a 404. Model state is reset after every completion.
static void http_api_handler(int client_fd, HttpRequest *req, void *ctx) {
    (void)ctx;

    if (strcmp(req->method, "GET") == 0 && strcmp(req->path, "/health") == 0) {
        http_send_json(client_fd, 200, "{\"status\":\"ok\",\"mode\":\"http\"}");
        return;
    }

    if (strcmp(req->method, "POST") != 0 || strcmp(req->path, "/v1/completions") != 0) {
        http_send_json(client_fd, 404, "{\"error\":\"not found, use POST /v1/completions\"}");
        return;
    }

    if (req->body_len == 0) {
        http_send_json(client_fd, 400, "{\"error\":\"empty body\"}");
        return;
    }

    char prompt[32768];
    if (http_json_get_string(req->body, "prompt", prompt, sizeof(prompt)) < 0) {
        http_send_json(client_fd, 400, "{\"error\":\"missing 'prompt' field\"}");
        return;
    }

    int max_tokens = http_json_get_int(req->body, "max_tokens", kDefaultMaxGen);
    if (max_tokens > 512) max_tokens = 512;
    if (max_tokens < 1) max_tokens = 1;

    char system_prompt[4096];
    if (http_json_get_string(req->body, "system", system_prompt, sizeof(system_prompt)) < 0)
        snprintf(system_prompt, sizeof(system_prompt), "%s",
                 "You are a helpful assistant. Be concise.");

    // Time tokenization separately
    struct timespec t_tok0, t_tok1, t_gen0, t_gen1, t_det0, t_det1;
    clock_gettime(CLOCK_MONOTONIC, &t_tok0);
    int input_ids[4096];
    int n_input = tok_encode_chat(&g_tokenizer, system_prompt, prompt, input_ids, 4096);
    clock_gettime(CLOCK_MONOTONIC, &t_tok1);
    double tokenize_ms = timespec_diff(&t_tok0, &t_tok1) * 1000.0;

    // <= 0 rather than == 0: a negative count must not reach generate().
    if (n_input <= 0) {
        http_send_json(client_fd, 400, "{\"error\":\"tokenization produced no tokens\"}");
        return;
    }

    // Pure inference timing
    clock_gettime(CLOCK_MONOTONIC, &t_gen0);
    int out_ids[kMaxOutputIds];
    double p_tps, d_tps;
    int n_out = generate(input_ids, n_input, max_tokens, out_ids, kMaxOutputIds,
                         &p_tps, &d_tps);
    clock_gettime(CLOCK_MONOTONIC, &t_gen1);
    double inference_ms = timespec_diff(&t_gen0, &t_gen1) * 1000.0;

    // Prefill time = inference of prompt tokens only (from generate's internal timing)
    double prefill_s = p_tps > 0 ? n_input / p_tps : 0;
    double ttft_ms = prefill_s * 1000.0;

    // Time detokenization separately
    clock_gettime(CLOCK_MONOTONIC, &t_det0);
    char decoded[65536];
    tok_decode(&g_tokenizer, out_ids, n_out, decoded, sizeof(decoded));
    clock_gettime(CLOCK_MONOTONIC, &t_det1);
    double detokenize_ms = timespec_diff(&t_det0, &t_det1) * 1000.0;

    double total_ms = tokenize_ms + inference_ms + detokenize_ms;

    // Escape the decoded text for JSON
    char escaped[131072];
    json_escape(decoded, escaped, sizeof(escaped));

    // Build JSON response with detailed timing breakdown
    char resp[HTTP_MAX_RESPONSE];
    int resp_len = snprintf(resp, sizeof(resp),
        "{\"text\":\"%s\",\"prompt_tokens\":%d,\"gen_tokens\":%d,"
        "\"prefill_tps\":%.1f,\"decode_tps\":%.1f,"
        "\"tokenize_ms\":%.1f,\"inference_ms\":%.1f,\"detokenize_ms\":%.1f,"
        "\"ttft_ms\":%.1f,\"total_ms\":%.1f}",
        escaped, n_input, n_out, p_tps, d_tps,
        tokenize_ms, inference_ms, detokenize_ms, ttft_ms, total_ms);

    if (resp_len < 0 || (size_t)resp_len >= sizeof(resp)) {
        // A truncated body would be malformed JSON; report the failure instead.
        // NOTE(review): assumes http_send_json accepts status 500 -- confirm.
        http_send_json(client_fd, 500, "{\"error\":\"response too large\"}");
    } else {
        http_send_json(client_fd, 200, resp);
    }

    printf("[http] prompt=%d gen=%d prefill=%.1f decode=%.1f t/s | tok=%.1f inf=%.1f detok=%.1f ms\n",
           n_input, n_out, p_tps, d_tps, tokenize_ms, inference_ms, detokenize_ms);
    fflush(stdout);
    qwen_reset(&g_model);
}

// Entry point. argv[1] is always the weight file; the remaining arguments
// select single-shot, stdin-server, socket-server, HTTP or --test-ane mode.
// Returns 0 on success, 1 on usage/setup errors or a failed ANE self-test.
int main(int argc, char **argv) {
    @autoreleasepool {
        if (argc < 2) {
            fprintf(stderr,
                    "Usage:\n"
                    "  %s weights.bin \"token_ids\" [max_tokens]                 (single-shot)\n"
                    "  %s weights.bin --server                                 (stdin loop)\n"
                    "  %s weights.bin --server /tmp/qwen_ane.sock              (socket server)\n"
                    "  %s weights.bin --http 8000 --model-dir ~/models/Qwen2.5 (HTTP API)\n",
                    argv[0], argv[0], argv[0], argv[0]);
            return 1;
        }

        // Buffering must be configured before the first write to the stream.
        setbuf(stdout, NULL);
        printf("=== Qwen2.5-0.5B ANE Inference ===\n\n");

        // Parse flags first so a bad command line fails before the slow
        // weight load and kernel compilation.
        int server_mode = 0;
        int http_port = 0;
        int test_ane = 0;
        int use_ane = 0;
        const char *sock_path = NULL;
        const char *model_dir = NULL;

        for (int i = 2; i < argc; i++) {
            if (strcmp(argv[i], "--server") == 0) {
                server_mode = 1;
                // An optional non-flag argument is the socket path.
                if (i + 1 < argc && argv[i+1][0] != '-')
                    sock_path = argv[++i];
            } else if (strcmp(argv[i], "--http") == 0) {
                if (i + 1 >= argc) {
                    fprintf(stderr, "--http requires a port number\n");
                    return 1;
                }
                char *end;
                long port = strtol(argv[++i], &end, 10);
                if (end == argv[i] || *end != '\0' || port < 1 || port > 65535) {
                    fprintf(stderr, "--http: invalid port '%s'\n", argv[i]);
                    return 1;
                }
                http_port = (int)port;
            } else if (strcmp(argv[i], "--model-dir") == 0) {
                if (i + 1 >= argc) {
                    fprintf(stderr, "--model-dir requires a path\n");
                    return 1;
                }
                model_dir = argv[++i];
            } else if (strcmp(argv[i], "--test-ane") == 0) {
                test_ane = 1;
            } else if (strcmp(argv[i], "--ane") == 0) {
                use_ane = 1;
            }
            // Anything else is a positional argument for single-shot mode.
        }

        printf("Loading weights...\n");
        if (load_weights(argv[1]) != 0) return 1;

        qwen_alloc(&g_model);
        qwen_rope_init();

        printf("Compiling ANE kernels (169 total)...\n");
        struct timespec t0, t1;
        clock_gettime(CLOCK_MONOTONIC, &t0);
        qwen_compile_kernels(&g_model);
        clock_gettime(CLOCK_MONOTONIC, &t1);
        double compile_sec = timespec_diff(&t0, &t1);
        printf("Compile time: %.1fs\n\n", compile_sec);

        // Q4 CPU mode: dequantize Q4 to F32 at load time, use AMX cblas_sgemv
        if (g_model.weight_fmt == 3) {
            printf("Dequantizing Q4→F32 for AMX acceleration...\n");
            int q_dim = QWEN_Q_DIM, kv_dim = QWEN_KV_DIM, dim = QWEN_DIM;
            int hidden = QWEN_HIDDEN;

// Expand one Q4 matrix into a freshly allocated F32 matrix and release the
// quantized copy. Bails out of main() if the source is missing (weight file
// with fewer layers than this build expects) or memory runs out.
#define DEQUANT_Q4_TO_F32(f32ptr, q4ptr, out_d, in_d) do { \
                size_t _n = (size_t)(out_d) * (in_d); \
                if (!(q4ptr)) { \
                    fprintf(stderr, "FATAL: %s was not loaded from the weight file\n", #q4ptr); \
                    return 1; \
                } \
                f32ptr = (float*)malloc(_n * sizeof(float)); \
                if (!(f32ptr)) { \
                    fprintf(stderr, "FATAL: malloc failed for %s (%.1f MB)\n", \
                            #f32ptr, (double)(_n * sizeof(float)) / (1024*1024)); \
                    return 1; \
                } \
                dequant_q4_to_f32(q4ptr, f32ptr, (in_d), (out_d)); \
                free(q4ptr); q4ptr = NULL; \
            } while(0)

            for (int l = 0; l < QWEN_LAYERS; l++) {
                DEQUANT_Q4_TO_F32(g_model.wq[l], g_model.wq_q8[l], q_dim, dim);
                DEQUANT_Q4_TO_F32(g_model.wk[l], g_model.wk_q8[l], kv_dim, dim);
                DEQUANT_Q4_TO_F32(g_model.wv[l], g_model.wv_q8[l], kv_dim, dim);
                DEQUANT_Q4_TO_F32(g_model.wo[l], g_model.wo_q8[l], dim, q_dim);
                DEQUANT_Q4_TO_F32(g_model.w_gate[l], g_model.wgate_q8[l], hidden, dim);
                DEQUANT_Q4_TO_F32(g_model.w_up[l], g_model.wup_q8[l], hidden, dim);
                DEQUANT_Q4_TO_F32(g_model.w_down[l], g_model.wdown_q8[l], dim, hidden);
            }
#undef DEQUANT_Q4_TO_F32

            // From here on the model behaves exactly like an F32 one.
            g_model.weight_fmt = 0;
            printf("Q4→F32 done. Using AMX cblas_sgemv (91+ t/s decode).\n");
        }

        // ANE fused kernel compilation (requires F32 weights for baked-weight convs)
        if (use_ane) {
            if (g_model.weight_fmt != 0) {
                printf("--ane requires F32 weights (weight_fmt=0). Got fmt=%d\n", g_model.weight_fmt);
                printf("Re-run with F32 weight file (convert_weights.py without --f16/--q4/--q8)\n");
                use_ane = 0;
            } else {
                struct timespec ta0, ta1;
                clock_gettime(CLOCK_MONOTONIC, &ta0);
                qwen_compile_kernels_fused(&g_model);
                clock_gettime(CLOCK_MONOTONIC, &ta1);
                double ane_sec = timespec_diff(&ta0, &ta1);
                printf("ANE fused compile time: %.1fs\n", ane_sec);
                // Layer 0 stands in for all layers: if its fused kernels
                // exist, switch generation over to the ANE path.
                if (g_model.k_qkv[0] && g_model.k_o[0] && g_model.k_ffn_up[0] && g_model.k_down[0]) {
                    g_model.use_ane = 1;
                    printf("ANE fused mode active: 112 kernels (QKV+FFN_up fused)\n");
                } else {
                    printf("ANE fused compilation failed, falling back to CPU\n");
                    use_ane = 0;
                }
            }
        }

        // ANE vs CPU correctness test
        if (test_ane) {
            printf("=== ANE vs CPU Projection Test ===\n\n");

            // The CPU reference needs F32 projection weights; Q8 weight files
            // do not provide them.
            if (!g_model.wq[0] || !g_model.w_gate[0]) {
                fprintf(stderr, "--test-ane requires F32 projection weights (fmt=%d has none)\n",
                        g_model.weight_fmt);
                return 1;
            }

            // Use a realistic input: embed token 2610 ("What"), RMSNorm it
            int test_token = 2610;
            memcpy(g_model.x, g_model.embed + (size_t)test_token * QWEN_DIM, QWEN_DIM * sizeof(float));
            qwen_rmsnorm(g_model.xb, g_model.x, g_model.rms_att[0], QWEN_DIM);

            // Also prepare a realistic Q output for the O projection test
            cpu_project(g_model.wq[0], g_model.xb, g_model.q, QWEN_DIM, QWEN_Q_DIM);

            // Which activation feeds each projection. Chosen explicitly per
            // test: picking by in_dim is ambiguous when two dims coincide.
            enum { IN_XB, IN_Q, IN_HB };

            struct {
                const char *name;
                ANEKernel *kernel;
                const float *weights;
                int in_dim, out_dim;
                int input_kind;
            } tests[] = {
                {"L0 Q proj",  g_model.k_q[0],      g_model.wq[0],     QWEN_DIM,    QWEN_Q_DIM,         IN_XB},
                {"L0 K proj",  g_model.k_k[0],      g_model.wk[0],     QWEN_DIM,    QWEN_KV_DIM,        IN_XB},
                {"L0 V proj",  g_model.k_v[0],      g_model.wv[0],     QWEN_DIM,    QWEN_KV_DIM,        IN_XB},
                {"L0 O proj",  g_model.k_o[0],      g_model.wo[0],     QWEN_Q_DIM,  QWEN_DIM,           IN_Q},
                {"L0 Gate",    g_model.k_gate[0],   g_model.w_gate[0], QWEN_DIM,    QWEN_HIDDEN,        IN_XB},
                {"L0 Up",      g_model.k_up[0],     g_model.w_up[0],   QWEN_DIM,    QWEN_HIDDEN,        IN_XB},
                {"L0 Down",    g_model.k_down[0],   g_model.w_down[0], QWEN_HIDDEN, QWEN_DIM,           IN_HB},
                {"LM Head c0", g_model.k_lmhead[0], g_model.embed,     QWEN_DIM,    QWEN_LM_CHUNK_SIZE, IN_XB},
            };
            int n_tests = sizeof(tests) / sizeof(tests[0]);

            // Size the scratch buffers for the widest output of any test
            // (the LM-head chunk may be wider than QWEN_HIDDEN).
            int max_out_dim = 0;
            for (int t = 0; t < n_tests; t++)
                if (tests[t].out_dim > max_out_dim) max_out_dim = tests[t].out_dim;

            float *cpu_out = (float*)calloc((size_t)max_out_dim, sizeof(float));
            float *ane_out = (float*)calloc((size_t)max_out_dim, sizeof(float));
            if (!cpu_out || !ane_out) {
                fprintf(stderr, "FATAL: calloc failed for ANE test buffers\n");
                free(cpu_out);
                free(ane_out);
                return 1;
            }

            int all_pass = 1;

            for (int t = 0; t < n_tests; t++) {
                if (!tests[t].kernel) {
                    printf(" %-14s SKIP (kernel not compiled)\n", tests[t].name);
                    continue;
                }

                const float *input;
                switch (tests[t].input_kind) {
                    case IN_Q:
                        input = g_model.q;
                        break;
                    case IN_HB:
                        // Feed the down projection with a real gate activation.
                        cpu_project(g_model.w_gate[0], g_model.xb, g_model.hb, QWEN_DIM, QWEN_HIDDEN);
                        input = g_model.hb;
                        break;
                    default:
                        input = g_model.xb;
                        break;
                }

                cpu_project(tests[t].weights, input, cpu_out, tests[t].in_dim, tests[t].out_dim);

                // ANE projection with return-value check
                ane_write_input(tests[t].kernel, 0, input, tests[t].in_dim * sizeof(float));
                bool ane_ok = ane_run(tests[t].kernel);
                ane_read_output(tests[t].kernel, 0, ane_out, tests[t].out_dim * sizeof(float));
                if (!ane_ok) printf(" !! ANE execution returned false\n");

                float max_diff = 0, sum_diff = 0;
                float cpu_norm = 0, ane_norm = 0;
                for (int i = 0; i < tests[t].out_dim; i++) {
                    float d = fabsf(cpu_out[i] - ane_out[i]);
                    if (d > max_diff) max_diff = d;
                    sum_diff += d;
                    cpu_norm += cpu_out[i] * cpu_out[i];
                    ane_norm += ane_out[i] * ane_out[i];
                }
                float avg_diff = sum_diff / tests[t].out_dim;
                // Kept exactly as in the original: the PASS threshold below is
                // tuned to this particular ratio.
                float rel_err = (sqrtf(cpu_norm) > 0)
                    ? sqrtf(sum_diff * sum_diff / tests[t].out_dim) / sqrtf(cpu_norm / tests[t].out_dim)
                    : 0;

                int pass = (max_diff < 0.5f && rel_err < 0.05f);
                if (!pass) all_pass = 0;

                printf(" %-14s [%d→%d] max_diff=%.6f avg_diff=%.6f rel_err=%.4f %s\n",
                       tests[t].name, tests[t].in_dim, tests[t].out_dim,
                       max_diff, avg_diff, rel_err, pass ? "PASS" : "FAIL");
                printf(" CPU first4: %.6f %.6f %.6f %.6f norm=%.4f\n",
                       cpu_out[0], cpu_out[1], cpu_out[2], cpu_out[3], sqrtf(cpu_norm));
                printf(" ANE first4: %.6f %.6f %.6f %.6f norm=%.4f\n",
                       ane_out[0], ane_out[1], ane_out[2], ane_out[3], sqrtf(ane_norm));
            }

            printf("\n%s\n", all_pass
                   ? "ALL TESTS PASSED -- ANE projections match CPU (within FP16 tolerance)"
                   : "SOME TESTS FAILED -- ANE projections have accuracy issues");

            // If all pass, benchmark one layer ANE vs CPU speed
            if (all_pass) {
                printf("\n=== Speed comparison (1000 iterations, L0 Q proj %d→%d) ===\n",
                       QWEN_DIM, QWEN_Q_DIM);
                struct timespec ts0, ts1;

                clock_gettime(CLOCK_MONOTONIC, &ts0);
                for (int i = 0; i < 1000; i++)
                    cpu_project(g_model.wq[0], g_model.xb, cpu_out, QWEN_DIM, QWEN_Q_DIM);
                clock_gettime(CLOCK_MONOTONIC, &ts1);
                double cpu_us = timespec_diff(&ts0, &ts1) * 1e6 / 1000;

                clock_gettime(CLOCK_MONOTONIC, &ts0);
                for (int i = 0; i < 1000; i++)
                    ane_project(g_model.k_q[0], g_model.xb, ane_out, QWEN_DIM, QWEN_Q_DIM);
                clock_gettime(CLOCK_MONOTONIC, &ts1);
                double ane_us = timespec_diff(&ts0, &ts1) * 1e6 / 1000;

                printf(" CPU: %.1f us/call\n", cpu_us);
                printf(" ANE: %.1f us/call\n", ane_us);
                printf(" Ratio: %.2fx %s\n", cpu_us / ane_us,
                       ane_us < cpu_us ? "(ANE faster)" : "(CPU faster)");
            }

            free(cpu_out);
            free(ane_out);
            return all_pass ? 0 : 1;
        }

        if (server_mode) {
            if (sock_path)
                run_socket_server(sock_path);
            else
                run_stdin_server();
            return 0;
        }

        // HTTP API mode
        if (http_port > 0) {
            if (!model_dir) {
                // Default to ~/models/Qwen2.5-0.5B-Instruct
                static char default_dir[4096];
                const char *home = getenv("HOME");
                snprintf(default_dir, sizeof(default_dir), "%s/models/Qwen2.5-0.5B-Instruct",
                         home ? home : ".");
                model_dir = default_dir;
            }

            printf("Loading tokenizer from %s...\n", model_dir);
            if (tok_init(&g_tokenizer, model_dir) != 0) {
                fprintf(stderr, "Failed to load tokenizer from %s\n", model_dir);
                return 1;
            }
            g_tokenizer_loaded = 1;
            printf("Tokenizer ready.\n\n");

            signal(SIGINT, handle_signal);
            signal(SIGTERM, handle_signal);

            http_serve(http_port, http_api_handler, NULL);

            tok_free(&g_tokenizer);
            return 0;
        }

        // Single-shot mode (original behavior)
        if (argc < 3) {
            fprintf(stderr, "Error: provide token IDs or --server\n");
            return 1;
        }

        // argv[3] is max_tokens only when it is not a flag; otherwise
        // "--ane" and friends would parse as 0 and suppress all output.
        int max_gen = kDefaultMaxGen;
        if (argc >= 4 && argv[3][0] != '-')
            max_gen = atoi(argv[3]);

        int prompt_ids[kMaxPromptIds];
        int n_prompt = parse_tokens(argv[2], prompt_ids, kMaxPromptIds);
        if (n_prompt == 0) {
            fprintf(stderr, "Error: no token IDs found in \"%s\"\n", argv[2]);
            return 1;
        }
        printf("Prompt: %d tokens, generating up to %d\n", n_prompt, max_gen);

        int out_ids[kMaxOutputIds];
        double p_tps, d_tps;
        int n_out = generate(prompt_ids, n_prompt, max_gen, out_ids, kMaxOutputIds,
                             &p_tps, &d_tps);

        printf("OUT:");
        for (int i = 0; i < n_out; i++) printf(" %d", out_ids[i]);
        printf("\n");

        printf("\nPrefill: %.1f t/s (%d tokens)\n", p_tps, n_prompt);
        printf("Decode: %.1f t/s (%d tokens)\n", d_tps, n_out > 1 ? n_out - 1 : 0);

        return 0;
    }
}