// benchmark_ane.m — Measure ANE inference performance for Stories110M
#import "stories_io.h"
#import "stories_mil.h"

// Globals
float *embed, *rms_final;
LayerWeights lw[NLAYERS];
LayerKernels kern[NLAYERS];
IOSurfaceRef causal_mask_surf;

void load_checkpoint_inference(const char *path) {
    FILE *f = fopen(path, "rb");
    if (!f) { printf("Failed to open %s\n", path); exit(1); }
    CkptHdr hdr;
    fread(&hdr, sizeof(CkptHdr), 1, f);
    printf("Loading checkpoint: step=%d dim=%d layers=%d\n", hdr.step, hdr.dim, hdr.n_layers);
    // NOTE(review): the text is damaged from this point. The loop header below runs
    // straight into the middle of a different function, one that uses `lk` and
    // `fwdAttn_ins`, neither of which is declared anywhere in the visible text.
    // The rest of this function and the head of the next one are missing. The
    // fragment is kept exactly as found; restore it from the original file.
    for (int L=0; LfwdAttn = compile_kern_mil_w(gen_sdpa_fwd_flex(), @{}, fwdAttn_ins, 7, 6*DIM*SEQ*2);
    int fwdFFN_ins[] = { DIM*SEQ*2, DIM*2, W1_SZ*2, W2_SZ*2, W3_SZ*2 };
    lk->fwdFFN = compile_kern_mil_w(gen_ffn_fwd_flex(), @{}, fwdFFN_ins, 5, (2*DIM+3*HIDDEN)*SEQ*2);
    return lk->fwdAttn && lk->fwdFFN;
}

/// Pushes one layer's weights into the input surfaces of its two forward kernels and
/// installs `cms` as the attention kernel's causal-mask input.
///
/// @param lk  Kernels for the layer. Inputs 1-5 of `fwdAttn` and inputs 1-4 of `fwdFFN`
///            are written; input 6 of `fwdAttn` is replaced by `cms`.
/// @param w   Weights for the layer (rms_att, Wq, Wk, Wv, Wo, rms_ffn, W1, W2, W3).
/// @param cms Causal-mask surface. It is retained on behalf of `lk->fwdAttn->inputs[6]`
///            and overwritten with SEQ*SEQ*2 bytes taken from get_mask_blob().
///
/// Terminates the process if the mask blob is missing or too short to hold the mask.
static void update_fwd_ane_weights(LayerKernels *lk, LayerWeights *w, IOSurfaceRef cms) {
    // fwdAttn: x(0), rw(1), Wq(2), Wk(3), Wv(4), Wo(5), cm(6)
    io_write_fp16(lk->fwdAttn->inputs[1], w->rms_att, 1, DIM);
    io_write_fp16(lk->fwdAttn->inputs[2], w->Wq, DIM, DIM);
    io_write_fp16(lk->fwdAttn->inputs[3], w->Wk, DIM, DIM);
    io_write_fp16(lk->fwdAttn->inputs[4], w->Wv, DIM, DIM);
    io_write_fp16(lk->fwdAttn->inputs[5], w->Wo, DIM, DIM);

    // Swap causal mask surface. Retain the incoming surface BEFORE releasing the one
    // it replaces: if the slot already holds `cms` and that is its last reference,
    // releasing first would free the surface we are about to retain. The old value is
    // only released when the slot was populated, because CFRelease(NULL) traps.
    IOSurfaceRef previous_mask = lk->fwdAttn->inputs[6];
    lk->fwdAttn->inputs[6] = (IOSurfaceRef)CFRetain(cms);
    if (previous_mask) CFRelease(previous_mask);

    // NOTE(review): the kernel's request object is not touched here. The original
    // author's note says the request is opaque and was created in stories_io.h with
    // the surfaces that existed at that time, so whether it sees the swapped surface
    // is not visible from this file — confirm. The same note observes that
    // update_ane_weights in train_large only writes into existing surfaces.

    // The mask bytes are copied from offset 128 of the blob (presumably past a header —
    // confirm against get_mask_blob). The blob is fetched once and cached.
    static NSData *m_blob = nil;
    if (!m_blob) m_blob = get_mask_blob();

    // Refuse to copy from a missing or short blob: with a nil blob `[m_blob bytes]` is
    // NULL and the memcpy below would read from address 128.
    const size_t mask_offset = 128;
    const size_t mask_bytes = (size_t)(SEQ*SEQ*2);
    if (!m_blob || [m_blob length] < mask_offset + mask_bytes) {
        printf("Causal mask blob is missing or too small\n");
        exit(1);
    }

    IOSurfaceLock(cms, 0, NULL);
    memcpy(IOSurfaceGetBaseAddress(cms), (uint8_t*)[m_blob bytes]+128, SEQ*SEQ*2);
    IOSurfaceUnlock(cms, 0, NULL);

    // fwdFFN: x(0), rw(1), W1(2), W2(3), W3(4)
    io_write_fp16(lk->fwdFFN->inputs[1], w->rms_ffn, 1, DIM);
    io_write_fp16(lk->fwdFFN->inputs[2], w->W1, HIDDEN, DIM);
    io_write_fp16(lk->fwdFFN->inputs[3], w->W2, DIM, HIDDEN);
    io_write_fp16(lk->fwdFFN->inputs[4], w->W3, HIDDEN, DIM);
}

int main(int argc, char **argv) {
    @autoreleasepool {
        ane_init();
        mach_timebase_info(&g_tb);

        // Checkpoint path comes from the first argument, else a default file name.
        const char *ckpt = (argc > 1) ? argv[1] : "ane_stories110M_ckpt.bin";
        load_checkpoint_inference(ckpt);

        printf("Compiling ANE kernels...\n");
        uint64_t t_start = mach_absolute_time();
        causal_mask_surf = make_surface(SEQ*SEQ*2);
        // NOTE(review): the visible text ends in the middle of the next statement.
        for (int L=0; L