// Full fused forward: QKV convs → reshape → matmul(Q,K^T) → scale+mask → softmax → matmul(scores,V) → Wo conv // If ANE compiler rejects matmul, we'll know and fall back // Also test: fused scores@V + Wo (2 convs in 1 dispatch) #import #import #import #import #import #include #define DIM 768 #define HEADS 12 #define HD (DIM/HEADS) #define HIDDEN 2048 #define SEQ 64 static Class g_D, g_I, g_AR, g_AIO; static mach_timebase_info_data_t g_tb; static void ane_init(void) { dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); g_I = NSClassFromString(@"_ANEInMemoryModel"); g_AR = NSClassFromString(@"_ANERequest"); g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); } static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } static IOSurfaceRef make_surface(size_t bytes) { return IOSurfaceCreate((__bridge CFDictionaryRef)@{ (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); } static NSData *build_blob(const float *w, int oc, int ic) { int wsize = oc*ic*2, total = 128+wsize; uint8_t *buf = (uint8_t*)calloc(total,1); buf[0]=1; buf[4]=2; buf[64]=0xEF; buf[65]=0xBE; buf[66]=0xAD; buf[67]=0xDE; buf[68]=1; *(uint32_t*)(buf+72)=wsize; *(uint32_t*)(buf+80)=128; _Float16 *fp16 = (_Float16*)(buf+128); for (int i = 0; i < oc*ic; i++) fp16[i] = (_Float16)w[i]; return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; } static NSData *build_blob_fp16(_Float16 *data, int count) { int wsize = count*2, total = 128+wsize; uint8_t *buf = (uint8_t*)calloc(total,1); buf[0]=1; buf[4]=2; buf[64]=0xEF; buf[65]=0xBE; buf[66]=0xAD; buf[67]=0xDE; buf[68]=1; *(uint32_t*)(buf+72)=wsize; *(uint32_t*)(buf+80)=128; memcpy(buf+128, data, wsize); return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; } typedef struct { id model; NSString *td; } Kern; static Kern compile_mil(NSString *mil, NSDictionary *wd) { Kern k = {nil, nil}; NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, wd ?: @{}, nil); if (!desc) { printf(" desc=NULL\n"); return k; } id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; [[NSFileManager defaultManager] createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil]; [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; for (NSString *path in wd) { [wd[path][@"data"] writeToFile:[td stringByAppendingPathComponent: [path stringByReplacingOccurrencesOfString:@"@model_path/" withString:@""]] atomically:YES]; } NSError *e = nil; if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) { printf(" compile FAIL: %s\n", e?[[[e localizedDescription] substringToIndex:MIN(300,(int)[[e localizedDescription] length])] UTF8String]:""); [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; return k; } if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) { printf(" load FAIL\n"); [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; return k; } k.model = mdl; k.td = td; return k; } static BOOL ane_eval_io(Kern *k, IOSurfaceRef *ins, int nin, IOSurfaceRef *outs, int nout) { NSMutableArray *inArr = [NSMutableArray array], *inIdx = [NSMutableArray array]; NSMutableArray *outArr = [NSMutableArray array], *outIdx = [NSMutableArray array]; for (int i = 0; i < nin; i++) { [inArr addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ins[i])]; [inIdx addObject:@(i)]; } for (int i = 0; i < nout; i++) { [outArr addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), outs[i])]; [outIdx addObject:@(i)]; } id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), inArr, inIdx, outArr, outIdx, nil, nil, @0); NSError *e = nil; return ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( k->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); } static void cleanup_kern(Kern *k) { if (!k->model) return; NSError *e = nil; ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(k->model, @selector(unloadWithQoS:error:), 21, &e); [[NSFileManager defaultManager] removeItemAtPath:k->td error:nil]; } int main() { @autoreleasepool { setbuf(stdout, NULL); ane_init(); mach_timebase_info(&g_tb); srand48(42); float sc_d = 1.0f/sqrtf(DIM), sc_h = 1.0f/sqrtf(HIDDEN); float *Wq = (float*)malloc(DIM*DIM*4); for(int i=0;i({{\"coremlc-component-MIL\", \"3510.2.1\"}, " "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " "{\"coremltools-version\", \"9.0\"}})]\n{\n" " func main(tensor x) {\n" // Conv boilerplate " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" " int32 gr1 = const()[name = string(\"g1\"), val = int32(1)];\n" // QKV weights " tensor Wq = const()[name = string(\"Wq\"), " "val = tensor(BLOBFILE(path = string(\"@model_path/weights/wq.bin\"), offset = uint64(64)))];\n" " tensor Wk = const()[name = string(\"Wk\"), " "val = tensor(BLOBFILE(path = string(\"@model_path/weights/wk.bin\"), offset = uint64(64)))];\n" " tensor Wv = const()[name = string(\"Wv\"), " "val = tensor(BLOBFILE(path = string(\"@model_path/weights/wv.bin\"), offset = uint64(64)))];\n" " tensor Wout = const()[name = string(\"Wo\"), " "val = tensor(BLOBFILE(path = string(\"@model_path/weights/wo.bin\"), offset = uint64(64)))];\n" // QKV projections " tensor q_flat = conv(dilations = dl, groups = gr1, pad = pd, " "pad_type = pt, strides = st, weight = Wq, x = x)[name = string(\"cq\")];\n" " tensor k_flat = conv(dilations = dl, groups = gr1, pad = pd, " "pad_type = pt, strides = st, weight = Wk, x = x)[name = string(\"ck\")];\n" " tensor v_flat = conv(dilations = dl, groups = gr1, pad = pd, " "pad_type = pt, strides = st, weight = Wv, x = x)[name = string(\"cv\")];\n" // Reshape: [1, DIM, 1, SEQ] → [1, HEADS, HD, SEQ] → transpose → [1, HEADS, SEQ, HD] " tensor qsh = const()[name = string(\"qsh\"), val = tensor([1, %d, %d, %d])];\n" " tensor q_4d = reshape(shape = qsh, x = q_flat)[name = string(\"rq\")];\n" " tensor perm = const()[name = string(\"pm\"), val = tensor([0, 1, 3, 2])];\n" " tensor q = transpose(perm = perm, x = q_4d)[name = string(\"tq\")];\n" " tensor k_4d = reshape(shape = qsh, x = k_flat)[name = string(\"rk\")];\n" " tensor k = transpose(perm = perm, x = k_4d)[name = string(\"tk\")];\n" " tensor v_4d = reshape(shape = qsh, x = v_flat)[name = string(\"rv\")];\n" " tensor v = transpose(perm = perm, x = v_4d)[name = string(\"tv\")];\n" // Q @ K^T " bool ty = const()[name = string(\"ty\"), val = bool(true)];\n" " bool tx = const()[name = string(\"tx\"), val = bool(false)];\n" " tensor scores = matmul(transpose_x = tx, transpose_y = ty, x = q, y = k)[name = string(\"mm1\")];\n" // Scale " fp16 sc = const()[name = string(\"sc\"), val = fp16(%f)];\n" " tensor scaled = mul(x = scores, y = sc)[name = string(\"scl\")];\n" // Causal mask " tensor cmask = const()[name = string(\"cm\"), " "val = tensor(BLOBFILE(path = string(\"@model_path/weights/mask.bin\"), offset = uint64(64)))];\n" " tensor masked = add(x = scaled, y = cmask)[name = string(\"msk\")];\n" // Softmax " int32 sax = const()[name = string(\"sax\"), val = int32(-1)];\n" " tensor attn_w = softmax(axis = sax, x = masked)[name = string(\"sm\")];\n" // scores @ V " tensor attn_4d = matmul(transpose_x = tx, transpose_y = tx, x = attn_w, y = v)[name = string(\"mm2\")];\n" // Reshape back: [1, HEADS, SEQ, HD] → transpose → [1, HEADS, HD, SEQ] → reshape → [1, DIM, 1, SEQ] " tensor attn_t = transpose(perm = perm, x = attn_4d)[name = string(\"ta\")];\n" " tensor osh = const()[name = string(\"osh\"), val = tensor([1, %d, 1, %d])];\n" " tensor attn_flat = reshape(shape = osh, x = attn_t)[name = string(\"ra\")];\n" // Wo projection " tensor out = conv(dilations = dl, groups = gr1, pad = pd, " "pad_type = pt, strides = st, weight = Wout, x = attn_flat)[name = string(\"co\")];\n" " } -> (out);\n}\n", DIM, SEQ, // input DIM,DIM,DIM,DIM, DIM,DIM,DIM,DIM, // Wq, Wk DIM,DIM,DIM,DIM, DIM,DIM,DIM,DIM, // Wv, Wo DIM, SEQ, DIM, SEQ, DIM, SEQ, // q_flat, k_flat, v_flat HEADS, HD, SEQ, // reshape shape HEADS, HD, SEQ, // q_4d HEADS, SEQ, HD, // q (after transpose) HEADS, HD, SEQ, // k_4d HEADS, SEQ, HD, // k HEADS, HD, SEQ, // v_4d HEADS, SEQ, HD, // v HEADS, SEQ, SEQ, // scores scale_val, HEADS, SEQ, SEQ, // scaled SEQ, SEQ, SEQ, SEQ, // mask HEADS, SEQ, SEQ, // masked HEADS, SEQ, SEQ, // attn_w (softmax) HEADS, SEQ, HD, // attn_4d HEADS, HD, SEQ, // attn_t DIM, SEQ, // reshape back DIM, SEQ, // attn_flat DIM, SEQ]; // out NSDictionary *wd = @{ @"@model_path/weights/wq.bin": @{@"offset":@0, @"data":build_blob(Wq,DIM,DIM)}, @"@model_path/weights/wk.bin": @{@"offset":@0, @"data":build_blob(Wk,DIM,DIM)}, @"@model_path/weights/wv.bin": @{@"offset":@0, @"data":build_blob(Wv,DIM,DIM)}, @"@model_path/weights/wo.bin": @{@"offset":@0, @"data":build_blob(Wo,DIM,DIM)}, @"@model_path/weights/mask.bin": @{@"offset":@0, @"data":build_blob_fp16(mask,SEQ*SEQ)}, }; free(mask); Kern k = compile_mil(mil, wd); if (k.model) { printf(" COMPILED! Full fused attention works on ANE!\n"); // Verify vs CPU float *x = (float*)malloc(SEQ*DIM*4); for (int i = 0; i < SEQ*DIM; i++) x[i] = 0.1f*(2*drand48()-1); IOSurfaceRef ioIn = make_surface(DIM*SEQ*2); IOSurfaceRef ioOut = make_surface(DIM*SEQ*2); IOSurfaceLock(ioIn, 0, NULL); _Float16 *p = (_Float16*)IOSurfaceGetBaseAddress(ioIn); for (int t = 0; t < SEQ; t++) for (int c = 0; c < DIM; c++) p[c*SEQ+t] = (_Float16)x[t*DIM+c]; IOSurfaceUnlock(ioIn, 0, NULL); IOSurfaceRef ins[] = {ioIn}, outs[] = {ioOut}; BOOL ok = ane_eval_io(&k, ins, 1, outs, 1); printf(" Eval: %s\n", ok?"OK":"FAIL"); if (ok) { // CPU reference float *q_cpu = (float*)calloc(SEQ*DIM, 4); float *k_cpu = (float*)calloc(SEQ*DIM, 4); float *v_cpu = (float*)calloc(SEQ*DIM, 4); for (int t=0;tmaxs) maxs=s; } float sum=0; for (int t2=0;t2<=t;t2++){sc2[t2]=expf(sc2[t2]-maxs);sum+=sc2[t2];} for (int t2=0;t2<=t;t2++) sc2[t2]/=sum; for (int d=0;dmaxdiff) maxdiff=diff; } IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL); printf(" Max diff vs CPU: %.6f → %s\n", maxdiff, maxdiff<0.1f?"PASS":"FAIL"); // Benchmark for (int i=0;i<20;i++) ane_eval_io(&k, ins, 1, outs, 1); int N=500; uint64_t t0 = mach_absolute_time(); for (int i=0;i({{\"coremlc-component-MIL\", \"3510.2.1\"}, " "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " "{\"coremltools-version\", \"9.0\"}})]\n{\n" " func main(tensor x) {\n" " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" " int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n" " tensor W1 = const()[name = string(\"W1\"), " "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w1.bin\"), offset = uint64(64)))];\n" " tensor W3 = const()[name = string(\"W3\"), " "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w3.bin\"), offset = uint64(64)))];\n" " tensor W2 = const()[name = string(\"W2\"), " "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w2.bin\"), offset = uint64(64)))];\n" " tensor h1 = conv(dilations = dl, groups = gr, pad = pd, " "pad_type = pt, strides = st, weight = W1, x = x)[name = string(\"c1\")];\n" " tensor h3 = conv(dilations = dl, groups = gr, pad = pd, " "pad_type = pt, strides = st, weight = W3, x = x)[name = string(\"c3\")];\n" " tensor sig = sigmoid(x = h1)[name = string(\"sg\")];\n" " tensor silu = mul(x = h1, y = sig)[name = string(\"si\")];\n" " tensor gate = mul(x = silu, y = h3)[name = string(\"gt\")];\n" " tensor out = conv(dilations = dl, groups = gr, pad = pd, " "pad_type = pt, strides = st, weight = W2, x = gate)[name = string(\"c2\")];\n" " } -> (out);\n}\n", DIM, SEQ, HIDDEN,DIM,HIDDEN,DIM, HIDDEN,DIM,HIDDEN,DIM, DIM,HIDDEN,DIM,HIDDEN, HIDDEN,SEQ, HIDDEN,SEQ, HIDDEN,SEQ, HIDDEN,SEQ, HIDDEN,SEQ, DIM,SEQ]; NSDictionary *wd = @{ @"@model_path/weights/w1.bin": @{@"offset":@0, @"data":build_blob(W1,HIDDEN,DIM)}, @"@model_path/weights/w3.bin": @{@"offset":@0, @"data":build_blob(W3,HIDDEN,DIM)}, @"@model_path/weights/w2.bin": @{@"offset":@0, @"data":build_blob(W2,DIM,HIDDEN)}, }; Kern k = compile_mil(mil, wd); printf(" FFN: %s\n", k.model?"OK":"FAIL"); if (k.model) { IOSurfaceRef ioIn = make_surface(DIM*SEQ*2), ioOut = make_surface(DIM*SEQ*2); IOSurfaceRef ins[]={ioIn}, outs[]={ioOut}; for (int i=0;i<20;i++) ane_eval_io(&k,ins,1,outs,1); int N=500; uint64_t t0 = mach_absolute_time(); for (int i=0;i