// test_weight_patch.m — Test whether ANE weights can be patched after compile
//
// Six experiments are run against one compiled conv kernel (plus two small
// auxiliary kernels) to find out whether weights can be changed without a
// recompile:
//   1. overwrite the weight blob on disk, then unload + reload the model
//   2. scan this process' memory for the fp16 weight matrix and patch it in place
//   3. dump the method lists of a few private ANE classes
//   4. hand a _ANEWeight / IOSurface to the request's weightsBuffer slot
//   5. feed "weights" through the input IOSurface and use element-wise mul
//   6. feed a weight matrix through the input IOSurface and use matmul
//
// NOTE(review): in the reviewed copy of this file every `<...>` span was missing
// (header names after #import/#include, and the MIL `tensor<dtype, [shape]>`
// annotations, leaving format arguments without matching specifiers). They are
// restored below from the symbols used, the argument lists and the shape
// comments. `func main<ios18>` and `dict<string, string>` are presumed —
// confirm against a known-good build before merging.
#import <Foundation/Foundation.h>
#import <IOSurface/IOSurface.h>
#import <objc/runtime.h>
#import <objc/message.h>
#import <dlfcn.h>
#import <mach/mach.h>
#import <mach/mach_time.h>
#import <mach/vm_map.h>
#include <math.h>
#include <string.h>
#include "stories_io.h"

// IEEE-754 half-precision bit patterns used by the memory scan.
static const uint16_t kWPHalfOne   = 0x3C00;  // fp16(1.0)
static const uint16_t kWPHalfTwo   = 0x4000;  // fp16(2.0)
static const uint16_t kWPHalfThree = 0x4200;  // fp16(3.0)

// QoS value handed to unloadWithQoS:error: / loadWithQoS:options:error:.
static const unsigned int kWPQoS = 21;

#pragma mark - MIL generation

// Shared `program(...)` preamble used by every MIL program in this file.
static NSString *wp_mil_header(void) {
    return @"program(1.3)\n"
            "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
            "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
            "{\"coremltools-version\", \"9.0\"}})]\n{\n";
}

// MIL: fp32 in → cast fp16 → conv → cast fp32 out (matches inmem_peak.m pattern)
//
// @param ic  input channels
// @param oc  output channels
// @param sp  spatial width (the tensor is [1, C, 1, sp])
// @return    MIL program text; the conv weight is read from
//            "@model_path/weights/w.bin" at byte offset 64.
static NSString *gen_conv_mil(int ic, int oc, int sp) {
    NSMutableString *m = [NSMutableString string];
    [m appendString:wp_mil_header()];
    [m appendFormat:@" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n", ic, sp];
    [m appendString:
        @" string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
         " tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
         " tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
         " tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
         " int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
         " string to16 = const()[name = string(\"to16\"), val = string(\"fp16\")];\n"];
    [m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> xh = cast(dtype = to16, x = x)[name = string(\"cast_in\")];\n",
                    ic, sp];
    [m appendFormat:@" tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
                     "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w.bin\"), offset = uint64(64)))];\n",
                    oc, ic, oc, ic];
    [m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> yh = conv(dilations = dl, groups = gr, pad = pd, pad_type = pt, strides = st, weight = W, x = xh)"
                     "[name = string(\"conv\")];\n",
                    oc, sp];
    [m appendString:@" string to32 = const()[name = string(\"to32\"), val = string(\"fp32\")];\n"];
    [m appendFormat:@" tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = to32, x = yh)[name = string(\"cast_out\")];\n",
                    oc, sp];
    [m appendString:@" } -> (y);\n}\n"];
    return m;
}

// MIL for Approach 5. Input is [1, 2*ch, 1, sp] fp32: the first `ch` channels
// are data, the last `ch` channels are per-element weights. Output is
// data * weights, shape [1, ch, 1, sp] fp32.
static NSString *wp_gen_mul_mil(int ch, int sp) {
    NSMutableString *m = [NSMutableString string];
    [m appendString:wp_mil_header()];
    [m appendFormat:@" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n", ch * 2, sp];
    [m appendString:@" string to16 = const()[name = string(\"to16\"), val = string(\"fp16\")];\n"];
    [m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> xh = cast(dtype = to16, x = x)[name = string(\"cin\")];\n",
                    ch * 2, sp];
    [m appendString:@" tensor<int32, [4]> b0 = const()[name = string(\"b0\"), val = tensor<int32, [4]>([0,0,0,0])];\n"];
    [m appendFormat:@" tensor<int32, [4]> s0 = const()[name = string(\"s0\"), val = tensor<int32, [4]>([1,%d,1,%d])];\n",
                    ch, sp];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> data = slice_by_size(x=xh,begin=b0,size=s0)[name=string(\"data\")];\n",
                    ch, sp];
    [m appendFormat:@" tensor<int32, [4]> b1 = const()[name = string(\"b1\"), val = tensor<int32, [4]>([0,%d,0,0])];\n",
                    ch];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> wt = slice_by_size(x=xh,begin=b1,size=s0)[name=string(\"wt\")];\n",
                    ch, sp];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> yh = mul(x=data,y=wt)[name=string(\"mul\")];\n", ch, sp];
    [m appendString:@" string to32 = const()[name = string(\"to32\"), val = string(\"fp32\")];\n"];
    [m appendFormat:@" tensor<fp32, [1,%d,1,%d]> y = cast(dtype = to32, x = yh)[name = string(\"cout\")];\n",
                    ch, sp];
    [m appendString:@" } -> (y);\n}\n"];
    return m;
}

// MIL for Approach 6. Input is [1, d + d*d, 1, s] fp32:
//   channels [0, d)        activations, [1, d, 1, s]
//   channels [d, d + d*d)  row-major d×d weight matrix; only spatial index 0 is used
// The activations are reshaped/transposed to [1, 1, s, d], multiplied by the
// weight reshaped to [1, 1, d, d], and the result is transposed/reshaped back
// to [1, d, 1, s] fp32. (conv cannot be used here because its weight operand
// must be a const.)
static NSString *wp_gen_matmul_mil(int d, int s) {
    const int total_ch = d + d * d;
    NSMutableString *m = [NSMutableString string];
    [m appendString:wp_mil_header()];
    [m appendFormat:@" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n", total_ch, s];
    [m appendString:@" string to16 = const()[name = string(\"to16\"), val = string(\"fp16\")];\n"];
    [m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> xh = cast(dtype = to16, x = x)[name = string(\"cin\")];\n",
                    total_ch, s];

    // Slice activations: [1, d, 1, s]
    [m appendString:@" tensor<int32, [4]> b0 = const()[name = string(\"b0\"), val = tensor<int32, [4]>([0,0,0,0])];\n"];
    [m appendFormat:@" tensor<int32, [4]> sa = const()[name = string(\"sa\"), val = tensor<int32, [4]>([1,%d,1,%d])];\n",
                    d, s];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> act = slice_by_size(x=xh,begin=b0,size=sa)[name=string(\"act\")];\n",
                    d, s];

    // Slice the weight channels: [1, d*d, 1, s]
    [m appendFormat:@" tensor<int32, [4]> bw = const()[name = string(\"bw\"), val = tensor<int32, [4]>([0,%d,0,0])];\n",
                    d];
    [m appendFormat:@" tensor<int32, [4]> sw = const()[name = string(\"sw\"), val = tensor<int32, [4]>([1,%d,1,%d])];\n",
                    d * d, s];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> wf = slice_by_size(x=xh,begin=bw,size=sw)[name=string(\"wf\")];\n",
                    d * d, s];

    // Keep only spatial column 0 → [1, d*d, 1, 1], then reshape to [1, 1, d, d]
    [m appendFormat:@" tensor<int32, [4]> ws = const()[name = string(\"ws\"), val = tensor<int32, [4]>([1, 1, %d, %d])];\n",
                    d, d];
    [m appendFormat:@" tensor<int32, [4]> sw1 = const()[name = string(\"sw1\"), val = tensor<int32, [4]>([1,%d,1,1])];\n",
                    d * d];
    [m appendFormat:@" tensor<fp16, [1,%d,1,1]> wf1 = slice_by_size(x=wf,begin=b0,size=sw1)[name=string(\"wf1\")];\n",
                    d * d];
    [m appendFormat:@" tensor<fp16, [1,1,%d,%d]> W = reshape(shape=ws,x=wf1)[name=string(\"W\")];\n", d, d];

    // Activations: [1, d, 1, s] → [1, 1, d, s] → transpose → [1, 1, s, d]
    [m appendFormat:@" tensor<int32, [4]> as2 = const()[name = string(\"as2\"), val = tensor<int32, [4]>([1, 1, %d, %d])];\n",
                    d, s];
    [m appendString:@" tensor<int32, [4]> pm = const()[name = string(\"pm\"), val = tensor<int32, [4]>([0, 1, 3, 2])];\n"];
    [m appendFormat:@" tensor<fp16, [1,1,%d,%d]> a2 = reshape(shape=as2,x=act)[name=string(\"a2\")];\n", d, s];
    [m appendFormat:@" tensor<fp16, [1,1,%d,%d]> a3 = transpose(perm=pm,x=a2)[name=string(\"a3\")];\n", s, d];

    // matmul: [1,1,s,d] @ [1,1,d,d] → [1,1,s,d]
    [m appendString:@" bool bF = const()[name = string(\"bF\"), val = bool(false)];\n"];
    [m appendFormat:@" tensor<fp16, [1,1,%d,%d]> yh = matmul(transpose_x = bF, transpose_y = bF, x = a3, y = W)[name = string(\"mm\")];\n",
                    s, d];

    // Back to [1, d, 1, s]
    [m appendFormat:@" tensor<fp16, [1,1,%d,%d]> yt = transpose(perm=pm,x=yh)[name=string(\"yt\")];\n", d, s];
    [m appendFormat:@" tensor<int32, [4]> os = const()[name = string(\"os\"), val = tensor<int32, [4]>([1,%d,1,%d])];\n",
                    d, s];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> yr = reshape(shape=os,x=yt)[name=string(\"yr\")];\n", d, s];
    [m appendString:@" string to32 = const()[name = string(\"to32\"), val = string(\"fp32\")];\n"];
    [m appendFormat:@" tensor<fp32, [1,%d,1,%d]> y = cast(dtype = to32, x = yr)[name = string(\"cout\")];\n", d, s];
    [m appendString:@" } -> (y);\n}\n"];
    return m;
}

#pragma mark - Small helpers

// Allocates a zeroed oc×ic fp32 matrix (row stride ic) with `scale` on the
// diagonal. Exits the process on allocation failure. Caller frees.
static float *wp_scaled_identity(int oc, int ic, float scale) {
    float *w = (float *)calloc((size_t)oc * (size_t)ic, sizeof(float));
    if (!w) {
        fprintf(stderr, "out of memory allocating %dx%d weights\n", oc, ic);
        exit(1);
    }
    const int diag = oc < ic ? oc : ic;
    for (int i = 0; i < diag; i++) w[i * ic + i] = scale;
    return w;
}

// Mean of out[i]/in[i] over the elements where |in[i]| > 0.01; 0 if there are none.
static float wp_mean_ratio(const float *out, const float *in, int n) {
    float sum = 0;
    int cnt = 0;
    for (int i = 0; i < n; i++) {
        if (fabsf(in[i]) > 0.01f) {
            sum += out[i] / in[i];
            cnt++;
        }
    }
    return cnt > 0 ? sum / cnt : 0.0f;
}

// Prints the first four outputs of `k` (prefixed by `indent`) and returns the
// mean output/input ratio over the first `n` fp32 elements. Both surfaces are
// held under a read-only lock while they are read. Requires n >= 4.
static float wp_report_output_ratio(Kern *k, int n, const char *indent) {
    IOSurfaceLock(k->ioIn, kIOSurfaceLockReadOnly, NULL);
    IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
    const float *in = (const float *)IOSurfaceGetBaseAddress(k->ioIn);
    const float *out = (const float *)IOSurfaceGetBaseAddress(k->ioOut);
    printf("%sOut: [%.3f, %.3f, %.3f, %.3f]\n", indent, out[0], out[1], out[2], out[3]);
    const float ratio = wp_mean_ratio(out, in, n);
    IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
    IOSurfaceUnlock(k->ioIn, kIOSurfaceLockReadOnly, NULL);
    return ratio;
}

// Builds a fresh request over k's input/output surfaces with the given
// weightsBuffer (may be nil) and installs it in k->request.
//
// The previous request is released only after the new one exists, and only if
// it is non-NULL: CFRelease(NULL) crashes, and keeping the old request on
// failure leaves `k` in a state free_kern() can still clean up.
//
// @return YES if k->request was replaced, NO if the request could not be built.
static BOOL wp_set_request(Kern *k, id weightsBuffer) {
    id wIn = ((id(*)(Class, SEL, IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), k->ioIn);
    id wOut = ((id(*)(Class, SEL, IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), k->ioOut);
    if (!wIn || !wOut) return NO;  // @[nil] would throw

    id req = ((id(*)(Class, SEL, id, id, id, id, id, id, id))objc_msgSend)(
        g_AR,
        @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
        @[wIn], @[@0], @[wOut], @[@0], weightsBuffer, nil, @0);
    if (!req) return NO;

    if (k->request) CFRelease(k->request);
    k->request = (void *)CFBridgingRetain(req);
    return YES;
}

// Returns 1 or 2 when the oc×ic fp16 matrix at `p` is exactly 1× or 2×
// identity, otherwise 0. The first element is tested first so that almost
// every candidate address is rejected after one comparison.
static int wp_identity_multiple(const uint16_t *p, int oc, int ic) {
    uint16_t diag;
    if (p[0] == kWPHalfOne) diag = kWPHalfOne;
    else if (p[0] == kWPHalfTwo) diag = kWPHalfTwo;
    else return 0;

    for (int r = 0; r < oc; r++) {
        for (int c = 0; c < ic; c++) {
            if (p[r * ic + c] != ((r == c) ? diag : 0)) return 0;
        }
    }
    return diag == kWPHalfOne ? 1 : 2;
}

// Walks every readable VM region of this task looking for oc×ic fp16
// 1×/2× identity matrices.
//
//   patch == NO : prints each hit and counts it in *found1x / *found2x.
//   patch == YES: overwrites each hit that lives in a writable region with
//                 3× identity (counters are not touched and may be NULL).
//
// NOTE(review): regions are dereferenced directly rather than through
// vm_read; this relies on `protection & VM_PROT_READ` being sufficient for
// the access not to fault — kept as in the original experiment.
static void wp_scan_identity_weights(int oc, int ic, BOOL patch, int *found1x, int *found2x) {
    const size_t need = (size_t)oc * (size_t)ic * sizeof(uint16_t);
    const mach_port_t task = mach_task_self();
    vm_address_t addr = 0;
    vm_size_t sz = 0;
    natural_t depth = 1;

    for (;;) {
        struct vm_region_submap_info_64 info;
        mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_COUNT_64;
        if (vm_region_recurse_64(task, &addr, &sz, &depth, (vm_region_recurse_info_t)&info, &count) != KERN_SUCCESS)
            break;
        if (info.is_submap) {
            depth++;  // descend into the submap at the same address
            continue;
        }

        if ((info.protection & VM_PROT_READ) && sz >= need) {
            uint8_t *base = (uint8_t *)addr;
            for (size_t off = 0; off + need <= sz; off += 2) {
                uint16_t *p = (uint16_t *)(base + off);
                const int w = wp_identity_multiple(p, oc, ic);
                if (!w) continue;

                if (!patch) {
                    if (w == 1) { if (found1x) (*found1x)++; }
                    else        { if (found2x) (*found2x)++; }
                    printf(" FOUND %dx @%p prot=%d/%d %s\n", w, (void *)(addr + off),
                           info.protection, info.max_protection,
                           (info.protection & VM_PROT_WRITE) ? "WR" : "RO");
                } else if (info.protection & VM_PROT_WRITE) {
                    printf(" Patching %dx @%p to 3x\n", w, (void *)(addr + off));
                    for (int r = 0; r < oc; r++)
                        for (int c = 0; c < ic; c++)
                            p[r * ic + c] = (r == c) ? kWPHalfThree : 0;  // fp16(3.0)
                }
            }
        }
        addr += sz;
    }
}

// Prints every selector in cls's own method list, one per line, tagged with `tag`.
static void wp_print_methods(Class cls, char tag) {
    unsigned int n = 0;
    Method *list = class_copyMethodList(cls, &n);
    for (unsigned int j = 0; j < n; j++) printf(" %c %s\n", tag, sel_getName(method_getName(list[j])));
    free(list);
}

// Prints "<key>: <description>" for a KVC key. valueForKey: raises for
// undefined keys; that is expected while probing private classes, so the
// exception is deliberately swallowed and nothing is printed.
static void wp_print_kvc(id obj, NSString *key) {
    @try {
        const char *desc = [[[obj valueForKey:key] description] UTF8String];
        printf("%s: %s\n", [key UTF8String], desc ? desc : "(null)");
    } @catch (id ex) {
        (void)ex;
    }
}

#pragma mark - Approaches 5 and 6 (weights delivered through the input surface)

// Approach 5: element-wise mul where the "weights" are the second half of the
// input surface. Runs once with w=2.0, rewrites the surface to w=5.0 and runs
// again without recompiling.
static void wp_test_dynamic_mul(int ch, int sp) {
    const int in_bytes = ch * 2 * sp * 4;
    const int out_bytes = ch * sp * 4;
    Kern *k5 = compile_kern_mil_w(wp_gen_mul_mil(ch, sp), @{}, in_bytes, out_bytes);
    if (!k5) {
        printf("Compile FAILED\n");
        return;
    }
    printf("Compile OK!\n");

    const int n = ch * sp;
    IOSurfaceLock(k5->ioIn, 0, NULL);
    float *in5 = (float *)IOSurfaceGetBaseAddress(k5->ioIn);
    for (int i = 0; i < n; i++) in5[i] = (i % 100) * 0.01f;
    for (int i = 0; i < n; i++) in5[n + i] = 2.0f;
    // Snapshot what we print later so the surface is not read while unlocked.
    const float d0 = in5[0], d1 = in5[1], d2 = in5[2];
    IOSurfaceUnlock(k5->ioIn, 0, NULL);

    ane_eval(k5);
    IOSurfaceLock(k5->ioOut, kIOSurfaceLockReadOnly, NULL);
    const float *out5 = (const float *)IOSurfaceGetBaseAddress(k5->ioOut);
    printf("data=[%.3f,%.3f,%.3f], w=2.0 → out=[%.3f,%.3f,%.3f]\n", d0, d1, d2, out5[0], out5[1], out5[2]);
    IOSurfaceUnlock(k5->ioOut, kIOSurfaceLockReadOnly, NULL);

    // Change weight dynamically — NO recompile!
    IOSurfaceLock(k5->ioIn, 0, NULL);
    for (int i = 0; i < n; i++) in5[n + i] = 5.0f;
    IOSurfaceUnlock(k5->ioIn, 0, NULL);

    ane_eval(k5);
    IOSurfaceLock(k5->ioOut, kIOSurfaceLockReadOnly, NULL);
    printf("w=5.0 → out=[%.3f,%.3f,%.3f] (expect 5×)\n", out5[0], out5[1], out5[2]);
    IOSurfaceUnlock(k5->ioOut, kIOSurfaceLockReadOnly, NULL);

    free_kern(k5);
}

// Writes scale·I into the weight area of the Approach 6 input surface.
// `wbase` points at the first weight channel; element (r, c) lives at spatial
// index 0 of channel r*d + c, i.e. at wbase[(r*d + c) * s].
static void wp_fill_matmul_weight(float *wbase, int d, int s, float scale) {
    for (int r = 0; r < d; r++)
        for (int c = 0; c < d; c++)
            wbase[(r * d + c) * s] = (r == c) ? scale : 0.0f;  // only sp=0 matters
}

// Approach 6: matmul whose weight matrix arrives through the input surface.
// Verifies identity weights reproduce the input, then switches to 2× identity
// without recompiling.
static void wp_test_dynamic_matmul(int d, int s) {
    const int total_ch = d + d * d;
    const int in_bytes = total_ch * s * 4;
    const int out_bytes = d * s * 4;
    Kern *k6 = compile_kern_mil_w(wp_gen_matmul_mil(d, s), @{}, in_bytes, out_bytes);
    if (!k6) {
        printf("Dynamic matmul compile FAILED\n");
        return;
    }
    printf("Dynamic matmul compile OK!\n");

    // Set up: identity W, ramp input
    IOSurfaceLock(k6->ioIn, 0, NULL);
    float *in6 = (float *)IOSurfaceGetBaseAddress(k6->ioIn);
    memset(in6, 0, (size_t)in_bytes);
    // Activations: [d, s] in channel-first layout
    for (int i = 0; i < d * s; i++) in6[i] = i * 0.001f;
    float *wbase = in6 + d * s;
    wp_fill_matmul_weight(wbase, d, s, 1.0f);
    IOSurfaceUnlock(k6->ioIn, 0, NULL);

    ane_eval(k6);
    IOSurfaceLock(k6->ioIn, kIOSurfaceLockReadOnly, NULL);
    IOSurfaceLock(k6->ioOut, kIOSurfaceLockReadOnly, NULL);
    const float *out6 = (const float *)IOSurfaceGetBaseAddress(k6->ioOut);
    printf("Identity W: in=[%.4f,%.4f,%.4f] out=[%.4f,%.4f,%.4f]\n",
           in6[0], in6[1], in6[2], out6[0], out6[1], out6[2]);
    float me6 = 0;
    for (int i = 0; i < d * s; i++) {
        const float e6 = fabsf(out6[i] - in6[i]);
        if (e6 > me6) me6 = e6;
    }
    IOSurfaceUnlock(k6->ioOut, kIOSurfaceLockReadOnly, NULL);
    IOSurfaceUnlock(k6->ioIn, kIOSurfaceLockReadOnly, NULL);
    printf("max_err=%.6f %s\n", me6, me6 < 0.1 ? "PASS" : "FAIL");

    // Now: 2× identity — just change the IOSurface weight, no recompile!
    IOSurfaceLock(k6->ioIn, 0, NULL);
    wp_fill_matmul_weight(wbase, d, s, 2.0f);
    const float a0 = in6[0], a1 = in6[1];
    IOSurfaceUnlock(k6->ioIn, 0, NULL);

    ane_eval(k6);
    IOSurfaceLock(k6->ioOut, kIOSurfaceLockReadOnly, NULL);
    printf("2× W: in=[%.4f,%.4f] out=[%.4f,%.4f] (expect 2×)\n", a0, a1, out6[0], out6[1]);
    IOSurfaceUnlock(k6->ioOut, kIOSurfaceLockReadOnly, NULL);

    free_kern(k6);
}

#pragma mark - Entry point

// Compiles an identity conv, checks it, then runs Approaches 1–6 in order.
// Returns 1 if the initial compile fails, 0 otherwise; individual approaches
// only print their result and never abort the run.
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;
    @autoreleasepool {
        mach_timebase_info(&g_tb);
        ane_init();

        const int IC = 256, OC = 256, SP = 64;
        const int in_bytes = IC * SP * 4;   // fp32
        const int out_bytes = OC * SP * 4;  // fp32
        const int n_out = OC * SP;          // output/input comparisons below rely on OC == IC

        // Identity weight
        float *W_id = wp_scaled_identity(OC, IC, 1.0f);
        NSString *mil = gen_conv_mil(IC, OC, SP);
        NSDictionary *wd = @{@"@model_path/weights/w.bin": @{@"offset": @0, @"data": build_blob(W_id, OC, IC)}};

        printf("=== Compiling conv %dx%d sp=%d ===\n", OC, IC, SP);
        Kern *k = compile_kern_mil_w(mil, wd, in_bytes, out_bytes);
        if (!k) {
            printf("COMPILE FAILED\n");
            free(W_id);
            return 1;
        }
        printf("Compile OK!\n");

        // Write fp32 input
        IOSurfaceLock(k->ioIn, 0, NULL);
        {
            float *dst = (float *)IOSurfaceGetBaseAddress(k->ioIn);
            for (int i = 0; i < IC * SP; i++) dst[i] = (i % 100) * 0.01f;
        }
        IOSurfaceUnlock(k->ioIn, 0, NULL);

        // Eval with identity
        ane_eval(k);
        IOSurfaceLock(k->ioIn, kIOSurfaceLockReadOnly, NULL);
        IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
        float max_err = 0;
        {
            const float *inp = (const float *)IOSurfaceGetBaseAddress(k->ioIn);
            const float *out = (const float *)IOSurfaceGetBaseAddress(k->ioOut);
            printf("In: [%.3f, %.3f, %.3f, %.3f]\n", inp[0], inp[1], inp[2], inp[3]);
            printf("Out: [%.3f, %.3f, %.3f, %.3f]\n", out[0], out[1], out[2], out[3]);
            for (int i = 0; i < n_out; i++) {
                const float err = fabsf(out[i] - inp[i]);
                if (err > max_err) max_err = err;
            }
        }
        IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
        IOSurfaceUnlock(k->ioIn, kIOSurfaceLockReadOnly, NULL);
        printf("Identity max_err=%.6f %s\n\n", max_err, max_err < 0.1 ? "PASS" : "FAIL");

        // === Approach 1: Patch weight on disk, unload+reload ===
        printf("=== Approach 1: Disk patch + unload/reload ===\n");
        {
            float *W_2x = wp_scaled_identity(OC, IC, 2.0f);
            NSString *diskPath = [(__bridge NSString *)k->tmpDir stringByAppendingPathComponent:@"weights/w.bin"];
            if (![build_blob(W_2x, OC, IC) writeToFile:diskPath atomically:YES])
                printf(" (warning: could not write %s)\n", [diskPath UTF8String]);
            free(W_2x);
        }
        id mdl = (__bridge id)k->model;
        {
            NSError *e = nil;
            BOOL unloaded = ((BOOL(*)(id, SEL, unsigned int, NSError **))objc_msgSend)(
                mdl, @selector(unloadWithQoS:error:), kWPQoS, &e);
            if (!unloaded) printf("Unload: FAIL (%s)\n", e ? [[e description] UTF8String] : "no error info");

            e = nil;
            BOOL ok = ((BOOL(*)(id, SEL, unsigned int, id, NSError **))objc_msgSend)(
                mdl, @selector(loadWithQoS:options:error:), kWPQoS, @{}, &e);
            printf("Reload: %s\n", ok ? "OK" : "FAIL");
            if (!ok && e) printf(" error: %s\n", [[e description] UTF8String]);

            if (ok) {
                // Re-create request after reload
                if (wp_set_request(k, nil)) {
                    ane_eval(k);
                    const float ratio = wp_report_output_ratio(k, n_out, "");
                    printf("Ratio: %.3f (2.0=patched, 1.0=cached)\n\n", ratio);
                } else {
                    printf("Request re-creation failed\n\n");
                }
            }
        }

        // === Approach 2: Memory scan ===
        printf("=== Approach 2: Memory scan ===\n");
        {
            int f1 = 0, f2 = 0;
            wp_scan_identity_weights(OC, IC, NO, &f1, &f2);
            printf("Found: 1x=%d 2x=%d\n", f1, f2);

            // Now patch ALL found weight patterns to 3× and re-eval
            if (f1 > 0 || f2 > 0) {
                printf("Patching all found patterns to 3x identity...\n");
                wp_scan_identity_weights(OC, IC, YES, NULL, NULL);
                printf("\n=== Eval after memory patch (expect 3x) ===\n");
                ane_eval(k);
                const float ratio = wp_report_output_ratio(k, n_out, "");
                printf("Ratio: %.3f (3.0=mem patch works!, 1.0=ANE uses SRAM copy)\n", ratio);
            }
            printf("\n");
        }

        // === Approach 3: Explore classes ===
        printf("=== ANE classes ===\n");
        {
            const char *cn[] = {"_ANEWeight", "_ANEProgramForEvaluation", "_ANEChainingRequest", NULL};
            for (int i = 0; cn[i]; i++) {
                Class cls = NSClassFromString([NSString stringWithUTF8String:cn[i]]);
                if (!cls) {
                    printf("%s: NOT FOUND\n", cn[i]);
                    continue;
                }
                printf("%s:\n", cn[i]);
                wp_print_methods(cls, '-');                   // instance methods
                wp_print_methods(object_getClass(cls), '+');  // class methods (metaclass)
                printf("\n");
            }
            wp_print_kvc(mdl, @"programHandle");
            wp_print_kvc(mdl, @"intermediateBufferHandle");
        }

        // === Approach 4: _ANEWeight + updateWeightURL ===
        printf("\n=== Approach 4: _ANEWeight API ===\n");
        Class AW = NSClassFromString(@"_ANEWeight");
        if (AW) {
            // Write 5× identity weights to a new file
            float *W_5x = wp_scaled_identity(OC, IC, 5.0f);
            NSString *wpath = [NSTemporaryDirectory() stringByAppendingPathComponent:@"patched_w.bin"];
            if (![build_blob(W_5x, OC, IC) writeToFile:wpath atomically:YES])
                printf(" (warning: could not write %s)\n", [wpath UTF8String]);
            free(W_5x);

            NSURL *wurl = [NSURL fileURLWithPath:wpath];
            // Private API: make sure the factory exists before calling through objc_msgSend.
            id wobj = nil;
            if ([AW respondsToSelector:@selector(weightWithSymbolAndURL:weightURL:)]) {
                wobj = ((id(*)(Class, SEL, id, id))objc_msgSend)(
                    AW, @selector(weightWithSymbolAndURL:weightURL:), @"W", wurl);
            }
            printf(" _ANEWeight: %s\n", wobj ? [[wobj description] UTF8String] : "nil");
            if (wobj) {
                printf(" weightSymbol: %s\n",
                       [((id(*)(id, SEL))objc_msgSend)(wobj, @selector(weightSymbol)) UTF8String]);
                printf(" weightURL: %s\n",
                       [[((id(*)(id, SEL))objc_msgSend)(wobj, @selector(weightURL)) description] UTF8String]);
            }

            // Try passing weight array as weightsBuffer
            printf("\n Trying weightsBuffer in request...\n");
            if (wobj) {
                @try {
                    if (wp_set_request(k, @[wobj])) {
                        printf(" Request with weightsBuffer created\n");
                        ane_eval(k);
                        const float ratio = wp_report_output_ratio(k, n_out, " ");
                        printf(" Ratio: %.3f (5.0=weightsBuffer works!)\n", ratio);
                    } else {
                        printf(" Request creation failed\n");
                    }
                } @catch (NSException *ex) {
                    printf(" Eval exception: %s\n", [[ex description] UTF8String]);
                }
            }

            // Also try IOSurface as weightsBuffer
            printf("\n Trying IOSurface as weightsBuffer...\n");
            IOSurfaceRef wSurf = make_surface(OC * IC * 2);  // fp16 weights
            if (wSurf) {
                IOSurfaceLock(wSurf, 0, NULL);
                _Float16 *wfp16 = (_Float16 *)IOSurfaceGetBaseAddress(wSurf);
                for (int r = 0; r < OC; r++)
                    for (int c = 0; c < IC; c++)
                        wfp16[r * IC + c] = (r == c) ? (_Float16)7.0f : (_Float16)0.0f;  // 7× identity
                IOSurfaceUnlock(wSurf, 0, NULL);

                id wSurfObj = ((id(*)(Class, SEL, IOSurfaceRef))objc_msgSend)(
                    g_AIO, @selector(objectWithIOSurface:), wSurf);
                @try {
                    if (wp_set_request(k, wSurfObj)) {
                        ane_eval(k);
                        const float ratio = wp_report_output_ratio(k, n_out, " ");
                        printf(" Ratio: %.3f (7.0=IOSurface weights work!)\n", ratio);
                    } else {
                        printf(" Request creation failed\n");
                    }
                } @catch (NSException *ex) {
                    printf(" Eval exception: %s\n", [[ex description] UTF8String]);
                }
                CFRelease(wSurf);
            } else {
                printf(" make_surface failed\n");
            }
        }

        // === Approach 5: Weights packed into input IOSurface (fp16 with cast) ===
        printf("\n=== Approach 5: Dynamic weights via input IOSurface ===\n");
        wp_test_dynamic_mul(IC, SP);

        // === Approach 6: matmul with dynamic weights from input ===
        printf("\n=== Approach 6: matmul with dynamic W from input ===\n");
        wp_test_dynamic_matmul(64, 64);  // small for test

        free_kern(k);
        free(W_id);
        printf("\nDone.\n");
    }
    return 0;
}