mirror of https://github.com/maderix/ANE.git
451 lines
27 KiB
Objective-C
451 lines
27 KiB
Objective-C
// test_weight_patch.m — Test whether ANE weights can be patched after compile
|
||
#import <Foundation/Foundation.h>
|
||
#import <objc/runtime.h>
|
||
#import <objc/message.h>
|
||
#import <dlfcn.h>
|
||
#import <IOSurface/IOSurface.h>
|
||
#import <mach/mach.h>
|
||
#import <mach/mach_time.h>
|
||
#import <mach/vm_map.h>
|
||
#include <arm_neon.h>
|
||
#include <Accelerate/Accelerate.h>
|
||
|
||
#include "stories_io.h"
|
||
|
||
// MIL: fp32 in → cast fp16 → conv → cast fp32 out (matches inmem_peak.m pattern)
|
||
static NSString *gen_conv_mil(int ic, int oc, int sp) {
|
||
NSMutableString *m = [NSMutableString string];
|
||
[m appendString:@"program(1.3)\n"
|
||
"[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
|
||
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
|
||
"{\"coremltools-version\", \"9.0\"}})]\n{\n"];
|
||
[m appendFormat:@" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n", ic, sp];
|
||
[m appendString:
|
||
@" string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
|
||
" tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||
" tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||
" tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||
" int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
|
||
" string to16 = const()[name = string(\"to16\"), val = string(\"fp16\")];\n"];
|
||
[m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> xh = cast(dtype = to16, x = x)[name = string(\"cast_in\")];\n", ic, sp];
|
||
[m appendFormat:@" tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
|
||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w.bin\"), offset = uint64(64)))];\n",
|
||
oc, ic, oc, ic];
|
||
[m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> yh = conv(dilations = dl, groups = gr, pad = pd, pad_type = pt, strides = st, weight = W, x = xh)"
|
||
"[name = string(\"conv\")];\n", oc, sp];
|
||
[m appendString:@" string to32 = const()[name = string(\"to32\"), val = string(\"fp32\")];\n"];
|
||
[m appendFormat:@" tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = to32, x = yh)[name = string(\"cast_out\")];\n", oc, sp];
|
||
[m appendString:@" } -> (y);\n}\n"];
|
||
return m;
|
||
}
|
||
|
||
int main(int argc, char **argv) {
|
||
@autoreleasepool {
|
||
mach_timebase_info(&g_tb);
|
||
ane_init();
|
||
|
||
int IC = 256, OC = 256, SP = 64;
|
||
int io_bytes = IC * SP * 4; // fp32
|
||
|
||
// Identity weight
|
||
float *W_id = (float*)calloc(OC*IC, sizeof(float));
|
||
for (int i = 0; i < IC; i++) W_id[i*IC+i] = 1.0f;
|
||
|
||
NSString *mil = gen_conv_mil(IC, OC, SP);
|
||
NSDictionary *wd = @{@"@model_path/weights/w.bin": @{@"offset":@0, @"data":build_blob(W_id, OC, IC)}};
|
||
|
||
printf("=== Compiling conv %dx%d sp=%d ===\n", OC, IC, SP);
|
||
Kern *k = compile_kern_mil_w(mil, wd, io_bytes, io_bytes);
|
||
if (!k) { printf("COMPILE FAILED\n"); free(W_id); return 1; }
|
||
printf("Compile OK!\n");
|
||
|
||
// Write fp32 input
|
||
IOSurfaceLock(k->ioIn, 0, NULL);
|
||
float *inp = (float*)IOSurfaceGetBaseAddress(k->ioIn);
|
||
for (int i = 0; i < IC*SP; i++) inp[i] = (i % 100) * 0.01f;
|
||
IOSurfaceUnlock(k->ioIn, 0, NULL);
|
||
|
||
// Eval with identity
|
||
ane_eval(k);
|
||
IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||
float *out = (float*)IOSurfaceGetBaseAddress(k->ioOut);
|
||
printf("In: [%.3f, %.3f, %.3f, %.3f]\n", inp[0], inp[1], inp[2], inp[3]);
|
||
printf("Out: [%.3f, %.3f, %.3f, %.3f]\n", out[0], out[1], out[2], out[3]);
|
||
float max_err = 0;
|
||
for (int i = 0; i < OC*SP; i++) {
|
||
float err = fabsf(out[i] - inp[i]);
|
||
if (err > max_err) max_err = err;
|
||
}
|
||
IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||
printf("Identity max_err=%.6f %s\n\n", max_err, max_err < 0.1 ? "PASS" : "FAIL");
|
||
|
||
// === Approach 1: Patch weight on disk, unload+reload ===
|
||
printf("=== Approach 1: Disk patch + unload/reload ===\n");
|
||
float *W_2x = (float*)calloc(OC*IC, sizeof(float));
|
||
for (int i = 0; i < IC; i++) W_2x[i*IC+i] = 2.0f;
|
||
[build_blob(W_2x, OC, IC) writeToFile:
|
||
[(__bridge NSString*)k->tmpDir stringByAppendingPathComponent:@"weights/w.bin"] atomically:YES];
|
||
|
||
id mdl = (__bridge id)k->model;
|
||
NSError *e = nil;
|
||
((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
|
||
e = nil;
|
||
BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
|
||
printf("Reload: %s\n", ok?"OK":"FAIL");
|
||
if (ok) {
|
||
// Re-create request after reload
|
||
id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), k->ioIn);
|
||
id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), k->ioOut);
|
||
CFRelease(k->request);
|
||
k->request = (void*)CFBridgingRetain(((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
|
||
@selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
|
||
@[wI], @[@0], @[wO], @[@0], nil, nil, @0));
|
||
ane_eval(k);
|
||
IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||
printf("Out: [%.3f, %.3f, %.3f, %.3f]\n", out[0], out[1], out[2], out[3]);
|
||
float sr = 0; int cnt = 0;
|
||
for (int i = 0; i < OC*SP; i++)
|
||
if (fabsf(inp[i]) > 0.01f) { sr += out[i]/inp[i]; cnt++; }
|
||
IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||
printf("Ratio: %.3f (2.0=patched, 1.0=cached)\n\n", cnt>0?sr/cnt:0);
|
||
}
|
||
|
||
// === Approach 2: Memory scan ===
|
||
printf("=== Approach 2: Memory scan ===\n");
|
||
uint16_t pat1[8] = {0x3C00, 0, 0, 0, 0, 0, 0, 0};
|
||
uint16_t pat2[8] = {0x4000, 0, 0, 0, 0, 0, 0, 0};
|
||
mach_port_t task = mach_task_self();
|
||
vm_address_t addr = 0; vm_size_t sz; natural_t depth = 1;
|
||
int f1 = 0, f2 = 0;
|
||
while (1) {
|
||
struct vm_region_submap_info_64 info;
|
||
mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_COUNT_64;
|
||
if (vm_region_recurse_64(task, &addr, &sz, &depth, (vm_region_recurse_info_t)&info, &count) != KERN_SUCCESS) break;
|
||
if (info.is_submap) { depth++; continue; }
|
||
if (!(info.protection & VM_PROT_READ) || sz < (size_t)(OC*IC*2)) { addr += sz; continue; }
|
||
uint8_t *base = (uint8_t*)addr;
|
||
for (size_t off = 0; off + OC*IC*2 <= sz; off += 2) {
|
||
int w = 0;
|
||
if (memcmp(base+off, pat1, 16) == 0) w = 1;
|
||
else if (memcmp(base+off, pat2, 16) == 0) w = 2;
|
||
if (!w) continue;
|
||
uint16_t *p = (uint16_t*)(base+off), diag = (w==1)?0x3C00:0x4000;
|
||
int ok2 = 1;
|
||
for (int r = 0; r < OC && ok2; r++)
|
||
for (int c = 0; c < IC && ok2; c++)
|
||
if (p[r*IC+c] != ((r==c)?diag:0)) ok2 = 0;
|
||
if (!ok2) continue;
|
||
if (w==1) f1++; else f2++;
|
||
printf(" FOUND %dx @%p prot=%d/%d %s\n", w, (void*)(addr+off),
|
||
info.protection, info.max_protection, (info.protection&VM_PROT_WRITE)?"WR":"RO");
|
||
}
|
||
addr += sz;
|
||
}
|
||
printf("Found: 1x=%d 2x=%d\n", f1, f2);
|
||
|
||
// Now patch ALL found weight patterns to 3× and re-eval
|
||
if (f1 > 0 || f2 > 0) {
|
||
printf("Patching all found patterns to 3x identity...\n");
|
||
addr = 0; depth = 1;
|
||
while (1) {
|
||
struct vm_region_submap_info_64 info2;
|
||
mach_msg_type_number_t count2 = VM_REGION_SUBMAP_INFO_COUNT_64;
|
||
if (vm_region_recurse_64(task, &addr, &sz, &depth, (vm_region_recurse_info_t)&info2, &count2) != KERN_SUCCESS) break;
|
||
if (info2.is_submap) { depth++; continue; }
|
||
if (!(info2.protection & VM_PROT_READ) || sz < (size_t)(OC*IC*2)) { addr += sz; continue; }
|
||
uint8_t *base2 = (uint8_t*)addr;
|
||
for (size_t off = 0; off + OC*IC*2 <= sz; off += 2) {
|
||
int w2 = 0;
|
||
if (memcmp(base2+off, pat1, 16) == 0) w2 = 1;
|
||
else if (memcmp(base2+off, pat2, 16) == 0) w2 = 2;
|
||
if (!w2) continue;
|
||
uint16_t *p2 = (uint16_t*)(base2+off), diag2 = (w2==1)?0x3C00:0x4000;
|
||
int ok3 = 1;
|
||
for (int r = 0; r < OC && ok3; r++)
|
||
for (int c = 0; c < IC && ok3; c++)
|
||
if (p2[r*IC+c] != ((r==c)?diag2:0)) ok3 = 0;
|
||
if (!ok3) continue;
|
||
if (info2.protection & VM_PROT_WRITE) {
|
||
printf(" Patching %dx @%p to 3x\n", w2, (void*)(addr+off));
|
||
for (int r = 0; r < OC; r++)
|
||
for (int c = 0; c < IC; c++)
|
||
p2[r*IC+c] = (r==c) ? 0x4200 : 0; // fp16(3.0)
|
||
}
|
||
}
|
||
addr += sz;
|
||
}
|
||
|
||
printf("\n=== Eval after memory patch (expect 3x) ===\n");
|
||
ane_eval(k);
|
||
IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||
printf("Out: [%.3f, %.3f, %.3f, %.3f]\n", out[0], out[1], out[2], out[3]);
|
||
float sr2 = 0; int cnt2 = 0;
|
||
for (int i = 0; i < OC*SP; i++)
|
||
if (fabsf(inp[i]) > 0.01f) { sr2 += out[i]/inp[i]; cnt2++; }
|
||
IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||
printf("Ratio: %.3f (3.0=mem patch works!, 1.0=ANE uses SRAM copy)\n", cnt2>0?sr2/cnt2:0);
|
||
}
|
||
printf("\n");
|
||
|
||
// === Approach 3: Explore classes ===
|
||
printf("=== ANE classes ===\n");
|
||
const char *cn[] = {"_ANEWeight", "_ANEProgramForEvaluation", "_ANEChainingRequest", NULL};
|
||
for (int i = 0; cn[i]; i++) {
|
||
Class cls = NSClassFromString([NSString stringWithUTF8String:cn[i]]);
|
||
if (!cls) { printf("%s: NOT FOUND\n", cn[i]); continue; }
|
||
printf("%s:\n", cn[i]);
|
||
unsigned int mc = 0; Method *ms = class_copyMethodList(cls, &mc);
|
||
for (unsigned j = 0; j < mc; j++) printf(" - %s\n", sel_getName(method_getName(ms[j])));
|
||
free(ms);
|
||
mc = 0; ms = class_copyMethodList(object_getClass(cls), &mc);
|
||
for (unsigned j = 0; j < mc; j++) printf(" + %s\n", sel_getName(method_getName(ms[j])));
|
||
free(ms); printf("\n");
|
||
}
|
||
@try { printf("programHandle: %s\n", [[[mdl valueForKey:@"programHandle"] description] UTF8String]); } @catch(id x) {}
|
||
@try { printf("intermediateBufferHandle: %s\n", [[[mdl valueForKey:@"intermediateBufferHandle"] description] UTF8String]); } @catch(id x) {}
|
||
|
||
// === Approach 4: _ANEWeight + updateWeightURL ===
|
||
printf("\n=== Approach 4: _ANEWeight API ===\n");
|
||
Class AW = NSClassFromString(@"_ANEWeight");
|
||
if (AW) {
|
||
// Write 5× identity weights to a new file
|
||
float *W_5x = (float*)calloc(OC*IC, sizeof(float));
|
||
for (int i = 0; i < IC; i++) W_5x[i*IC+i] = 5.0f;
|
||
NSString *wpath = [NSTemporaryDirectory() stringByAppendingPathComponent:@"patched_w.bin"];
|
||
[build_blob(W_5x, OC, IC) writeToFile:wpath atomically:YES];
|
||
free(W_5x);
|
||
|
||
NSURL *wurl = [NSURL fileURLWithPath:wpath];
|
||
id wobj = ((id(*)(Class,SEL,id,id))objc_msgSend)(AW,
|
||
@selector(weightWithSymbolAndURL:weightURL:), @"W", wurl);
|
||
printf(" _ANEWeight: %s\n", wobj ? [[wobj description] UTF8String] : "nil");
|
||
if (wobj) {
|
||
printf(" weightSymbol: %s\n", [((id(*)(id,SEL))objc_msgSend)(wobj, @selector(weightSymbol)) UTF8String]);
|
||
printf(" weightURL: %s\n", [[((id(*)(id,SEL))objc_msgSend)(wobj, @selector(weightURL)) description] UTF8String]);
|
||
}
|
||
|
||
// Try to pass as weightsBuffer in request
|
||
printf("\n Trying weightsBuffer in request...\n");
|
||
id wI2 = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), k->ioIn);
|
||
id wO2 = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), k->ioOut);
|
||
|
||
// Try passing weight array as weightsBuffer
|
||
if (wobj) {
|
||
CFRelease(k->request);
|
||
k->request = (void*)CFBridgingRetain(((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
|
||
@selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
|
||
@[wI2], @[@0], @[wO2], @[@0], @[wobj], nil, @0));
|
||
printf(" Request with weightsBuffer created\n");
|
||
@try {
|
||
ane_eval(k);
|
||
IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||
printf(" Out: [%.3f, %.3f, %.3f, %.3f]\n", out[0], out[1], out[2], out[3]);
|
||
float sr3 = 0; int cnt3 = 0;
|
||
for (int i2 = 0; i2 < OC*SP; i2++)
|
||
if (fabsf(inp[i2]) > 0.01f) { sr3 += out[i2]/inp[i2]; cnt3++; }
|
||
IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||
printf(" Ratio: %.3f (5.0=weightsBuffer works!)\n", cnt3>0?sr3/cnt3:0);
|
||
} @catch(NSException *ex) {
|
||
printf(" Eval exception: %s\n", [[ex description] UTF8String]);
|
||
}
|
||
}
|
||
|
||
// Also try IOSurface as weightsBuffer
|
||
printf("\n Trying IOSurface as weightsBuffer...\n");
|
||
IOSurfaceRef wSurf = make_surface(OC*IC*2); // fp16 weights
|
||
IOSurfaceLock(wSurf, 0, NULL);
|
||
_Float16 *wfp16 = (_Float16*)IOSurfaceGetBaseAddress(wSurf);
|
||
for (int r = 0; r < OC; r++)
|
||
for (int c2 = 0; c2 < IC; c2++)
|
||
wfp16[r*IC+c2] = (r==c2) ? (_Float16)7.0f : (_Float16)0.0f; // 7× identity
|
||
IOSurfaceUnlock(wSurf, 0, NULL);
|
||
id wSurfObj = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), wSurf);
|
||
CFRelease(k->request);
|
||
k->request = (void*)CFBridgingRetain(((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
|
||
@selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
|
||
@[wI2], @[@0], @[wO2], @[@0], wSurfObj, nil, @0));
|
||
@try {
|
||
ane_eval(k);
|
||
IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||
printf(" Out: [%.3f, %.3f, %.3f, %.3f]\n", out[0], out[1], out[2], out[3]);
|
||
float sr4 = 0; int cnt4 = 0;
|
||
for (int i3 = 0; i3 < OC*SP; i3++)
|
||
if (fabsf(inp[i3]) > 0.01f) { sr4 += out[i3]/inp[i3]; cnt4++; }
|
||
IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||
printf(" Ratio: %.3f (7.0=IOSurface weights work!)\n", cnt4>0?sr4/cnt4:0);
|
||
} @catch(NSException *ex) {
|
||
printf(" Eval exception: %s\n", [[ex description] UTF8String]);
|
||
}
|
||
CFRelease(wSurf);
|
||
}
|
||
|
||
// === Approach 5: Weights packed into input IOSurface (fp16 with cast) ===
|
||
printf("\n=== Approach 5: Dynamic weights via input IOSurface ===\n");
|
||
// Element-wise mul: x * w where both come from input
|
||
// Input [1, IC*2, 1, SP] fp32 → cast fp16 → slice → mul → cast fp32
|
||
{
|
||
int C5 = IC;
|
||
NSMutableString *m5 = [NSMutableString string];
|
||
[m5 appendString:@"program(1.3)\n"
|
||
"[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
|
||
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
|
||
"{\"coremltools-version\", \"9.0\"}})]\n{\n"];
|
||
[m5 appendFormat:@" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n", C5*2, SP];
|
||
[m5 appendString:@" string to16 = const()[name = string(\"to16\"), val = string(\"fp16\")];\n"];
|
||
[m5 appendFormat:@" tensor<fp16, [1, %d, 1, %d]> xh = cast(dtype = to16, x = x)[name = string(\"cin\")];\n", C5*2, SP];
|
||
[m5 appendFormat:@" tensor<int32, [4]> b0 = const()[name = string(\"b0\"), val = tensor<int32, [4]>([0,0,0,0])];\n"];
|
||
[m5 appendFormat:@" tensor<int32, [4]> s0 = const()[name = string(\"s0\"), val = tensor<int32, [4]>([1,%d,1,%d])];\n", C5, SP];
|
||
[m5 appendFormat:@" tensor<fp16, [1,%d,1,%d]> data = slice_by_size(x=xh,begin=b0,size=s0)[name=string(\"data\")];\n", C5, SP];
|
||
[m5 appendFormat:@" tensor<int32, [4]> b1 = const()[name = string(\"b1\"), val = tensor<int32, [4]>([0,%d,0,0])];\n", C5];
|
||
[m5 appendFormat:@" tensor<fp16, [1,%d,1,%d]> wt = slice_by_size(x=xh,begin=b1,size=s0)[name=string(\"wt\")];\n", C5, SP];
|
||
[m5 appendFormat:@" tensor<fp16, [1,%d,1,%d]> yh = mul(x=data,y=wt)[name=string(\"mul\")];\n", C5, SP];
|
||
[m5 appendString:@" string to32 = const()[name = string(\"to32\"), val = string(\"fp32\")];\n"];
|
||
[m5 appendFormat:@" tensor<fp32, [1,%d,1,%d]> y = cast(dtype = to32, x = yh)[name = string(\"cout\")];\n", C5, SP];
|
||
[m5 appendString:@" } -> (y);\n}\n"];
|
||
|
||
int io5_in = C5*2*SP*4;
|
||
int io5_out = C5*SP*4;
|
||
Kern *k5 = compile_kern_mil_w(m5, @{}, io5_in, io5_out);
|
||
if (k5) {
|
||
printf("Compile OK!\n");
|
||
IOSurfaceLock(k5->ioIn, 0, NULL);
|
||
float *in5 = (float*)IOSurfaceGetBaseAddress(k5->ioIn);
|
||
for (int i = 0; i < C5*SP; i++) in5[i] = (i%100)*0.01f;
|
||
for (int i = 0; i < C5*SP; i++) in5[C5*SP+i] = 2.0f;
|
||
IOSurfaceUnlock(k5->ioIn, 0, NULL);
|
||
ane_eval(k5);
|
||
IOSurfaceLock(k5->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||
float *out5 = (float*)IOSurfaceGetBaseAddress(k5->ioOut);
|
||
printf("data=[%.3f,%.3f,%.3f], w=2.0 → out=[%.3f,%.3f,%.3f]\n",
|
||
in5[0],in5[1],in5[2], out5[0],out5[1],out5[2]);
|
||
IOSurfaceUnlock(k5->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||
|
||
// Change weight dynamically — NO recompile!
|
||
IOSurfaceLock(k5->ioIn, 0, NULL);
|
||
for (int i = 0; i < C5*SP; i++) in5[C5*SP+i] = 5.0f;
|
||
IOSurfaceUnlock(k5->ioIn, 0, NULL);
|
||
ane_eval(k5);
|
||
IOSurfaceLock(k5->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||
printf("w=5.0 → out=[%.3f,%.3f,%.3f] (expect 5×)\n", out5[0],out5[1],out5[2]);
|
||
IOSurfaceUnlock(k5->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||
free_kern(k5);
|
||
} else printf("Compile FAILED\n");
|
||
}
|
||
|
||
// === Approach 6: matmul with dynamic weights from input ===
|
||
printf("\n=== Approach 6: matmul with dynamic W from input ===\n");
|
||
// Pack x[1,D,S,1] and W[1,D,1,D] into input, then reshape+matmul
|
||
// Input shape: [1, D+D*D, 1, S] — first D channels=activations, rest=weight matrix flattened
|
||
// Actually, matmul needs [1,H,S,D] shapes. Let's try:
|
||
// Input: [1, D*(S+D), 1, 1] reshaped as needed
|
||
// Simpler: just test matmul with two sliced inputs
|
||
{
|
||
int D6 = 64, S6 = 64; // small for test
|
||
// Input: [1, D6+D6, S6, D6] — but that's 4D...
|
||
// Actually ANE matmul works on [1,H,M,K] @ [1,H,K,N] → [1,H,M,N]
|
||
// Let's pack x[1,1,S6,D6] and W[1,1,D6,D6] into [1,2,S6,D6]
|
||
// Then slice → matmul
|
||
NSMutableString *m6 = [NSMutableString string];
|
||
[m6 appendString:@"program(1.3)\n"
|
||
"[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
|
||
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
|
||
"{\"coremltools-version\", \"9.0\"}})]\n{\n"];
|
||
// Input: [1, D6+D6, 1, S6*D6] — flatten everything, then reshape
|
||
// Actually simplest: two separate regions in channel dim
|
||
// x_data: [1, D6, 1, S6] and W: [1, D6*D6, 1, 1]
|
||
// Total input channels: D6 + D6*D6
|
||
int total_ch = D6 + D6*D6;
|
||
[m6 appendFormat:@" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n", total_ch, S6];
|
||
[m6 appendString:@" string to16 = const()[name = string(\"to16\"), val = string(\"fp16\")];\n"];
|
||
[m6 appendFormat:@" tensor<fp16, [1, %d, 1, %d]> xh = cast(dtype = to16, x = x)[name = string(\"cin\")];\n", total_ch, S6];
|
||
// Slice activations: [1, D6, 1, S6]
|
||
[m6 appendFormat:@" tensor<int32, [4]> b0 = const()[name = string(\"b0\"), val = tensor<int32, [4]>([0,0,0,0])];\n"];
|
||
[m6 appendFormat:@" tensor<int32, [4]> sa = const()[name = string(\"sa\"), val = tensor<int32, [4]>([1,%d,1,%d])];\n", D6, S6];
|
||
[m6 appendFormat:@" tensor<fp16, [1,%d,1,%d]> act = slice_by_size(x=xh,begin=b0,size=sa)[name=string(\"act\")];\n", D6, S6];
|
||
// Slice weight: [1, D6*D6, 1, S6] but we only need [D6, D6] → reshape
|
||
[m6 appendFormat:@" tensor<int32, [4]> bw = const()[name = string(\"bw\"), val = tensor<int32, [4]>([0,%d,0,0])];\n", D6];
|
||
[m6 appendFormat:@" tensor<int32, [4]> sw = const()[name = string(\"sw\"), val = tensor<int32, [4]>([1,%d,1,%d])];\n", D6*D6, S6];
|
||
[m6 appendFormat:@" tensor<fp16, [1,%d,1,%d]> wf = slice_by_size(x=xh,begin=bw,size=sw)[name=string(\"wf\")];\n", D6*D6, S6];
|
||
// Reshape weight to [1, D6, D6, S6] for matmul-like operation
|
||
// Actually for conv: weight needs to be [OC, IC, 1, 1] const. Can't use dynamic weight with conv.
|
||
// For matmul: need [1, 1, D6, D6] or similar
|
||
// Let's try: reshape wf to [1, D6, D6, S6], take first slice [:,:,:,0] → no, that's hard
|
||
// Simpler: reshape to [D6, D6] and use matmul
|
||
// But matmul expects specific ranks... let me try:
|
||
[m6 appendFormat:@" tensor<int32, [4]> ws = const()[name = string(\"ws\"), val = tensor<int32, [4]>([1, 1, %d, %d])];\n", D6, D6];
|
||
// Only take first column of wf to get [1, D6*D6, 1, 1]
|
||
[m6 appendFormat:@" tensor<int32, [4]> sw1 = const()[name = string(\"sw1\"), val = tensor<int32, [4]>([1,%d,1,1])];\n", D6*D6];
|
||
[m6 appendFormat:@" tensor<fp16, [1,%d,1,1]> wf1 = slice_by_size(x=wf,begin=b0,size=sw1)[name=string(\"wf1\")];\n", D6*D6];
|
||
[m6 appendFormat:@" tensor<fp16, [1,1,%d,%d]> W = reshape(shape=ws,x=wf1)[name=string(\"W\")];\n", D6, D6];
|
||
// Reshape act to [1, 1, S6, D6] for matmul
|
||
[m6 appendFormat:@" tensor<int32, [4]> as2 = const()[name = string(\"as2\"), val = tensor<int32, [4]>([1, 1, %d, %d])];\n", D6, S6];
|
||
[m6 appendFormat:@" tensor<int32, [4]> pm = const()[name = string(\"pm\"), val = tensor<int32, [4]>([0, 1, 3, 2])];\n"];
|
||
[m6 appendFormat:@" tensor<fp16, [1,1,%d,%d]> a2 = reshape(shape=as2,x=act)[name=string(\"a2\")];\n", D6, S6];
|
||
[m6 appendFormat:@" tensor<fp16, [1,1,%d,%d]> a3 = transpose(perm=pm,x=a2)[name=string(\"a3\")];\n", S6, D6];
|
||
// matmul: [1,1,S6,D6] @ [1,1,D6,D6] → [1,1,S6,D6]
|
||
[m6 appendString:@" bool bF = const()[name = string(\"bF\"), val = bool(false)];\n"];
|
||
[m6 appendFormat:@" tensor<fp16, [1, 1, %d, %d]> yh = matmul(transpose_x = bF, transpose_y = bF, x = a3, y = W)[name = string(\"mm\")];\n", S6, D6];
|
||
// Reshape back to [1, D6, 1, S6]
|
||
[m6 appendFormat:@" tensor<fp16, [1,1,%d,%d]> yt = transpose(perm=pm,x=yh)[name=string(\"yt\")];\n", D6, S6];
|
||
[m6 appendFormat:@" tensor<int32, [4]> os = const()[name = string(\"os\"), val = tensor<int32, [4]>([1,%d,1,%d])];\n", D6, S6];
|
||
[m6 appendFormat:@" tensor<fp16, [1,%d,1,%d]> yr = reshape(shape=os,x=yt)[name=string(\"yr\")];\n", D6, S6];
|
||
[m6 appendString:@" string to32 = const()[name = string(\"to32\"), val = string(\"fp32\")];\n"];
|
||
[m6 appendFormat:@" tensor<fp32, [1,%d,1,%d]> y = cast(dtype = to32, x = yr)[name = string(\"cout\")];\n", D6, S6];
|
||
[m6 appendString:@" } -> (y);\n}\n"];
|
||
|
||
int io6_in = total_ch * S6 * 4;
|
||
int io6_out = D6 * S6 * 4;
|
||
Kern *k6 = compile_kern_mil_w(m6, @{}, io6_in, io6_out);
|
||
if (k6) {
|
||
printf("Dynamic matmul compile OK!\n");
|
||
// Set up: identity W, ramp input
|
||
IOSurfaceLock(k6->ioIn, 0, NULL);
|
||
float *in6 = (float*)IOSurfaceGetBaseAddress(k6->ioIn);
|
||
memset(in6, 0, io6_in);
|
||
// Activations: [D6, S6] in channel-first layout
|
||
for (int d = 0; d < D6; d++)
|
||
for (int s = 0; s < S6; s++)
|
||
in6[d*S6+s] = (d*S6+s) * 0.001f;
|
||
// Weight: identity matrix [D6, D6] packed in channels D6..D6+D6*D6, only col 0
|
||
float *wbase = in6 + D6*S6;
|
||
for (int r = 0; r < D6; r++)
|
||
for (int c = 0; c < D6; c++)
|
||
wbase[(r*D6+c)*S6] = (r==c) ? 1.0f : 0.0f; // only sp=0 matters
|
||
IOSurfaceUnlock(k6->ioIn, 0, NULL);
|
||
|
||
ane_eval(k6);
|
||
IOSurfaceLock(k6->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||
float *out6 = (float*)IOSurfaceGetBaseAddress(k6->ioOut);
|
||
printf("Identity W: in=[%.4f,%.4f,%.4f] out=[%.4f,%.4f,%.4f]\n",
|
||
in6[0],in6[1],in6[2], out6[0],out6[1],out6[2]);
|
||
|
||
// Check
|
||
float me6 = 0;
|
||
for (int i = 0; i < D6*S6; i++) {
|
||
float e6 = fabsf(out6[i] - in6[i]);
|
||
if (e6 > me6) me6 = e6;
|
||
}
|
||
IOSurfaceUnlock(k6->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||
printf("max_err=%.6f %s\n", me6, me6 < 0.1 ? "PASS" : "FAIL");
|
||
|
||
// Now: 2× identity — just change the IOSurface weight, no recompile!
|
||
IOSurfaceLock(k6->ioIn, 0, NULL);
|
||
for (int r = 0; r < D6; r++)
|
||
for (int c = 0; c < D6; c++)
|
||
wbase[(r*D6+c)*S6] = (r==c) ? 2.0f : 0.0f;
|
||
IOSurfaceUnlock(k6->ioIn, 0, NULL);
|
||
ane_eval(k6);
|
||
IOSurfaceLock(k6->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||
printf("2× W: in=[%.4f,%.4f] out=[%.4f,%.4f] (expect 2×)\n",
|
||
in6[0],in6[1], out6[0],out6[1]);
|
||
IOSurfaceUnlock(k6->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||
free_kern(k6);
|
||
} else printf("Dynamic matmul compile FAILED\n");
|
||
}
|
||
|
||
free_kern(k); free(W_id); free(W_2x);
|
||
printf("\nDone.\n");
|
||
}
|
||
return 0;
|
||
}
|