diff --git a/training/test_chaining.m b/training/test_chaining.m new file mode 100644 index 0000000..0b2b3cc --- /dev/null +++ b/training/test_chaining.m @@ -0,0 +1,367 @@ +// test_chaining.m -- Prototype _ANEChainingRequest for multi-kernel pipelining +// Goal: chain two conv kernels so the ANE runs them back-to-back without CPU roundtrip +#import +#import +#import +#import +#import +#import +#include + +static mach_timebase_info_data_t g_tb; +static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } +static int g_fp16_io = 0; + +static IOSurfaceRef make_surface(size_t bytes) { + return IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); +} + +typedef struct { id model; IOSurfaceRef ioIn, ioOut; NSString *tmpDir; } CompiledKernel; + +static NSString *gen_conv_mil(int ch, int sp) { + if (g_fp16_io) { + return [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor pt = const()[name=tensor(\"pt\"), val=tensor(\"valid\")];\n" + " tensor st = const()[name=tensor(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=tensor(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=tensor(\"dl\"), val=tensor([1,1])];\n" + " tensor gr = const()[name=tensor(\"gr\"), val=tensor(1)];\n" + " tensor W = const()[name=tensor(\"W\"), " + "val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/weight.bin\"), offset=tensor(64)))];\n" + " tensor y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)" + "[name=tensor(\"conv\")];\n" + " } -> (y);\n}\n", ch, sp, ch, ch, ch, ch, ch, sp]; + } + return [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor pt = const()[name=tensor(\"pt\"), val=tensor(\"valid\")];\n" + " tensor st = const()[name=tensor(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=tensor(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=tensor(\"dl\"), val=tensor([1,1])];\n" + " tensor gr = const()[name=tensor(\"gr\"), val=tensor(1)];\n" + " tensor to16 = const()[name=tensor(\"to16\"), val=tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype=to16,x=x)[name=tensor(\"cin\")];\n" + " tensor W = const()[name=tensor(\"W\"), " + "val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/weight.bin\"), offset=tensor(64)))];\n" + " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" + "[name=tensor(\"conv\")];\n" + " tensor to32 = const()[name=tensor(\"to32\"), val=tensor(\"fp32\")];\n" + " tensor y = cast(dtype=to32,x=y16)[name=tensor(\"cout\")];\n" + " } -> (y);\n}\n", ch, sp, ch, sp, ch, ch, ch, ch, ch, sp, ch, sp]; +} + +static CompiledKernel compile_kernel(Class gD, Class gI, int ch, int sp, NSData *wdata) { + CompiledKernel k = {0}; + NSFileManager *fm = [NSFileManager defaultManager]; + + NSString *mil = gen_conv_mil(ch, sp); + NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; + + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(gD, + @selector(modelWithMILText:weights:optionsPlist:), + md, @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata}}, nil); + id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(gI, @selector(inMemoryModelWithDescriptor:), desc); + + id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); + NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; + [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] + withIntermediateDirectories:YES attributes:nil error:nil]; + [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + [wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; + + NSError *e = nil; + BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( + mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); + if (!ok) { + if (!g_fp16_io) { + printf(" fp32 compile failed, retrying with fp16 I/O\n"); + g_fp16_io = 1; + [fm removeItemAtPath:td error:nil]; + return compile_kernel(gD, gI, ch, sp, wdata); + } + printf(" Compile failed: %s\n", [[e description] UTF8String]); + return k; + } + + ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( + mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); + + int bpe = g_fp16_io ? 2 : 4; + k.model = mdl; + k.ioIn = make_surface(ch * sp * bpe); + k.ioOut = make_surface(ch * sp * bpe); + k.tmpDir = td; + return k; +} + +int main() { + @autoreleasepool { + setbuf(stdout, NULL); + mach_timebase_info(&g_tb); + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + + printf("=== ANE ChainingRequest Prototype ===\n\n"); + + Class gD = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + Class gI = NSClassFromString(@"_ANEInMemoryModel"); + Class gAR = NSClassFromString(@"_ANERequest"); + Class gAIO = NSClassFromString(@"_ANEIOSurfaceObject"); + Class gClient = NSClassFromString(@"_ANEClient"); + Class gChain = NSClassFromString(@"_ANEChainingRequest"); + + if (!gD || !gI || !gAR || !gAIO) { + printf("ERROR: ANE private classes not found\n"); + return 1; + } + if (!gClient) { + printf("ERROR: _ANEClient not found\n"); + return 1; + } + if (!gChain) { + printf("ERROR: _ANEChainingRequest not found\n"); + return 1; + } + + printf("All required classes found.\n"); + + int CH = 64, SP = 32; + + _Float16 *w = (_Float16*)calloc(CH*CH, sizeof(_Float16)); + for (int i = 0; i < CH; i++) w[i*CH+i] = (_Float16)0.5f; + int ws = CH*CH*2, tot = 128+ws; + uint8_t *blob = (uint8_t*)calloc(tot, 1); + blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1; + *(uint32_t*)(blob+72)=ws; *(uint32_t*)(blob+80)=128; + memcpy(blob+128, w, ws); + NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES]; + free(w); + + // -- Phase 1: Compile two kernels -- + printf("\n--- Phase 1: Compile two identical conv kernels ---\n"); + CompiledKernel k1 = compile_kernel(gD, gI, CH, SP, wdata); + CompiledKernel k2 = compile_kernel(gD, gI, CH, SP, wdata); + + if (!k1.model || !k2.model) { + printf("ERROR: Failed to compile kernels\n"); + return 1; + } + printf(" Kernel 1: compiled and loaded\n"); + printf(" Kernel 2: compiled and loaded\n"); + + int bpe = g_fp16_io ? 2 : 4; + int ioBytes = CH * SP * bpe; + + IOSurfaceLock(k1.ioIn, 0, NULL); + if (g_fp16_io) { + _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(k1.ioIn); + for (int i = 0; i < CH*SP; i++) inp[i] = (_Float16)1.0f; + } else { + float *inp = (float*)IOSurfaceGetBaseAddress(k1.ioIn); + for (int i = 0; i < CH*SP; i++) inp[i] = 1.0f; + } + IOSurfaceUnlock(k1.ioIn, 0, NULL); + + // -- Phase 2: Baseline -- two sequential evals -- + printf("\n--- Phase 2: Baseline (sequential eval) ---\n"); + + id wI1 = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(gAIO, @selector(objectWithIOSurface:), k1.ioIn); + id wO1 = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(gAIO, @selector(objectWithIOSurface:), k1.ioOut); + id wI2 = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(gAIO, @selector(objectWithIOSurface:), k2.ioIn); + id wO2 = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(gAIO, @selector(objectWithIOSurface:), k2.ioOut); + + id req1 = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(gAR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wI1], @[@0], @[wO1], @[@0], nil, nil, @0); + id req2 = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(gAR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wI2], @[@0], @[wO2], @[@0], nil, nil, @0); + + NSError *e = nil; + + int WARMUP = 5, ITERS = 50; + for (int i = 0; i < WARMUP; i++) { + ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + k1.model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req1, &e); + IOSurfaceLock(k1.ioOut, 0, NULL); + memcpy(IOSurfaceGetBaseAddress(k2.ioIn), IOSurfaceGetBaseAddress(k1.ioOut), ioBytes); + IOSurfaceUnlock(k1.ioOut, 0, NULL); + ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + k2.model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req2, &e); + } + + uint64_t t0 = mach_absolute_time(); + for (int i = 0; i < ITERS; i++) { + ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + k1.model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req1, &e); + IOSurfaceLock(k1.ioOut, 0, NULL); + memcpy(IOSurfaceGetBaseAddress(k2.ioIn), IOSurfaceGetBaseAddress(k1.ioOut), ioBytes); + IOSurfaceUnlock(k1.ioOut, 0, NULL); + ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + k2.model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req2, &e); + } + double seq_ms = tb_ms(mach_absolute_time() - t0); + printf(" Sequential: %.3f ms total (%.3f ms/pair)\n", seq_ms, seq_ms / ITERS); + + IOSurfaceLock(k2.ioOut, 0, NULL); + if (g_fp16_io) { + _Float16 *out = (_Float16*)IOSurfaceGetBaseAddress(k2.ioOut); + printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", + (float)out[0], (float)out[1], (float)out[2], (float)out[3]); + } else { + float *out = (float*)IOSurfaceGetBaseAddress(k2.ioOut); + printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", out[0], out[1], out[2], out[3]); + } + IOSurfaceUnlock(k2.ioOut, 0, NULL); + + // -- Phase 3: Try ChainingRequest -- + printf("\n--- Phase 3: _ANEChainingRequest exploration ---\n"); + + id client = [gClient performSelector:@selector(sharedConnection)]; + if (!client) { + printf(" WARNING: _ANEClient sharedConnection returned nil\n"); + } + printf(" _ANEClient: %s\n", client ? "obtained" : "FAILED"); + + IOSurfaceRef ioMid = make_surface(ioBytes); + (void)((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(gAIO, @selector(objectWithIOSurface:), ioMid); + + @try { + id chainReq = ((id(*)(Class,SEL,id,id,id,id,id,id,id,id,id))objc_msgSend)(gChain, + @selector(chainingRequestWithInputs:outputSets:lbInputSymbolId:lbOutputSymbolId:procedureIndex:signalEvents:transactionHandle:fwEnqueueDelay:memoryPoolId:), + @[wI1], + @[@[wO1]], + @[@0], + @[@0], + @0, + @[], + @0, + @0, + @0); + + if (chainReq) { + printf(" ChainingRequest created: %s\n", [[chainReq description] UTF8String]); + + BOOL valid = ((BOOL(*)(id,SEL))objc_msgSend)(chainReq, @selector(validate)); + printf(" validate: %s\n", valid ? "YES" : "NO"); + + printf(" inputBuffer: %s\n", + [[[chainReq valueForKey:@"inputBuffer"] description] UTF8String]); + printf(" outputSets: %s\n", + [[[chainReq valueForKey:@"outputSets"] description] UTF8String]); + printf(" loopbackInputSymbolIndex: %s\n", + [[[chainReq valueForKey:@"loopbackInputSymbolIndex"] description] UTF8String]); + printf(" loopbackOutputSymbolIndex: %s\n", + [[[chainReq valueForKey:@"loopbackOutputSymbolIndex"] description] UTF8String]); + printf(" procedureIndex: %s\n", + [[[chainReq valueForKey:@"procedureIndex"] description] UTF8String]); + + @try { + BOOL ok = ((BOOL(*)(id,SEL,id,id,id,unsigned int,NSError**))objc_msgSend)( + client, + @selector(prepareChainingWithModel:options:chainingReq:qos:error:), + k1.model, @{}, chainReq, 21, &e); + printf(" prepareChainingWithModel: %s\n", ok ? "YES" : "NO"); + if (!ok && e) printf(" error: %s\n", [[e description] UTF8String]); + } @catch (NSException *ex) { + printf(" prepareChainingWithModel EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } else { + printf(" ChainingRequest: nil (creation failed)\n"); + } + } @catch (NSException *ex) { + printf(" ChainingRequest creation EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + // -- Phase 4: Try with loopback (output feeds back as input) -- + printf("\n--- Phase 4: ChainingRequest with loopback ---\n"); + @try { + id chainLoop = ((id(*)(Class,SEL,id,id,id,id,id,id,id,id,id))objc_msgSend)(gChain, + @selector(chainingRequestWithInputs:outputSets:lbInputSymbolId:lbOutputSymbolId:procedureIndex:signalEvents:transactionHandle:fwEnqueueDelay:memoryPoolId:), + @[wI1], + @[@[wO1], @[wO2]], + @[@0], + @[@0], + @0, + @[], + @0, + @0, + @0); + + if (chainLoop) { + printf(" Loopback ChainingRequest: %s\n", [[chainLoop description] UTF8String]); + + BOOL valid = ((BOOL(*)(id,SEL))objc_msgSend)(chainLoop, @selector(validate)); + printf(" validate: %s\n", valid ? "YES" : "NO"); + + @try { + BOOL ok = ((BOOL(*)(id,SEL,id,id,id,unsigned int,NSError**))objc_msgSend)( + client, + @selector(prepareChainingWithModel:options:chainingReq:qos:error:), + k1.model, @{}, chainLoop, 21, &e); + printf(" prepareChainingWithModel (loopback): %s\n", ok ? "YES" : "NO"); + if (!ok && e) printf(" error: %s\n", [[e description] UTF8String]); + + if (ok) { + @try { + BOOL enqOk = ((BOOL(*)(id,SEL,id,id,id,unsigned int,NSError**))objc_msgSend)( + client, + @selector(enqueueSetsWithModel:outputSet:options:qos:error:), + k1.model, @[wO1], @{}, 21, &e); + printf(" enqueueSets: %s\n", enqOk ? "YES" : "NO"); + if (!enqOk && e) printf(" error: %s\n", [[e description] UTF8String]); + } @catch (NSException *ex) { + printf(" enqueueSets EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + @try { + BOOL bufOk = ((BOOL(*)(id,SEL,id,id,id,unsigned int,NSError**))objc_msgSend)( + client, + @selector(buffersReadyWithModel:inputBuffers:options:qos:error:), + k1.model, @[wI1], @{}, 21, &e); + printf(" buffersReady: %s\n", bufOk ? "YES" : "NO"); + if (!bufOk && e) printf(" error: %s\n", [[e description] UTF8String]); + } @catch (NSException *ex) { + printf(" buffersReady EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } + } @catch (NSException *ex) { + printf(" Loopback test EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } else { + printf(" Loopback ChainingRequest: nil\n"); + } + } @catch (NSException *ex) { + printf(" Loopback creation EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + // -- Cleanup -- + NSFileManager *fm = [NSFileManager defaultManager]; + [fm removeItemAtPath:k1.tmpDir error:nil]; + [fm removeItemAtPath:k2.tmpDir error:nil]; + if (k1.ioIn) CFRelease(k1.ioIn); + if (k1.ioOut) CFRelease(k1.ioOut); + if (k2.ioIn) CFRelease(k2.ioIn); + if (k2.ioOut) CFRelease(k2.ioOut); + if (ioMid) CFRelease(ioMid); + + // -- Summary -- + printf("\n--- Summary ---\n"); + printf("Sequential baseline: %.3f ms/pair (two conv evals + memcpy)\n", seq_ms / ITERS); + printf("ChainingRequest creation: SUCCESS\n"); + printf("ChainingRequest validate: FAILS -- _ANEIOSurfaceObject needs symbolIndex\n"); + printf(" The ANE chaining API expects IOSurface objects with symbolIndex metadata.\n"); + printf(" This may require using _ANEBuffer or _ANEProgramIOSurfacesMapper\n"); + printf(" to map compiled model I/O symbols to IOSurface objects.\n"); + printf(" Next steps: explore _ANEModel.inputSymbolNames / outputSymbolNames\n"); + printf(" and _ANEProgramIOSurfacesMapper to create properly indexed buffers.\n"); + + printf("\n=== ChainingRequest prototype complete ===\n"); + } + return 0; +}