// test_throughput_ceiling.m — Experiment I: Multi-kernel throughput ceiling
// Measures CPU round-trip overhead for sequential ANE kernel execution
// Build: make test_throughput_ceiling && ./test_throughput_ceiling

#import <Foundation/Foundation.h>
#import <IOSurface/IOSurface.h>
#include <mach/mach_time.h>
#include "ane_runtime.h"

// Not referenced anywhere in this file; kept so the translation unit's
// file-scope symbols stay unchanged.
static int g_fp16_io = 1;

// Number of kernels compiled per configuration.
enum { kLayerCount = 12 };

/// Builds the MIL program text for a single fp16 convolution.
///
/// The eight format arguments fill, in order: the input shape (ch, sp), the
/// weight's declared shape (ch, ch), the weight's value shape (ch, ch) and the
/// output shape (ch, sp).
///
/// NOTE(review): the tensor type annotations and the `<ios18>` function target
/// were reconstructed from the argument order and the fp16 I/O sizing used by
/// compile_fp16_kernel — confirm against a known-good MIL file before relying
/// on them.
///
/// @param ch Channel count (input and output).
/// @param sp Spatial extent of the last dimension.
/// @return The MIL source as a string.
static NSString *gen_conv_mil_fp16(int ch, int sp) {
    return [NSString stringWithFormat:
        @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>"
        "({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
        " func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
        " tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"),"
        " val=tensor<string, []>(\"valid\")];\n"
        " tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"),"
        " val=tensor<int32, [2]>([1,1])];\n"
        " tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"),"
        " val=tensor<int32, [4]>([0,0,0,0])];\n"
        " tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"),"
        " val=tensor<int32, [2]>([1,1])];\n"
        " tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"),"
        " val=tensor<int32, []>(1)];\n"
        " tensor<fp16, [%d, %d, 1, 1]> W = const()[name=tensor<string, []>(\"W\"), "
        "val=tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path=tensor<string, []>"
        "(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n"
        " tensor<fp16, [1, %d, 1, %d]> y = conv(dilations=dl,groups=gr,"
        "pad=pd,pad_type=pt,strides=st,weight=W,x=x)"
        "[name=tensor<string, []>(\"conv\")];\n"
        " } -> (y);\n}\n",
        ch, sp, ch, ch, ch, ch, ch, sp];
}

/// Compiles one fp16 convolution kernel whose ch x ch weight is the identity
/// matrix.
///
/// The weight blob is 128 header bytes followed by ch*ch fp16 values.
///
/// @param ch Channel count.
/// @param sp Spatial extent.
/// @return Whatever ane_compile returns, or NULL if the weight buffer could
///         not be allocated.
static ANEKernel *compile_fp16_kernel(int ch, int sp) {
    size_t weightBytes = (size_t)ch * (size_t)ch * 2;
    size_t blobBytes = 128 + weightBytes;

    uint8_t *blob = (uint8_t *)calloc(blobBytes, 1);
    if (!blob) {
        return NULL;  // callers already treat NULL as "compile failed"
    }

    // Header fields, written at the fixed offsets the original code used.
    blob[0] = 1;
    blob[4] = 2;
    blob[64] = 0xEF;  // bytes 64..67 spell 0xDEADBEEF in little-endian order
    blob[65] = 0xBE;
    blob[66] = 0xAD;
    blob[67] = 0xDE;
    blob[68] = 1;
    *(uint32_t *)(blob + 72) = (uint32_t)weightBytes;  // payload size in bytes
    *(uint32_t *)(blob + 80) = 128;                    // payload offset

    // calloc zeroed the payload, so only the diagonal needs writing.
    _Float16 *weights = (_Float16 *)(blob + 128);
    for (int i = 0; i < ch; i++) {
        weights[i * ch + i] = (_Float16)1.0f;
    }

    // NSData takes ownership of the buffer and frees it on deallocation.
    NSData *weightData = [NSData dataWithBytesNoCopy:blob
                                              length:(NSUInteger)blobBytes
                                        freeWhenDone:YES];
    NSData *milData = [gen_conv_mil_fp16(ch, sp) dataUsingEncoding:NSUTF8StringEncoding];

    size_t ioBytes = (size_t)ch * sp * 2;  // fp16: two bytes per element
    return ane_compile(milData, weightData, 1, &ioBytes, 1, &ioBytes);
}

/// Converts a mach_absolute_time() tick delta to milliseconds.
static double ticks_to_ms(uint64_t ticks, mach_timebase_info_data_t tb) {
    return (double)ticks * tb.numer / tb.denom / 1e6;
}

/// Evaluates one kernel, swallowing any Objective-C exception.
/// @return 1 if ane_eval returned normally, 0 if it threw.
static int eval_guarded(ANEKernel *kernel) {
    @try {
        ane_eval(kernel);
        return 1;
    } @catch (NSException *ex) {
        (void)ex;
        return 0;
    }
}

/// Copies nbytes from src's first output surface into dst's first input
/// surface, holding both surfaces locked for the duration of the copy.
static void copy_output_to_input(ANEKernel *src, ANEKernel *dst, size_t nbytes) {
    IOSurfaceLock(src->ioOutputs[0], kIOSurfaceLockReadOnly, NULL);
    IOSurfaceLock(dst->ioInputs[0], 0, NULL);
    memcpy(IOSurfaceGetBaseAddress(dst->ioInputs[0]),
           IOSurfaceGetBaseAddress(src->ioOutputs[0]),
           nbytes);
    IOSurfaceUnlock(dst->ioInputs[0], 0, NULL);
    IOSurfaceUnlock(src->ioOutputs[0], kIOSurfaceLockReadOnly, NULL);
}

/// Prints the three summary lines shared by Test 1 and Test 2.
static void print_pass_stats(double msPerPass, int kernelCount) {
    printf(" Total: %.2f ms/pass (%d kernels)\n", msPerPass, kernelCount);
    printf(" Per kernel: %.3f ms\n", msPerPass / kernelCount);
    printf(" Throughput: %.0f kernels/s\n", kernelCount * 1000.0 / msPerPass);
}

/// Warns when evaluations threw during a timed loop, since the timing above
/// the warning then covers less work than it claims.
static void report_failures(int failures) {
    if (failures > 0) {
        printf(" WARNING: %d evaluation(s) threw; timings are unreliable\n", failures);
    }
}

/// Runs the four throughput experiments for each configuration.
/// @return 1 if the ANE is unavailable, 0 otherwise.
int main(int argc, const char *argv[]) {
    (void)argc;
    (void)argv;

    @autoreleasepool {
        mach_timebase_info_data_t tb;
        mach_timebase_info(&tb);

        printf("============================================================\n");
        printf(" Experiment I: Multi-Kernel Throughput Ceiling\n");
        printf(" Measuring CPU round-trip overhead for sequential ANE ops\n");
        printf("============================================================\n\n");

        ane_init();
        if (!g_ane_ok) {
            printf("ANE not available\n");
            return 1;
        }

        typedef struct {
            int ch;
            int sp;
            const char *name;
        } Config;

        Config configs[] = {
            {64, 32, "64x32 (test)"},
            {256, 64, "256x64 (small)"},
            {768, 256, "768x256 (prod)"},
        };
        int nconfigs = (int)(sizeof(configs) / sizeof(configs[0]));

        for (int ci = 0; ci < nconfigs; ci++) {
            Config cfg = configs[ci];
            printf("=== Config: %s ===\n", cfg.name);

            int nlayers = kLayerCount;
            ANEKernel *kernels[kLayerCount];
            int compiled = 0;

            // Stop at the first failure; kernels[0..compiled) are valid.
            for (int i = 0; i < nlayers; i++) {
                @try {
                    kernels[i] = compile_fp16_kernel(cfg.ch, cfg.sp);
                    if (!kernels[i]) {
                        printf(" Kernel %d compile failed\n", i);
                        break;
                    }
                    compiled++;
                } @catch (NSException *ex) {
                    printf(" Kernel %d exception: %s\n", i, [[ex reason] UTF8String]);
                    break;
                }
            }
            printf(" Compiled %d/%d kernels\n", compiled, nlayers);

            if (compiled < 2) {
                printf(" Need at least 2 kernels, skipping\n\n");
                for (int i = 0; i < compiled; i++) ane_free(kernels[i]);
                continue;
            }

            size_t ioBytes = (size_t)cfg.ch * cfg.sp * 2;
            int warmup = 5;
            int iters = 50;
            double sequentialMs = 0.0;  // ms/pass from Test 1
            double runOnlyMs = 0.0;     // ms/pass from Test 2

            // --- Test 1: Sequential (run + memcpy chain) ---
            printf("\n --- Test 1: Sequential (run + memcpy) ---\n");
            {
                // Untimed warmup; a throwing kernel ends that warmup pass.
                for (int w = 0; w < warmup; w++) {
                    for (int i = 0; i < compiled; i++) {
                        if (!eval_guarded(kernels[i])) break;
                    }
                }

                int failures = 0;
                uint64_t t0 = mach_absolute_time();
                for (int it = 0; it < iters; it++) {
                    for (int i = 0; i < compiled - 1; i++) {
                        if (!eval_guarded(kernels[i])) {
                            failures++;
                            continue;  // no output to forward
                        }
                        copy_output_to_input(kernels[i], kernels[i + 1], ioBytes);
                    }
                    if (!eval_guarded(kernels[compiled - 1])) failures++;
                }
                sequentialMs = ticks_to_ms(mach_absolute_time() - t0, tb) / iters;

                print_pass_stats(sequentialMs, compiled);
                report_failures(failures);
            }

            // --- Test 2: Run-only (no memcpy, pure ANE overhead) ---
            printf("\n --- Test 2: Run-only (no memcpy between) ---\n");
            {
                int failures = 0;
                uint64_t t0 = mach_absolute_time();
                for (int it = 0; it < iters; it++) {
                    for (int i = 0; i < compiled; i++) {
                        if (!eval_guarded(kernels[i])) failures++;
                    }
                }
                runOnlyMs = ticks_to_ms(mach_absolute_time() - t0, tb) / iters;

                print_pass_stats(runOnlyMs, compiled);
                report_failures(failures);
            }

            // --- Test 3: Memcpy-only overhead ---
            printf("\n --- Test 3: Memcpy-only overhead ---\n");
            {
                int passes = iters * 10;  // copies are cheap; run more of them
                uint64_t t0 = mach_absolute_time();
                for (int it = 0; it < passes; it++) {
                    for (int i = 0; i < compiled - 1; i++) {
                        copy_output_to_input(kernels[i], kernels[i + 1], ioBytes);
                    }
                }
                double perIter = ticks_to_ms(mach_absolute_time() - t0, tb) / passes;
                double perCopy = perIter / (compiled - 1);

                printf(" Total: %.3f ms/pass (%d copies)\n", perIter, compiled - 1);
                printf(" Per memcpy: %.4f ms (%lu bytes)\n", perCopy, (unsigned long)ioBytes);
            }

            // --- Test 4: GCD serial queue ---
            printf("\n --- Test 4: GCD serial queue ---\n");
            {
                dispatch_queue_t q = dispatch_queue_create("ane.throughput", DISPATCH_QUEUE_SERIAL);
                dispatch_semaphore_t sem = dispatch_semaphore_create(0);
                const int ncomp = compiled;
                __block int failures = 0;

                uint64_t t0 = mach_absolute_time();
                for (int it = 0; it < iters; it++) {
                    // Only touched from blocks on the serial queue, so the
                    // increments never race.
                    __block int done = 0;
                    for (int i = 0; i < ncomp; i++) {
                        ANEKernel *kp = kernels[i];  // blocks cannot capture the array
                        dispatch_async(q, ^{
                            if (!eval_guarded(kp)) failures++;
                            done++;
                            if (done == ncomp) dispatch_semaphore_signal(sem);
                        });
                    }
                    dispatch_semaphore_wait(sem, DISPATCH_TIME_FOREVER);
                }
                double perIter = ticks_to_ms(mach_absolute_time() - t0, tb) / iters;

                printf(" Total: %.2f ms/pass (%d kernels, serial queue)\n", perIter, ncomp);
                printf(" Per kernel: %.3f ms\n", perIter / ncomp);
                report_failures(failures);
            }

            printf("\n --- CPU Round-trip Overhead ---\n");
            printf(" Overhead = (Sequential - RunOnly) / %d copies\n", compiled - 1);
            printf(" Measured: %.4f ms per copy\n",
                   (sequentialMs - runOnlyMs) / (compiled - 1));
            printf(" This is what chaining would eliminate per layer.\n");

            for (int i = 0; i < compiled; i++) ane_free(kernels[i]);
            printf("\n");
        }

        printf("Done.\n");
    }
    return 0;
}