// Source listing: ANE/training/test_throughput_ceiling.m
// (239 lines, 11 KiB, Objective-C)

// test_throughput_ceiling.m — Experiment I: Multi-kernel throughput ceiling
// Measures CPU round-trip overhead for sequential ANE kernel execution
// Build: make test_throughput_ceiling && ./test_throughput_ceiling
#import <Foundation/Foundation.h>
#import <mach/mach_time.h>
#include <dispatch/dispatch.h>
#include "ane_runtime.h"
// File-local flag initialised to 1. Nothing in the visible part of this file
// reads it. NOTE(review): the name suggests it selects FP16 I/O surfaces;
// confirm whether any later code depends on it before removing.
static int g_fp16_io = 1;
// Produces the MIL program text for a single FP16 convolution.
//
// The program takes one input `x` of shape [1, ch, 1, sp], declares the
// convolution attributes (valid padding, unit strides and dilations, one
// group), loads a [ch, ch, 1, 1] weight tensor from offset 64 of
// "@model_path/weights/weight.bin", and returns the conv result `y` of shape
// [1, ch, 1, sp].
//
// ch: channel count, used for both input and output channels.
// sp: size of the last tensor dimension.
static NSString *gen_conv_mil_fp16(int ch, int sp) {
    NSMutableString *program = [NSMutableString string];

    // Program preamble and build metadata (no substitutions).
    [program appendString:
        @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>"
         "({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"];

    // Entry point signature.
    [program appendFormat:
        @" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n", ch, sp];

    // Fixed convolution attributes: pad type, strides, pad, dilations, groups.
    [program appendString:
        @" tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"),"
         " val=tensor<string, []>(\"valid\")];\n"
         " tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"),"
         " val=tensor<int32, [2]>([1,1])];\n"
         " tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"),"
         " val=tensor<int32, [4]>([0,0,0,0])];\n"
         " tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"),"
         " val=tensor<int32, [2]>([1,1])];\n"
         " tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"),"
         " val=tensor<int32, []>(1)];\n"];

    // Weight tensor, read from the external blob file.
    [program appendFormat:
        @" tensor<fp16, [%d,%d,1,1]> W = const()[name=tensor<string, []>(\"W\"), "
         "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>"
         "(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n",
        ch, ch, ch, ch];

    // The convolution itself.
    [program appendFormat:
        @" tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,"
         "pad=pd,pad_type=pt,strides=st,weight=W,x=x)"
         "[name=tensor<string, []>(\"conv\")];\n",
        ch, sp];

    // Function result and closing brace of the program.
    [program appendString:@" } -> (y);\n}\n"];

    return [NSString stringWithString:program];
}
// Builds an in-memory weight blob holding a ch x ch FP16 identity matrix and
// compiles the convolution produced by gen_conv_mil_fp16(ch, sp) with it.
//
// Blob layout as written here (all other bytes stay zero from calloc):
//   byte 0 = 1, byte 4 = 2
//   bytes 64..67 = EF BE AD DE
//   byte 68 = 1
//   bytes 72..75 = weight payload size in bytes (host-order uint32)
//   bytes 80..83 = 128, the offset at which the payload starts
//   bytes 128..  = ch*ch FP16 values, 1.0 on the diagonal
// NOTE(review): presumably this matches the blob-file header the ANE compiler
// expects; the field meanings are not defined in this file.
//
// ch: channel count; sp: last tensor dimension. Both must be positive.
//
// Returns whatever ane_compile() returns, or NULL when a dimension is not
// positive, the payload size does not fit the 32-bit size field, or the blob
// cannot be allocated.
static ANEKernel *compile_fp16_kernel(int ch, int sp) {
    if (ch <= 0 || sp <= 0) return NULL;

    // Size arithmetic is done in size_t so large channel counts cannot
    // overflow a signed int.
    const size_t payloadOffset = 128;
    const size_t weightBytes = (size_t)ch * (size_t)ch * sizeof(_Float16);
    if (weightBytes > UINT32_MAX) return NULL;  // would not fit the size field
    const size_t blobBytes = payloadOffset + weightBytes;

    uint8_t *blob = (uint8_t *)calloc(blobBytes, 1);
    if (!blob) return NULL;

    blob[0] = 1; blob[4] = 2;
    blob[64] = 0xEF; blob[65] = 0xBE; blob[66] = 0xAD; blob[67] = 0xDE;
    blob[68] = 1;

    // Write the 32-bit fields with memcpy rather than through a cast pointer.
    const uint32_t sizeField = (uint32_t)weightBytes;
    const uint32_t offsetField = (uint32_t)payloadOffset;
    memcpy(blob + 72, &sizeField, sizeof sizeField);
    memcpy(blob + 80, &offsetField, sizeof offsetField);

    // Identity weights: only the diagonal is set, the rest is already zero.
    _Float16 *wp = (_Float16 *)(blob + payloadOffset);
    const size_t n = (size_t)ch;
    for (size_t i = 0; i < n; i++) wp[i * n + i] = (_Float16)1.0f;

    // The NSData takes ownership of `blob` and frees it when deallocated.
    NSData *wdata = [NSData dataWithBytesNoCopy:blob
                                         length:(NSUInteger)blobBytes
                                   freeWhenDone:YES];
    NSString *mil = gen_conv_mil_fp16(ch, sp);
    NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];

    // One input and one output, each ch*sp FP16 elements.
    size_t ioBytes = (size_t)ch * (size_t)sp * sizeof(_Float16);
    return ane_compile(md, wdata, 1, &ioBytes, 1, &ioBytes);
}
// Returns the milliseconds elapsed since `start` (a mach_absolute_time()
// reading), converted with the numer/denom ratio in `tb`.
static double throughput_elapsed_ms(uint64_t start, mach_timebase_info_data_t tb) {
    return (double)(mach_absolute_time() - start) * tb.numer / tb.denom / 1e6;
}

// Copies `nbytes` from the first output surface of `src` into the first input
// surface of `dst`. The source is locked read-only and the destination is
// locked with no options for the duration of the memcpy.
static void throughput_copy_output_to_input(ANEKernel *src, ANEKernel *dst,
                                            size_t nbytes) {
    IOSurfaceLock(src->ioOutputs[0], kIOSurfaceLockReadOnly, NULL);
    IOSurfaceLock(dst->ioInputs[0], 0, NULL);
    memcpy(IOSurfaceGetBaseAddress(dst->ioInputs[0]),
           IOSurfaceGetBaseAddress(src->ioOutputs[0]),
           nbytes);
    IOSurfaceUnlock(dst->ioInputs[0], 0, NULL);
    IOSurfaceUnlock(src->ioOutputs[0], kIOSurfaceLockReadOnly, NULL);
}

// Experiment driver. For each tensor configuration it compiles up to 12
// identical convolution kernels and times four scenarios:
//   1. evaluating the kernels in order, copying each output into the next
//      kernel's input on the CPU;
//   2. evaluating the kernels in order with no copies;
//   3. the copies alone;
//   4. evaluating the kernels from blocks submitted to a serial GCD queue.
// It then reports the per-copy overhead derived from scenarios 1 and 2.
//
// Returns 1 when g_ane_ok is false after ane_init(), otherwise 0.
int main(int argc, const char *argv[]) {
    (void)argc; (void)argv;
    @autoreleasepool {
        mach_timebase_info_data_t tb;
        mach_timebase_info(&tb);
        printf("============================================================\n");
        printf(" Experiment I: Multi-Kernel Throughput Ceiling\n");
        printf(" Measuring CPU round-trip overhead for sequential ANE ops\n");
        printf("============================================================\n\n");
        ane_init();
        if (!g_ane_ok) { printf("ANE not available\n"); return 1; }

        typedef struct { int ch; int sp; const char *name; } Config;
        Config configs[] = {
            {64, 32, "64x32 (test)"},
            {256, 64, "256x64 (small)"},
            {768, 256, "768x256 (prod)"},
        };
        int nconfigs = (int)(sizeof(configs) / sizeof(configs[0]));

        // Single source of truth for the kernel array size and the loop bound.
        enum { kLayerCount = 12 };

        for (int ci = 0; ci < nconfigs; ci++) {
            Config cfg = configs[ci];
            printf("=== Config: %s ===\n", cfg.name);

            const int nlayers = kLayerCount;
            ANEKernel *kernels[kLayerCount] = {NULL};
            int compiled = 0;
            // Compile until the first failure; only the first `compiled`
            // entries of `kernels` are valid afterwards.
            for (int i = 0; i < nlayers; i++) {
                @try {
                    kernels[i] = compile_fp16_kernel(cfg.ch, cfg.sp);
                    if (!kernels[i]) {
                        printf(" Kernel %d compile failed\n", i);
                        break;
                    }
                    compiled++;
                } @catch (NSException *ex) {
                    printf(" Kernel %d exception: %s\n", i,
                           [[ex reason] UTF8String]);
                    break;
                }
            }
            printf(" Compiled %d/%d kernels\n", compiled, nlayers);
            if (compiled < 2) {
                printf(" Need at least 2 kernels, skipping\n\n");
                for (int i = 0; i < compiled; i++) ane_free(kernels[i]);
                continue;
            }

            size_t ioBytes = (size_t)cfg.ch * cfg.sp * 2;
            int warmup = 5;
            int iters = 50;

            // Exceptions raised by ane_eval are still tolerated, but counted
            // so that a run with failures is not mistaken for a valid timing.
            // __block because the GCD test increments it from inside a block.
            __block int evalFailures = 0;

            // Kept outside the test scopes so the overhead can be computed.
            double seqPerIter = 0.0;
            double runPerIter = 0.0;

            // --- Test 1: Sequential (run + memcpy chain) ---
            printf("\n --- Test 1: Sequential (run + memcpy) ---\n");
            {
                for (int w = 0; w < warmup; w++) {
                    @try {
                        for (int i = 0; i < compiled; i++)
                            ane_eval(kernels[i]);
                    } @catch (NSException *ex) { (void)ex; evalFailures++; }
                }
                uint64_t t0 = mach_absolute_time();
                for (int it = 0; it < iters; it++) {
                    for (int i = 0; i < compiled - 1; i++) {
                        @try {
                            ane_eval(kernels[i]);
                            throughput_copy_output_to_input(kernels[i],
                                                            kernels[i + 1],
                                                            ioBytes);
                        } @catch (NSException *ex) { (void)ex; evalFailures++; }
                    }
                    // The last kernel has no successor to copy into.
                    @try {
                        ane_eval(kernels[compiled - 1]);
                    } @catch (NSException *ex) { (void)ex; evalFailures++; }
                }
                double totalMs = throughput_elapsed_ms(t0, tb);
                double perIter = totalMs / iters;
                double perKernel = perIter / compiled;
                seqPerIter = perIter;
                printf(" Total: %.2f ms/pass (%d kernels)\n", perIter, compiled);
                printf(" Per kernel: %.3f ms\n", perKernel);
                printf(" Throughput: %.0f kernels/s\n", compiled * 1000.0 / perIter);
            }

            // --- Test 2: Run-only (no memcpy, pure ANE overhead) ---
            printf("\n --- Test 2: Run-only (no memcpy between) ---\n");
            {
                uint64_t t0 = mach_absolute_time();
                for (int it = 0; it < iters; it++) {
                    for (int i = 0; i < compiled; i++) {
                        @try {
                            ane_eval(kernels[i]);
                        } @catch (NSException *ex) { (void)ex; evalFailures++; }
                    }
                }
                double totalMs = throughput_elapsed_ms(t0, tb);
                double perIter = totalMs / iters;
                double perKernel = perIter / compiled;
                runPerIter = perIter;
                printf(" Total: %.2f ms/pass (%d kernels)\n", perIter, compiled);
                printf(" Per kernel: %.3f ms\n", perKernel);
                printf(" Throughput: %.0f kernels/s\n", compiled * 1000.0 / perIter);
            }

            // --- Test 3: Memcpy-only overhead ---
            printf("\n --- Test 3: Memcpy-only overhead ---\n");
            {
                // Ten times more passes than the other tests.
                uint64_t t0 = mach_absolute_time();
                for (int it = 0; it < iters * 10; it++) {
                    for (int i = 0; i < compiled - 1; i++) {
                        throughput_copy_output_to_input(kernels[i],
                                                        kernels[i + 1],
                                                        ioBytes);
                    }
                }
                double totalMs = throughput_elapsed_ms(t0, tb);
                double perIter = totalMs / (iters * 10);
                double perCopy = perIter / (compiled - 1);
                printf(" Total: %.3f ms/pass (%d copies)\n", perIter, compiled - 1);
                printf(" Per memcpy: %.4f ms (%lu bytes)\n", perCopy, (unsigned long)ioBytes);
            }

            // --- Test 4: GCD serial queue ---
            printf("\n --- Test 4: GCD serial queue ---\n");
            {
                dispatch_queue_t q = dispatch_queue_create(
                    "ane.throughput", DISPATCH_QUEUE_SERIAL);
                dispatch_semaphore_t sem = dispatch_semaphore_create(0);
                const int ncomp = compiled;
                uint64_t t0 = mach_absolute_time();
                for (int it = 0; it < iters; it++) {
                    // Only touched by blocks on the serial queue, so the
                    // increments never race.
                    __block int done = 0;
                    for (int i = 0; i < ncomp; i++) {
                        // Blocks cannot capture the array itself; capture the
                        // element through a local pointer instead of a heap
                        // copy of the whole array.
                        ANEKernel *kp = kernels[i];
                        dispatch_async(q, ^{
                            @try {
                                ane_eval(kp);
                            } @catch (NSException *ex) { (void)ex; evalFailures++; }
                            done++;
                            if (done == ncomp)
                                dispatch_semaphore_signal(sem);
                        });
                    }
                    // Wait for the final block of this pass before continuing.
                    dispatch_semaphore_wait(sem, DISPATCH_TIME_FOREVER);
                }
                double totalMs = throughput_elapsed_ms(t0, tb);
                double perIter = totalMs / iters;
                printf(" Total: %.2f ms/pass (%d kernels, serial queue)\n",
                       perIter, ncomp);
                printf(" Per kernel: %.3f ms\n", perIter / ncomp);
            }

            printf("\n --- CPU Round-trip Overhead ---\n");
            printf(" Overhead = (Sequential - RunOnly) / %d copies\n", compiled - 1);
            // compiled >= 2 here, so the divisor is at least 1.
            printf(" Measured: %.3f ms per copy\n",
                   (seqPerIter - runPerIter) / (compiled - 1));
            printf(" This is what chaining would eliminate per layer.\n");
            if (evalFailures > 0) {
                printf(" WARNING: %d ane_eval call(s) raised an exception; "
                       "timings above are unreliable\n", evalFailures);
            }

            for (int i = 0; i < compiled; i++) ane_free(kernels[i]);
            printf("\n");
        }
        printf("Done.\n");
    }
    return 0;
}