mirror of https://github.com/maderix/ANE.git
239 lines
11 KiB
Objective-C
239 lines
11 KiB
Objective-C
// test_throughput_ceiling.m — Experiment I: Multi-kernel throughput ceiling
|
|
// Measures CPU round-trip overhead for sequential ANE kernel execution
|
|
// Build: make test_throughput_ceiling && ./test_throughput_ceiling
|
|
#import <Foundation/Foundation.h>
|
|
#import <mach/mach_time.h>
|
|
#include <dispatch/dispatch.h>
|
|
#include "ane_runtime.h"
|
|
|
|
// File-scope flag, initialised to 1. Nothing in the visible part of this file
// reads it. NOTE(review): presumably signals fp16 I/O tensors to shared code;
// confirm whether it is still needed.
static int g_fp16_io = 1;
|
|
|
|
// Produces the MIL program text for a single conv op.
//
// The program takes one fp16 input `x` of shape [1, ch, 1, sp], applies a conv
// with a [ch, ch, 1, 1] fp16 weight `W` (read from weights/weight.bin at
// offset 64), "valid" padding, unit strides/dilations and groups = 1, and
// returns `y` of shape [1, ch, 1, sp].
//
//   ch  channel count substituted into the input, weight and output shapes
//   sp  last-dimension extent substituted into the input and output shapes
//
// Returns the program text as an immutable NSString.
static NSString *gen_conv_mil_fp16(int ch, int sp) {
    NSMutableString *text = [NSMutableString string];

    // Program header: contains no substitutions, so it is appended verbatim.
    [text appendString:
        @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>"
         "({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"];

    // Entry point signature with the input tensor shape.
    [text appendFormat:
        @" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n", ch, sp];

    // Constant conv attributes: pad type, strides, pad, dilations, groups.
    [text appendString:
        @" tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"),"
         " val=tensor<string, []>(\"valid\")];\n"
         " tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"),"
         " val=tensor<int32, [2]>([1,1])];\n"
         " tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"),"
         " val=tensor<int32, [4]>([0,0,0,0])];\n"
         " tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"),"
         " val=tensor<int32, [2]>([1,1])];\n"
         " tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"),"
         " val=tensor<int32, []>(1)];\n"];

    // Weight tensor, loaded from the blob file.
    [text appendFormat:
        @" tensor<fp16, [%d,%d,1,1]> W = const()[name=tensor<string, []>(\"W\"), "
         "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>"
         "(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n",
        ch, ch, ch, ch];

    // The conv op itself and the output tensor shape.
    [text appendFormat:
        @" tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,"
         "pad=pd,pad_type=pt,strides=st,weight=W,x=x)"
         "[name=tensor<string, []>(\"conv\")];\n",
        ch, sp];

    // Function and program terminators.
    [text appendString:@" } -> (y);\n}\n"];

    return [NSString stringWithString:text];
}
|
|
|
|
// Builds the weight blob and MIL text for a ch-channel conv and hands both to
// ane_compile().
//
// Blob layout written here (everything else stays zero from calloc):
//   [0] = 1, [4] = 2            leading header bytes
//   [64..67] = EF BE AD DE      0xDEADBEEF stored little-endian
//   [68] = 1
//   [72] = weight byte count    (uint32)
//   [80] = 128                  (uint32) — equals the offset where the weights
//                               begin in this buffer
//   [128..] = ch*ch fp16 values, 1.0 on the diagonal and 0 elsewhere
// NOTE(review): the meaning of the individual header fields is defined by the
// blob-file consumer, not by this file; the labels above only describe the
// values written. Offset 64 matches the BLOBFILE offset in gen_conv_mil_fp16().
//
//   ch  channel count (must be > 0)
//   sp  last-dimension extent of the I/O tensors (must be > 0)
//
// Returns the kernel from ane_compile(), or NULL when the arguments are not
// positive, the weight size does not fit the 32-bit header field, or the blob
// cannot be allocated. Callers already treat NULL as a compile failure.
static ANEKernel *compile_fp16_kernel(int ch, int sp) {
    if (ch <= 0 || sp <= 0) return NULL;

    // Size arithmetic is done in size_t so large channel counts cannot
    // overflow int before being stored or passed to calloc.
    const size_t headerBytes = 128;
    const size_t weightBytes = (size_t)ch * (size_t)ch * sizeof(_Float16);
    if (weightBytes > UINT32_MAX) return NULL;  // size field at +72 is 32 bits wide
    const size_t totalBytes = headerBytes + weightBytes;

    uint8_t *blob = (uint8_t *)calloc(totalBytes, 1);
    if (!blob) return NULL;

    blob[0] = 1; blob[4] = 2;
    blob[64] = 0xEF; blob[65] = 0xBE; blob[66] = 0xAD; blob[67] = 0xDE;
    blob[68] = 1;
    *(uint32_t *)(blob + 72) = (uint32_t)weightBytes;
    *(uint32_t *)(blob + 80) = (uint32_t)headerBytes;

    // Identity weight matrix: only the diagonal needs writing because calloc
    // already zero-filled the rest.
    _Float16 *wp = (_Float16 *)(blob + headerBytes);
    for (int i = 0; i < ch; i++) wp[(size_t)i * (size_t)ch + (size_t)i] = (_Float16)1.0f;

    // NSData takes ownership of blob and frees it when it is deallocated.
    NSData *wdata = [NSData dataWithBytesNoCopy:blob
                                         length:(NSUInteger)totalBytes
                                   freeWhenDone:YES];

    NSString *mil = gen_conv_mil_fp16(ch, sp);
    NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];

    // One fp16 tensor of ch*sp elements; the same size is passed for both of
    // the size arguments. NOTE(review): presumably (nInputs, inputSizes,
    // nOutputs, outputSizes) — confirm against ane_runtime.h.
    size_t ioBytes = (size_t)ch * sp * 2;
    return ane_compile(md, wdata, 1, &ioBytes, 1, &ioBytes);
}
|
|
|
|
// Converts a mach_absolute_time() tick delta into milliseconds.
static inline double tc_ticks_to_ms(uint64_t ticks, mach_timebase_info_data_t tb) {
    return (double)ticks * tb.numer / tb.denom / 1e6;
}

// Evaluates one kernel. Returns 0 on success, 1 if ane_eval raised an NSException.
static inline int tc_eval(ANEKernel *k) {
    @try {
        ane_eval(k);
        return 0;
    } @catch (NSException *ex) {
        (void)ex;
        return 1;
    }
}

// Copies `bytes` bytes from src's first output surface to dst's first input
// surface, holding both surface locks for the duration of the copy.
static inline void tc_copy_output_to_input(ANEKernel *src, ANEKernel *dst, size_t bytes) {
    IOSurfaceLock(src->ioOutputs[0], kIOSurfaceLockReadOnly, NULL);
    IOSurfaceLock(dst->ioInputs[0], 0, NULL);
    memcpy(IOSurfaceGetBaseAddress(dst->ioInputs[0]),
           IOSurfaceGetBaseAddress(src->ioOutputs[0]),
           bytes);
    IOSurfaceUnlock(dst->ioInputs[0], 0, NULL);
    IOSurfaceUnlock(src->ioOutputs[0], kIOSurfaceLockReadOnly, NULL);
}

// Prints a warning when a timed section swallowed ane_eval exceptions, since
// the timings printed for that section then do not reflect real work.
static void tc_report_failures(int failures) {
    if (failures > 0) {
        printf(" WARNING: %d ane_eval call(s) raised an exception; "
               "timings for this test are unreliable\n", failures);
    }
}

// Benchmark driver. For each tensor configuration it compiles up to 12
// identical conv kernels and times four scenarios:
//   1. sequential eval with an output->input copy between consecutive kernels
//   2. eval only, no copies
//   3. the copies alone
//   4. eval dispatched through a GCD serial queue
// and finally reports the per-copy difference between (1) and (2).
// Returns 1 if the ANE is unavailable after ane_init(), otherwise 0.
int main(int argc, const char *argv[]) {
    (void)argc; (void)argv;
    @autoreleasepool {
        mach_timebase_info_data_t tb;
        mach_timebase_info(&tb);

        printf("============================================================\n");
        printf(" Experiment I: Multi-Kernel Throughput Ceiling\n");
        printf(" Measuring CPU round-trip overhead for sequential ANE ops\n");
        printf("============================================================\n\n");

        ane_init();
        if (!g_ane_ok) { printf("ANE not available\n"); return 1; }

        typedef struct { int ch; int sp; const char *name; } Config;
        Config configs[] = {
            {64, 32, "64x32 (test)"},
            {256, 64, "256x64 (small)"},
            {768, 256, "768x256 (prod)"},
        };
        const int nconfigs = (int)(sizeof(configs) / sizeof(configs[0]));

        // Single source of truth for the kernel-array size and the loop bound.
        enum { kLayerCount = 12 };
        const int warmup = 5;
        const int iters = 50;

        for (int ci = 0; ci < nconfigs; ci++) {
            // Per-config pool so compile-time temporaries (MIL text, weight
            // data) are released before the next configuration starts.
            @autoreleasepool {
                Config cfg = configs[ci];
                printf("=== Config: %s ===\n", cfg.name);

                const int nlayers = kLayerCount;
                ANEKernel *kernels[kLayerCount];
                int compiled = 0;
                for (int i = 0; i < nlayers; i++) {
                    @try {
                        kernels[i] = compile_fp16_kernel(cfg.ch, cfg.sp);
                        if (!kernels[i]) {
                            printf(" Kernel %d compile failed\n", i);
                            break;
                        }
                        compiled++;
                    } @catch (NSException *ex) {
                        // -reason may be nil; never hand NULL to %s.
                        const char *why = [[ex reason] UTF8String];
                        printf(" Kernel %d exception: %s\n", i,
                               why ? why : "(no reason given)");
                        break;
                    }
                }
                printf(" Compiled %d/%d kernels\n", compiled, nlayers);
                if (compiled < 2) {
                    printf(" Need at least 2 kernels, skipping\n\n");
                    for (int i = 0; i < compiled; i++) ane_free(kernels[i]);
                    continue;
                }

                // From here on compiled >= 2, so (compiled - 1) is never zero.
                const size_t ioBytes = (size_t)cfg.ch * cfg.sp * 2;
                double seqPerIter = 0.0;  // Test 1 result, ms per pass
                double runPerIter = 0.0;  // Test 2 result, ms per pass

                // --- Test 1: Sequential (run + memcpy chain) ---
                printf("\n --- Test 1: Sequential (run + memcpy) ---\n");
                {
                    // Best-effort warm-up: a pass stops at the first failing kernel.
                    for (int w = 0; w < warmup; w++) {
                        for (int i = 0; i < compiled; i++) {
                            if (tc_eval(kernels[i])) break;
                        }
                    }

                    int failures = 0;
                    uint64_t t0 = mach_absolute_time();
                    for (int it = 0; it < iters; it++) {
                        for (int i = 0; i < compiled - 1; i++) {
                            // The copy is skipped when the eval failed.
                            if (tc_eval(kernels[i])) { failures++; continue; }
                            tc_copy_output_to_input(kernels[i], kernels[i + 1], ioBytes);
                        }
                        failures += tc_eval(kernels[compiled - 1]);
                    }
                    double totalMs = tc_ticks_to_ms(mach_absolute_time() - t0, tb);
                    seqPerIter = totalMs / iters;
                    double perKernel = seqPerIter / compiled;
                    printf(" Total: %.2f ms/pass (%d kernels)\n", seqPerIter, compiled);
                    printf(" Per kernel: %.3f ms\n", perKernel);
                    printf(" Throughput: %.0f kernels/s\n", compiled * 1000.0 / seqPerIter);
                    tc_report_failures(failures);
                }

                // --- Test 2: Run-only (no memcpy, pure ANE overhead) ---
                printf("\n --- Test 2: Run-only (no memcpy between) ---\n");
                {
                    int failures = 0;
                    uint64_t t0 = mach_absolute_time();
                    for (int it = 0; it < iters; it++) {
                        for (int i = 0; i < compiled; i++) {
                            failures += tc_eval(kernels[i]);
                        }
                    }
                    double totalMs = tc_ticks_to_ms(mach_absolute_time() - t0, tb);
                    runPerIter = totalMs / iters;
                    double perKernel = runPerIter / compiled;
                    printf(" Total: %.2f ms/pass (%d kernels)\n", runPerIter, compiled);
                    printf(" Per kernel: %.3f ms\n", perKernel);
                    printf(" Throughput: %.0f kernels/s\n", compiled * 1000.0 / runPerIter);
                    tc_report_failures(failures);
                }

                // --- Test 3: Memcpy-only overhead ---
                printf("\n --- Test 3: Memcpy-only overhead ---\n");
                {
                    // Copies are cheap, so run 10x the passes for a stable figure.
                    const int copyIters = iters * 10;
                    uint64_t t0 = mach_absolute_time();
                    for (int it = 0; it < copyIters; it++) {
                        for (int i = 0; i < compiled - 1; i++) {
                            tc_copy_output_to_input(kernels[i], kernels[i + 1], ioBytes);
                        }
                    }
                    double totalMs = tc_ticks_to_ms(mach_absolute_time() - t0, tb);
                    double perIter = totalMs / copyIters;
                    double perCopy = perIter / (compiled - 1);
                    printf(" Total: %.3f ms/pass (%d copies)\n", perIter, compiled - 1);
                    printf(" Per memcpy: %.4f ms (%lu bytes)\n", perCopy, (unsigned long)ioBytes);
                }

                // --- Test 4: GCD serial queue ---
                printf("\n --- Test 4: GCD serial queue ---\n");
                {
                    dispatch_queue_t q = dispatch_queue_create(
                        "ane.throughput", DISPATCH_QUEUE_SERIAL);
                    dispatch_semaphore_t sem = dispatch_semaphore_create(0);
                    const int ncomp = compiled;
                    // Only touched by blocks on the serial queue, and read here
                    // after the final semaphore wait.
                    __block int failures = 0;

                    uint64_t t0 = mach_absolute_time();
                    for (int it = 0; it < iters; it++) {
                        __block int done = 0;
                        for (int i = 0; i < ncomp; i++) {
                            // Blocks cannot capture a C array, but they can
                            // capture this local pointer by value.
                            ANEKernel *kp = kernels[i];
                            dispatch_async(q, ^{
                                failures += tc_eval(kp);
                                done++;
                                // The queue is serial, so the last block of the
                                // pass is the one that observes done == ncomp.
                                if (done == ncomp)
                                    dispatch_semaphore_signal(sem);
                            });
                        }
                        dispatch_semaphore_wait(sem, DISPATCH_TIME_FOREVER);
                    }
                    double totalMs = tc_ticks_to_ms(mach_absolute_time() - t0, tb);
                    double perIter = totalMs / iters;
                    printf(" Total: %.2f ms/pass (%d kernels, serial queue)\n",
                           perIter, ncomp);
                    printf(" Per kernel: %.3f ms\n", perIter / ncomp);
                    tc_report_failures(failures);
                }

                printf("\n --- CPU Round-trip Overhead ---\n");
                printf(" Overhead = (Sequential - RunOnly) / %d copies\n", compiled - 1);
                printf(" Measured: %.4f ms per copy\n",
                       (seqPerIter - runPerIter) / (compiled - 1));
                printf(" This is what chaining would eliminate per layer.\n");

                for (int i = 0; i < compiled; i++) ane_free(kernels[i]);
                printf("\n");
            }
        }

        printf("Done.\n");
    }
    return 0;
}
|