feat(training): add M5 ANE pipeline benchmark suite

This commit is contained in:
Livia 2026-03-04 14:13:21 -05:00
parent b8d2069c48
commit 6f398781d7
2 changed files with 818 additions and 1 deletions

View File

@ -39,13 +39,16 @@ test_ane_advanced: test_ane_advanced.m
m5_performance_suite: m5_performance_suite.m ane_runtime.h
$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
m5_pipeline_suite: m5_pipeline_suite.m ane_runtime.h ane_mil_gen.h
$(CC) $(CFLAGS) -Wno-unused-function -Wno-gnu-folding-constant -o $@ $< $(LDFLAGS)
probes: $(PROBES)
tokenize:
python3 tokenize.py
clean:
rm -f train train_large train_large_ane $(PROBES) test_rmsnorm_bwd test_classifier m5_performance_suite
rm -f train train_large train_large_ane $(PROBES) test_rmsnorm_bwd test_classifier m5_performance_suite m5_pipeline_suite
.PHONY: clean tokenize probes

View File

@ -0,0 +1,814 @@
/*
* m5_pipeline_suite.m
* M5 ANE Pipeline Benchmark Suite
* High-fidelity benchmarking for training pipeline simulation
*/
#import <Foundation/Foundation.h>
#import <objc/runtime.h>
#import <objc/message.h>
#import <dlfcn.h>
#import <IOSurface/IOSurface.h>
#import <mach/mach_time.h>
#import <mach/mach.h>
#include <string.h>
#include <stdlib.h>
#include "ane_runtime.h"
// ---- Benchmark tuning constants -------------------------------------------

// QoS value handed to the compile/load/unload/evaluate calls on the ANE model.
const uint32_t ANE_QOS_CLASS = 21;
// Number of untimed evaluations run before each measurement.
const uint32_t WARMUP_ITERATIONS = 10;
// Number of timed evaluations averaged per sequence length in the sweep.
const uint32_t BENCHMARK_ITERATIONS = 100;
// IOSurface sizes are rounded up to a multiple of this many bytes.
const uint32_t IOSURFACE_ALIGNMENT_BYTES = 128;
// Option values passed to IOSurfaceLock/IOSurfaceUnlock.
// NOTE(review): 1 presumably mirrors kIOSurfaceLockReadOnly — confirm against the SDK header.
const uint32_t IOSURFACE_LOCK_READ_ONLY = 1;
const uint32_t IOSURFACE_LOCK_DEFAULT = 0;
// Unit conversion factors. NANOSECONDS_PER_MICROSECOND (1e3) is also used in this
// file as the microseconds-to-milliseconds divisor, since both factors are 1e3.
const double NANOSECONDS_PER_MILLISECOND = 1e6;
const double NANOSECONDS_PER_MICROSECOND = 1e3;
const double NANOSECONDS_PER_SECOND = 1e9;
// Decimal (SI) megabyte/gigabyte, used for reporting sizes and bandwidth.
const double BYTES_PER_MEGABYTE = 1e6;
const double BYTES_PER_GIGABYTE = 1e9;
// Layer count used by main() for both the stress test and the training simulator.
const int STRESS_TEST_LAYERS = 24;
// Square matmul dimension for the stress test.
const int STRESS_TEST_DIM = 4096;
// Matmul dimension for the long-sequence sweep.
const int LONG_SEQ_DIM = 768;
// Matmul dimension and sequence length for the training simulator.
const int TRAINING_DIM = 768;
const int TRAINING_SEQ = 1024;
// Sequence length used by the stress test (a single activation column).
const int STRESS_TEST_SEQ = 1;
// ---- MIL / framework string constants --------------------------------------
// Only MIL_VERSION_1_3, MIL_TARGET_IOS17 and ANE_FRAMEWORK_PATH are referenced by
// the code visible in this file. The MIL generators embed the same version,
// target and buildInfo values as hard-coded literals rather than reading these.
static NSString* const MIL_VERSION_1_3 = @"1.3";
static NSString* const MIL_VERSION_1_5 = @"1.5";
static NSString* const MIL_TARGET_IOS17 = @"ios17";
static NSString* const MIL_TARGET_IOS18 = @"ios18";
// Path passed to dlopen() so the private ANE classes can be looked up by name.
static NSString* const ANE_FRAMEWORK_PATH = @"/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine";
// Key/value pairs matching the buildInfo dictionary in the generated MIL header.
static NSString* const MIL_BUILD_INFO_COMPONENT_MIL_KEY = @"coremlc-component-MIL";
static NSString* const MIL_BUILD_INFO_COMPONENT_MIL_VAL = @"3510.2.1";
static NSString* const MIL_BUILD_INFO_VER_KEY = @"coremlc-version";
static NSString* const MIL_BUILD_INFO_VER_VAL = @"3505.4.1";
static NSString* const MIL_BUILD_INFO_MILINTERNAL_KEY = @"coremltools-component-milinternal";
static NSString* const MIL_BUILD_INFO_MILINTERNAL_VAL = @"";
static NSString* const MIL_BUILD_INFO_TOOLS_VER_KEY = @"coremltools-version";
static NSString* const MIL_BUILD_INFO_TOOLS_VER_VAL = @"9.0";
// Private ANE classes resolved by suite_ane_init():
//   g_D   = _ANEInMemoryModelDescriptor
//   g_I   = _ANEInMemoryModel
//   g_AR  = _ANERequest
//   g_AIO = _ANEIOSurfaceObject
// They remain Nil until suite_ane_init() has run successfully.
static Class g_D, g_I, g_AR, g_AIO;
// Mach timebase (numer/denom) filled in by suite_ane_init(); used by tb_ms/tb_us/tb_s.
static mach_timebase_info_data_t g_tb;
// A compiled and loaded ANE kernel together with its I/O buffers.
// Allocated by compile_kern_mil() and destroyed by free_kern().
typedef struct {
// Retained (CFBridgingRetain) in-memory model object.
void *model;
// Input surface sized from in_bytes.
IOSurfaceRef ioIn;
// Optional second-input surface; only created when weight_bytes > 0, otherwise NULL.
IOSurfaceRef ioWeights;
// Output surface sized from out_bytes.
IOSurfaceRef ioOut;
// Retained (CFBridgingRetain) request object binding the surfaces to the model.
void *request;
// Retained NSString path of the per-model temporary directory.
void *tmpDir;
} Kern;
// Output of run_layer_stress_test().
typedef struct {
// Square matmul dimension (dim x dim weights).
int dimension;
// Number of layers pushed through the single compiled kernel.
int num_layers;
// Wall time for the whole layer loop, in milliseconds.
double total_pipeline_ms;
// total_pipeline_ms divided by num_layers.
double per_layer_ms;
// Difference between the total loop time and the sum of per-layer timings.
double context_switch_overhead_us;
// Total FLOPs of all layers divided by total loop time, in GFLOPS.
double cumulative_gflops;
// Size of one layer's fp32 weight matrix in (decimal) megabytes.
double weight_tensor_mb;
// false when kernel compilation failed.
bool success;
} LayerStressResult;
// One row of run_long_sequence_sweep() output (one sequence length).
typedef struct {
// Matmul dimension used for both input and output channels.
int dimension;
// Sequence length evaluated for this row.
int sequence_length;
// Mean time of one evaluation, in milliseconds.
double eval_ms;
// 2*seq*dim*dim FLOPs divided by eval time, in GFLOPS.
double gflops;
// (input + output + weight bytes) divided by eval time, in GB/s.
double bandwidth_gbps;
// gflops relative to the baseline row (baseline itself is 1.0).
double scaling;
// false when kernel compilation failed for this sequence length.
bool success;
} SequenceSweepResult;
// Output of run_training_simulator().
typedef struct {
// Matmul dimension.
int dimension;
// Number of simulated layers.
int num_layers;
// Tokens per simulated step.
int sequence_length;
// Total time spent copying weights into the weight surface, in milliseconds.
double weight_update_ms;
// Total time spent in ANE evaluation, in milliseconds.
double forward_pass_ms;
// weight_update_ms + forward_pass_ms.
double total_step_ms;
// sequence_length divided by the total step time in seconds.
double tokens_per_second;
// weight_update_ms / forward_pass_ms.
double memory_io_ratio;
// forward_pass_ms / total_step_ms.
double compute_ratio;
// false when kernel compilation failed.
bool success;
} TrainingSimResult;
// Typed function-pointer signatures used to cast objc_msgSend for each private
// ANE selector invoked in this file. The first two parameters are always the
// receiver and the selector.

// +modelWithMILText:weights:optionsPlist:
typedef id (*MakeDescriptorFunc)(Class, SEL, id, id, id);
// +inMemoryModelWithDescriptor:
typedef id (*MakeModelFunc)(Class, SEL, id);
// -compileWithQoS:options:error:
typedef BOOL (*CompileModelFunc)(id, SEL, unsigned int, id, id*);
// -loadWithQoS:options:error:
typedef BOOL (*LoadModelFunc)(id, SEL, unsigned int, id, id*);
// -unloadWithQoS:error:
typedef BOOL (*UnloadModelFunc)(id, SEL, unsigned int, id*);
// -evaluateWithQoS:options:request:error:
typedef BOOL (*EvaluateModelFunc)(id, SEL, unsigned int, id, id, id*);
// +objectWithIOSurface:
typedef id (*MakeAIOFunc)(Class, SEL, IOSurfaceRef);
// +requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:
typedef id (*MakeRequestFunc)(Class, SEL, id, id, id, id, id, id, id);
// One-time setup: captures the Mach timebase, dlopen()s the private ANE
// framework and resolves the four classes this suite talks to.
// On any failure an error is written to stderr and the function returns with
// the "initialized" latch still clear, so a later call will retry.
static void suite_ane_init(void) {
    static bool initialized = false;
    if (initialized) {
        return;
    }

    mach_timebase_info(&g_tb);

    void *framework = dlopen([ANE_FRAMEWORK_PATH UTF8String], RTLD_NOW);
    if (framework == NULL) {
        fprintf(stderr, "ERROR: Failed to load AppleNeuralEngine framework: %s\n", dlerror());
        return;
    }

    g_D   = NSClassFromString(@"_ANEInMemoryModelDescriptor");
    g_I   = NSClassFromString(@"_ANEInMemoryModel");
    g_AR  = NSClassFromString(@"_ANERequest");
    g_AIO = NSClassFromString(@"_ANEIOSurfaceObject");

    const bool have_all_classes = (g_D && g_I && g_AR && g_AIO);
    if (!have_all_classes) {
        fprintf(stderr, "ERROR: Failed to load ANE classes\n");
        return;
    }

    initialized = true;
    printf("ANE framework loaded successfully\n");
}
// Converts a mach_absolute_time() tick delta to milliseconds using g_tb.
static double tb_ms(uint64_t t) {
    const double nanos = (double)t * g_tb.numer / g_tb.denom;
    return nanos / NANOSECONDS_PER_MILLISECOND;
}
// Converts a mach_absolute_time() tick delta to microseconds using g_tb.
static double tb_us(uint64_t t) {
    const double nanos = (double)t * g_tb.numer / g_tb.denom;
    return nanos / NANOSECONDS_PER_MICROSECOND;
}
// Converts a mach_absolute_time() tick delta to seconds using g_tb.
static double tb_s(uint64_t t) {
    const double nanos = (double)t * g_tb.numer / g_tb.denom;
    return nanos / NANOSECONDS_PER_SECOND;
}
// Creates a one-row, byte-addressed IOSurface large enough to hold `bytes`.
//
// The size is rounded up to a multiple of IOSURFACE_ALIGNMENT_BYTES and, to
// stay consistent with make_weights_surface(), clamped to at least one
// alignment unit so that a zero-byte request does not describe an empty surface.
//
// Returns a +1 retained IOSurfaceRef (caller must CFRelease), or NULL if
// IOSurfaceCreate fails.
static IOSurfaceRef make_surface(size_t bytes) {
    size_t aligned = ((bytes + (IOSURFACE_ALIGNMENT_BYTES - 1)) / IOSURFACE_ALIGNMENT_BYTES) * IOSURFACE_ALIGNMENT_BYTES;
    if (aligned < IOSURFACE_ALIGNMENT_BYTES) {
        aligned = IOSURFACE_ALIGNMENT_BYTES;
    }
    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
        (__bridge id)kIOSurfaceWidth: @(aligned),
        (__bridge id)kIOSurfaceHeight: @1,
        (__bridge id)kIOSurfaceBytesPerElement: @1,
        (__bridge id)kIOSurfaceBytesPerRow: @(aligned),
        (__bridge id)kIOSurfaceAllocSize: @(aligned),
        (__bridge id)kIOSurfacePixelFormat: @0
    });
}
// Creates the one-row IOSurface used as the second (weights) input.
// Same layout as make_surface(), but never smaller than one alignment unit and
// additionally flagged with kIOSurfaceIsGlobal.
// Returns a +1 retained IOSurfaceRef, or NULL if IOSurfaceCreate fails.
static IOSurfaceRef make_weights_surface(size_t bytes) {
    const size_t unit = IOSURFACE_ALIGNMENT_BYTES;
    size_t rounded = ((bytes + (unit - 1)) / unit) * unit;
    if (rounded < unit) {
        rounded = unit;
    }

    NSNumber *extent = @(rounded);
    NSDictionary *description = @{
        (__bridge id)kIOSurfaceWidth: extent,
        (__bridge id)kIOSurfaceHeight: @1,
        (__bridge id)kIOSurfaceBytesPerElement: @1,
        (__bridge id)kIOSurfaceBytesPerRow: extent,
        (__bridge id)kIOSurfaceAllocSize: extent,
        (__bridge id)kIOSurfacePixelFormat: @0,
        (__bridge id)kIOSurfaceIsGlobal: @YES
    };
    return IOSurfaceCreate((__bridge CFDictionaryRef)description);
}
// Generates MIL 1.3 / ios17 text for a matmul whose activations and weights are
// packed into ONE fp32 input tensor of shape [1, ic, 1, seq + oc]:
//   columns [0, seq)        -> activations
//   columns [seq, seq + oc) -> weights
// The program casts to fp16, slices the two regions apart, computes
// act^T x W, and returns an fp32 tensor of shape [1, oc, 1, seq].
static NSString *gen_packed_matmul_mil_v1_3(int ic, int oc, int seq) {
    const int packed_width = seq + oc;
    NSMutableString *src = [NSMutableString string];

    // Program header with build metadata.
    [src appendString:@"program(1.3)\n"
        "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
        "{\"coremltools-version\", \"9.0\"}})]\n{\n"];

    // Entry point and fp32 -> fp16 cast of the packed input.
    [src appendFormat:@" func main<ios17>(tensor<fp32, [1, %d, 1, %d]> x) {\n", ic, packed_width];
    [src appendString:@" string to16 = const()[name = string(\"to16\"), val = string(\"fp16\")];\n"];
    [src appendFormat:@" tensor<fp16, [1, %d, 1, %d]> xh = cast(dtype = to16, x = x)[name = string(\"cin\")];\n", ic, packed_width];

    // Slice out the activation columns.
    [src appendString:@" tensor<int32, [4]> ba = const()[name = string(\"ba\"), val = tensor<int32, [4]>([0,0,0,0])];\n"];
    [src appendFormat:@" tensor<int32, [4]> sa = const()[name = string(\"sa\"), val = tensor<int32, [4]>([1,%d,1,%d])];\n", ic, seq];
    [src appendFormat:@" tensor<fp16, [1,%d,1,%d]> act = slice_by_size(x=xh,begin=ba,size=sa)[name=string(\"act\")];\n", ic, seq];

    // Slice out the weight columns, which start at offset `seq`.
    [src appendFormat:@" tensor<int32, [4]> bw = const()[name = string(\"bw\"), val = tensor<int32, [4]>([0,0,0,%d])];\n", seq];
    [src appendFormat:@" tensor<int32, [4]> sw = const()[name = string(\"sw\"), val = tensor<int32, [4]>([1,%d,1,%d])];\n", ic, oc];
    [src appendFormat:@" tensor<fp16, [1,%d,1,%d]> wt = slice_by_size(x=xh,begin=bw,size=sw)[name=string(\"wt\")];\n", ic, oc];

    // Reshape + transpose activations to [1,1,seq,ic].
    [src appendFormat:@" tensor<int32, [4]> ra = const()[name = string(\"ra\"), val = tensor<int32, [4]>([1,1,%d,%d])];\n", ic, seq];
    [src appendFormat:@" tensor<fp16, [1,1,%d,%d]> a2 = reshape(shape=ra,x=act)[name=string(\"a2\")];\n", ic, seq];
    [src appendString:@" tensor<int32, [4]> pm = const()[name = string(\"pm\"), val = tensor<int32, [4]>([0,1,3,2])];\n"];
    [src appendFormat:@" tensor<fp16, [1,1,%d,%d]> a3 = transpose(perm=pm,x=a2)[name=string(\"a3\")];\n", seq, ic];

    // Reshape weights to [1,1,ic,oc].
    [src appendFormat:@" tensor<int32, [4]> rw = const()[name = string(\"rw\"), val = tensor<int32, [4]>([1,1,%d,%d])];\n", ic, oc];
    [src appendFormat:@" tensor<fp16, [1,1,%d,%d]> W = reshape(shape=rw,x=wt)[name=string(\"W\")];\n", ic, oc];

    // Matmul, then transpose/reshape back to the [1,oc,1,seq] output layout.
    [src appendString:@" bool bF = const()[name = string(\"bF\"), val = bool(false)];\n"];
    [src appendFormat:@" tensor<fp16, [1,1,%d,%d]> yh = matmul(transpose_x=bF,transpose_y=bF,x=a3,y=W)[name=string(\"mm\")];\n", seq, oc];
    [src appendFormat:@" tensor<fp16, [1,1,%d,%d]> yt = transpose(perm=pm,x=yh)[name=string(\"yt\")];\n", oc, seq];
    [src appendFormat:@" tensor<int32, [4]> ro = const()[name = string(\"ro\"), val = tensor<int32, [4]>([1,%d,1,%d])];\n", oc, seq];
    [src appendFormat:@" tensor<fp16, [1,%d,1,%d]> yr = reshape(shape=ro,x=yt)[name=string(\"yr\")];\n", oc, seq];

    // Cast the result back to fp32 and close the program.
    [src appendString:@" string to32 = const()[name = string(\"to32\"), val = string(\"fp32\")];\n"];
    [src appendFormat:@" tensor<fp32, [1,%d,1,%d]> y = cast(dtype = to32, x = yr)[name = string(\"cout\")];\n", oc, seq];
    [src appendString:@" } -> (y);\n}\n"];
    return src;
}
// "MIL 1.5" variant of the packed matmul generator.
// MIL 1.5/ios18 is not supported by the ANE compiler, so this simply
// delegates to the 1.3/ios17 generator and returns its output unchanged.
static NSString *gen_packed_matmul_mil_v1_5(int ic, int oc, int seq) {
    NSString *fallback = gen_packed_matmul_mil_v1_3(ic, oc, seq);
    return fallback;
}
// Generates MIL 1.3 / ios17 text for a matmul with TWO fp32 inputs:
//   x       : [1, 1, seq, ic]
//   weights : [1, 1, ic, oc]
// Both are cast to fp16, multiplied, and the [1, 1, seq, oc] product is cast
// back to fp32 and returned as `y`.
static NSString *gen_dynamic_matmul_mil(int ic, int oc, int seq) {
    NSMutableString *src = [NSMutableString string];

    // Program header with build metadata.
    [src appendString:@"program(1.3)\n"
        "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
        "{\"coremltools-version\", \"9.0\"}})]\n"
        "{\n"];

    // Entry point taking activations and weights as separate inputs.
    [src appendFormat:@" func main<ios17>(tensor<fp32, [1, 1, %d, %d]> x, tensor<fp32, [1, 1, %d, %d]> weights) {\n",
        seq, ic, ic, oc];

    // fp32 -> fp16 casts of both inputs.
    [src appendString:@" string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n"];
    [src appendFormat:@" tensor<fp16, [1, 1, %d, %d]> x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_x\")];\n",
        seq, ic];
    [src appendFormat:@" tensor<fp16, [1, 1, %d, %d]> w16 = cast(dtype = to_fp16, x = weights)[name = string(\"cast_w\")];\n",
        ic, oc];

    // Plain (non-transposed) matmul.
    [src appendString:@" bool tx = const()[name = string(\"tx\"), val = bool(false)];\n"];
    [src appendString:@" bool ty = const()[name = string(\"ty\"), val = bool(false)];\n"];
    [src appendFormat:@" tensor<fp16, [1, 1, %d, %d]> y16 = matmul(transpose_x = tx, transpose_y = ty, x = x16, y = w16)[name = string(\"matmul\")];\n",
        seq, oc];

    // fp16 -> fp32 cast of the result, then close the program.
    [src appendString:@" string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"];
    [src appendFormat:@" tensor<fp32, [1, 1, %d, %d]> y = cast(dtype = to_fp32, x = y16)[name = string(\"cast_out\")];\n",
        seq, oc];
    [src appendString:@" } -> (y);\n"
        "}\n"];

    return [NSString stringWithString:src];
}
// Compiles `mil` into an in-memory ANE model, loads it, and builds the
// IOSurfaces and request needed to evaluate it.
//
// Parameters:
//   mil          - MIL program text.
//   in_bytes     - size of the first input surface.
//   out_bytes    - size of the output surface.
//   weight_bytes - size of an optional second input surface; 0 means the
//                  kernel has a single input and k->ioWeights stays NULL.
//
// Returns a heap-allocated Kern that must be destroyed with free_kern(), or
// NULL on any failure. On failure every partially acquired resource (loaded
// model, surfaces, temp directory) is released before returning, so the caller
// never sees a half-initialised Kern.
static Kern *compile_kern_mil(NSString *mil, size_t in_bytes, size_t out_bytes, size_t weight_bytes) {
    @autoreleasepool {
        NSFileManager *fm = [NSFileManager defaultManager];
        NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];

        MakeDescriptorFunc makeDesc = (MakeDescriptorFunc)objc_msgSend;
        id desc = makeDesc(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, @{}, nil);
        if (!desc) {
            fprintf(stderr, " [compile] desc=NULL\n");
            return NULL;
        }

        MakeModelFunc makeModel = (MakeModelFunc)objc_msgSend;
        id mdl = makeModel(g_I, @selector(inMemoryModelWithDescriptor:), desc);
        if (!mdl) {
            fprintf(stderr, " [compile] model=NULL\n");
            return NULL;
        }

        // The identifier names the temp directory; a nil value would make
        // -stringByAppendingPathComponent: raise.
        id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
        if (!hx) {
            fprintf(stderr, " [compile] hexStringIdentifier=NULL\n");
            return NULL;
        }

        NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
        NSString *weightsDir = [td stringByAppendingPathComponent:@"weights"];
        NSString *modelPath = [td stringByAppendingPathComponent:@"model.mil"];

        // Staging the files is best-effort (the compile step reports the real
        // failure), but say so instead of failing silently.
        NSError *e = nil;
        if (![fm createDirectoryAtPath:weightsDir withIntermediateDirectories:YES attributes:nil error:&e]) {
            fprintf(stderr, " [compile] WARN: could not create %s: %s\n",
                    [weightsDir UTF8String], e ? [[e description] UTF8String] : "no error");
        }
        if (![md writeToFile:modelPath atomically:YES]) {
            fprintf(stderr, " [compile] WARN: could not write %s\n", [modelPath UTF8String]);
        }

        e = nil;
        CompileModelFunc compileModel = (CompileModelFunc)objc_msgSend;
        if (!compileModel(mdl, @selector(compileWithQoS:options:error:), ANE_QOS_CLASS, @{}, &e)) {
            fprintf(stderr, " [compile] FAIL: %s\n", e ? [[e description] UTF8String] : "no error");
            [fm removeItemAtPath:td error:nil];
            return NULL;
        }

        e = nil;
        LoadModelFunc loadModel = (LoadModelFunc)objc_msgSend;
        if (!loadModel(mdl, @selector(loadWithQoS:options:error:), ANE_QOS_CLASS, @{}, &e)) {
            fprintf(stderr, " [compile] load FAIL: %s\n", e ? [[e description] UTF8String] : "no error");
            [fm removeItemAtPath:td error:nil];
            return NULL;
        }

        // From here on the model is loaded; any failure must unload it again.
        Kern *k = (Kern*)calloc(1, sizeof(Kern));
        BOOL ok = (k != NULL);
        if (!ok) {
            fprintf(stderr, " [compile] out of memory\n");
        }

        if (ok) {
            k->ioIn = make_surface(in_bytes);
            k->ioOut = make_surface(out_bytes);
            if (weight_bytes > 0) {
                k->ioWeights = make_weights_surface(weight_bytes);
            }
            ok = (k->ioIn != NULL) && (k->ioOut != NULL) && (weight_bytes == 0 || k->ioWeights != NULL);
            if (!ok) {
                fprintf(stderr, " [compile] IOSurface allocation FAIL\n");
            }
        }

        if (ok) {
            MakeAIOFunc makeAIO = (MakeAIOFunc)objc_msgSend;
            id wI = makeAIO(g_AIO, @selector(objectWithIOSurface:), k->ioIn);
            id wO = makeAIO(g_AIO, @selector(objectWithIOSurface:), k->ioOut);
            id wW = k->ioWeights ? makeAIO(g_AIO, @selector(objectWithIOSurface:), k->ioWeights) : nil;

            // Array literals raise on nil elements, so validate the wrappers first.
            id req = nil;
            if (wI && wO && (!k->ioWeights || wW)) {
                NSArray *inputs = wW ? @[wI, wW] : @[wI];
                NSArray *inputIndices = wW ? @[@0, @1] : @[@0];
                MakeRequestFunc makeReq = (MakeRequestFunc)objc_msgSend;
                req = makeReq(g_AR,
                    @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
                    inputs, inputIndices, @[wO], @[@0], nil, nil, @0);
            }
            if (req) {
                k->request = (void*)CFBridgingRetain(req);
            } else {
                ok = NO;
                fprintf(stderr, " [compile] request creation FAIL\n");
            }
        }

        if (!ok) {
            UnloadModelFunc unloadModel = (UnloadModelFunc)objc_msgSend;
            NSError *ue = nil;
            unloadModel(mdl, @selector(unloadWithQoS:error:), ANE_QOS_CLASS, &ue);
            if (k) {
                if (k->ioIn) CFRelease(k->ioIn);
                if (k->ioOut) CFRelease(k->ioOut);
                if (k->ioWeights) CFRelease(k->ioWeights);
                free(k);
            }
            [fm removeItemAtPath:td error:nil];
            return NULL;
        }

        k->model = (void*)CFBridgingRetain(mdl);
        k->tmpDir = (void*)CFBridgingRetain(td);
        return k;
    }
}
// Unloads the model and releases everything owned by `k`, then frees `k`.
// Safe to call with NULL. Each field is checked before release because
// CFRelease(NULL) crashes and a Kern may legitimately have NULL members
// (ioWeights is optional; request is NULL if request creation returned nil).
static void free_kern(Kern *k) {
    if (!k) return;

    if (k->model) {
        id mdl = (__bridge id)k->model;
        NSError *e = nil;
        UnloadModelFunc unloadModel = (UnloadModelFunc)objc_msgSend;
        if (!unloadModel(mdl, @selector(unloadWithQoS:error:), ANE_QOS_CLASS, &e)) {
            // Teardown continues regardless; just make the failure visible.
            fprintf(stderr, " [free] unload FAIL: %s\n", e ? [[e description] UTF8String] : "no error");
        }
    }

    if (k->ioIn) CFRelease(k->ioIn);
    if (k->ioOut) CFRelease(k->ioOut);
    if (k->ioWeights) CFRelease(k->ioWeights);

    if (k->tmpDir) {
        [[NSFileManager defaultManager] removeItemAtPath:(__bridge NSString *)k->tmpDir error:nil];
    }

    if (k->model) CFRelease(k->model);
    if (k->request) CFRelease(k->request);
    if (k->tmpDir) CFRelease(k->tmpDir);
    free(k);
}
// Runs one evaluation of the kernel's request on its model, then takes and
// drops a read-only lock on the output surface.
// NOTE(review): the lock/unlock pair presumably exists to make the output
// visible to the CPU before the caller stops its timer — confirm.
//
// The evaluation result used to be discarded, which let a failing kernel be
// timed as if it had done the work; failures are now reported on stderr.
static void suite_ane_eval_sync(Kern *k) {
    id mdl = (__bridge id)k->model;
    id req = (__bridge id)k->request;
    NSError *e = nil;
    EvaluateModelFunc evalModel = (EvaluateModelFunc)objc_msgSend;
    BOOL evaluated = evalModel(mdl, @selector(evaluateWithQoS:options:request:error:), ANE_QOS_CLASS, @{}, req, &e);
    if (!evaluated) {
        fprintf(stderr, " [eval] FAIL: %s\n", e ? [[e description] UTF8String] : "no error");
    }
    IOSurfaceLock(k->ioOut, IOSURFACE_LOCK_READ_ONLY, NULL);
    IOSurfaceUnlock(k->ioOut, IOSURFACE_LOCK_READ_ONLY, NULL);
}
// Returns the running OS version formatted as "major.minor.patch".
static NSString *get_macos_version(void) {
    NSOperatingSystemVersion os = [[NSProcessInfo processInfo] operatingSystemVersion];
    long major = (long)os.majorVersion;
    long minor = (long)os.minorVersion;
    long patch = (long)os.patchVersion;
    return [NSString stringWithFormat:@"%ld.%ld.%ld", major, minor, patch];
}
// Prints the suite banner: chip name, OS version, MIL version/target and the
// ANE QoS value, framed in a double-line box.
static void print_header(const char *chip_name, const char *mil_version, const char *ios_target) {
    NSString *os_version = get_macos_version();
    const char *os_cstr = "Unknown";
    if (os_version) {
        os_cstr = [os_version UTF8String];
    }

    fputs("\n", stdout);
    fputs("╔══════════════════════════════════════════════════════════════════════════════╗\n", stdout);
    fputs("║ M5 ANE Pipeline Benchmark Suite ║\n", stdout);
    fputs("╠══════════════════════════════════════════════════════════════════════════════╣\n", stdout);
    printf("║ Hardware: Apple %-4s ║\n", chip_name);
    printf("║ macOS: %-10s ║\n", os_cstr);
    printf("║ MIL Version: %-4s (%-6s target) ║\n", mil_version, ios_target);
    printf("║ ANE QoS: %d ║\n", ANE_QOS_CLASS);
    fputs("╚══════════════════════════════════════════════════════════════════════════════╝\n", stdout);
    fputs("\n", stdout);
}
// Prints `title` left-aligned (padded to 76 columns) inside a single-line box,
// preceded by a blank line.
static void print_section_header(const char *title) {
    fputs("\n", stdout);
    fputs("┌──────────────────────────────────────────────────────────────────────────────┐\n", stdout);
    printf("│ %-76s│\n", title);
    fputs("└──────────────────────────────────────────────────────────────────────────────┘\n", stdout);
}
// BENCHMARK 1: pushes `num_layers` different dim x dim weight matrices through
// one compiled packed-matmul kernel (sequence length STRESS_TEST_SEQ) and
// reports total latency, per-layer latency, untimed overhead and throughput.
//
// Parameters:
//   dim        - square matmul dimension; must be > 0.
//   num_layers - number of weight sets to run; must be > 0.
//   is_m5      - selects the "v1.5" generator (which currently falls back to 1.3).
//   result     - filled in on return; result->success is false when arguments
//                are invalid, memory allocation fails, or compilation fails.
//
// Input layout: the kernel's single input is [dim rows] x [STRESS_TEST_SEQ + dim
// columns] of fp32; each row holds the activation columns followed by that
// row's weights, so per-layer weights are copied row by row at a column offset.
static void run_layer_stress_test(int dim, int num_layers, bool is_m5, LayerStressResult *result) {
    printf("\n");
    printf("┌──────────────────────────────────────────────────────────────────────────────┐\n");
    printf("│ BENCHMARK 1: %d-Layer Stress Test │\n", num_layers);
    printf("├──────────────────────────────────────────────────────────────────────────────┤\n");
    printf("│ Configuration: │\n");
    printf("│ Dimension: %d x %d │\n", dim, dim);
    printf("│ Layers: %d │\n", num_layers);
    printf("│ Sequence: %d │\n", STRESS_TEST_SEQ);
    printf("├──────────────────────────────────────────────────────────────────────────────┤\n");

    memset(result, 0, sizeof(LayerStressResult));
    result->dimension = dim;
    result->num_layers = num_layers;
    result->weight_tensor_mb = (double)dim * dim * sizeof(float) / BYTES_PER_MEGABYTE;

    if (dim <= 0 || num_layers <= 0) {
        fprintf(stderr, "ERROR: stress test requires dim > 0 and num_layers > 0\n");
        printf("└──────────────────────────────────────────────────────────────────────────────┘\n");
        return;
    }

    const int sp_total = STRESS_TEST_SEQ + dim;
    // Element counts are computed in size_t so large dims cannot overflow int.
    const size_t weight_elems = (size_t)dim * (size_t)dim;
    const size_t in_bytes = (size_t)dim * (size_t)sp_total * sizeof(float);
    const size_t in_elems = in_bytes / sizeof(float);
    const size_t out_bytes = (size_t)dim * STRESS_TEST_SEQ * sizeof(float);
    const size_t weight_bytes = 0; // weights travel inside the packed input

    NSString *mil = is_m5 ? gen_packed_matmul_mil_v1_5(dim, dim, STRESS_TEST_SEQ)
                          : gen_packed_matmul_mil_v1_3(dim, dim, STRESS_TEST_SEQ);

    printf("│ [Compiling MIL program...] │\n");
    uint64_t t0 = mach_absolute_time();
    Kern *k = compile_kern_mil(mil, in_bytes, out_bytes, weight_bytes);
    double compile_ms = tb_ms(mach_absolute_time() - t0);
    if (!k) {
        printf("│ ✗ Compilation FAILED │\n");
        printf("└──────────────────────────────────────────────────────────────────────────────┘\n");
        result->success = false;
        return;
    }
    printf("│ ✓ Compiled in %.1f ms │\n", compile_ms);
    printf("│ ✓ Weight tensor: %.2f MB per layer │\n", result->weight_tensor_mb);

    // Allocate every host buffer up front so one failure path cleans up all.
    float **weight_sets = (float**)calloc((size_t)num_layers, sizeof(float*));
    float *input_data = (float*)calloc(in_elems, sizeof(float));
    uint64_t *layer_times = (uint64_t*)calloc((size_t)num_layers, sizeof(uint64_t));
    bool alloc_ok = (weight_sets != NULL) && (input_data != NULL) && (layer_times != NULL);
    for (int layer = 0; alloc_ok && layer < num_layers; layer++) {
        weight_sets[layer] = (float*)calloc(weight_elems, sizeof(float));
        if (!weight_sets[layer]) {
            alloc_ok = false;
            break;
        }
        for (size_t i = 0; i < weight_elems; i++) {
            weight_sets[layer][i] = ((float)arc4random() / UINT32_MAX - 0.5f) * 0.01f;
        }
    }
    if (!alloc_ok) {
        fprintf(stderr, "ERROR: out of memory allocating stress-test buffers\n");
        printf("└──────────────────────────────────────────────────────────────────────────────┘\n");
        if (weight_sets) {
            for (int layer = 0; layer < num_layers; layer++) {
                free(weight_sets[layer]);
            }
        }
        free(weight_sets);
        free(input_data);
        free(layer_times);
        free_kern(k);
        return;
    }

    for (size_t i = 0; i < in_elems; i++) {
        input_data[i] = ((float)arc4random() / UINT32_MAX - 0.5f) * 0.1f;
    }
    IOSurfaceLock(k->ioIn, IOSURFACE_LOCK_DEFAULT, NULL);
    memcpy(IOSurfaceGetBaseAddress(k->ioIn), input_data, in_bytes);
    IOSurfaceUnlock(k->ioIn, IOSURFACE_LOCK_DEFAULT, NULL);

    printf("│ [Warming up...] │\n");
    for (uint32_t i = 0; i < WARMUP_ITERATIONS; i++) {
        suite_ane_eval_sync(k);
    }

    printf("│ [Running %d-layer pipeline...] │\n", num_layers);
    uint64_t total_start = mach_absolute_time();
    for (int layer = 0; layer < num_layers; layer++) {
        uint64_t layer_start = mach_absolute_time();
        IOSurfaceLock(k->ioIn, IOSURFACE_LOCK_DEFAULT, NULL);
        float *buf = (float*)IOSurfaceGetBaseAddress(k->ioIn);
        // Overwrite only the weight columns of each row; activations stay put.
        for (int d = 0; d < dim; d++) {
            memcpy(buf + (size_t)d * sp_total + STRESS_TEST_SEQ,
                   weight_sets[layer] + (size_t)d * dim,
                   (size_t)dim * sizeof(float));
        }
        IOSurfaceUnlock(k->ioIn, IOSURFACE_LOCK_DEFAULT, NULL);
        suite_ane_eval_sync(k);
        layer_times[layer] = mach_absolute_time() - layer_start;
    }
    uint64_t total_end = mach_absolute_time();

    double total_ms = tb_ms(total_end - total_start);
    double per_layer_ms = total_ms / num_layers;
    long long flops_per_layer_ll = 2LL * (long long)STRESS_TEST_SEQ * (long long)dim * (long long)dim;
    long long total_flops_ll = flops_per_layer_ll * (long long)num_layers;
    double total_time_seconds = tb_s(total_end - total_start);
    double total_gflops = (double)total_flops_ll / (total_time_seconds * 1e9);
    double per_layer_time_seconds = per_layer_ms / 1000.0;
    double per_layer_gflops = (double)flops_per_layer_ll / (per_layer_time_seconds * 1e9);

    double sum_layer_ms = 0;
    for (int layer = 0; layer < num_layers; layer++) {
        sum_layer_ms += tb_ms(layer_times[layer]);
    }
    // Milliseconds -> microseconds is a factor of 1000. The previous expression
    // (ms * 1e3 / 1e6) produced seconds while the value is printed as µs.
    const double us_per_ms = NANOSECONDS_PER_MILLISECOND / NANOSECONDS_PER_MICROSECOND;
    double context_overhead_us = (total_ms - sum_layer_ms) * us_per_ms;

    result->total_pipeline_ms = total_ms;
    result->per_layer_ms = per_layer_ms;
    result->context_switch_overhead_us = context_overhead_us;
    result->cumulative_gflops = total_gflops;
    result->success = true;

    printf("├──────────────────────────────────────────────────────────────────────────────┤\n");
    printf("│ Results: │\n");
    printf("│ Total Pipeline Latency: %8.2f ms │\n", total_ms);
    printf("│ Per-Layer Average: %8.3f ms │\n", per_layer_ms);
    printf("│ Context Switch Overhead: %8.3f µs │\n", context_overhead_us);
    printf("│ Per-Layer Performance: %8.2f GFLOPS │\n", per_layer_gflops);
    if (total_gflops < 1.0) {
        printf("│ Total Pipeline Throughput: %8.4f GFLOPS │\n", total_gflops);
    } else if (total_gflops < 100.0) {
        printf("│ Total Pipeline Throughput: %8.2f GFLOPS │\n", total_gflops);
    } else {
        // Convert here so exactly 100 GFLOPS prints 0.1 TFLOPS rather than 0.
        printf("│ Total Pipeline Throughput: %8.4f TFLOPS │\n", total_gflops / 1000.0);
    }
    printf("│ Weight Tensor Size: %8.2f MB per layer │\n", result->weight_tensor_mb);
    printf("└──────────────────────────────────────────────────────────────────────────────┘\n");

    for (int layer = 0; layer < num_layers; layer++) {
        free(weight_sets[layer]);
    }
    free(weight_sets);
    free(input_data);
    free(layer_times);
    free_kern(k);
}
// BENCHMARK 2: for each sequence length in `seq_values`, compiles a two-input
// matmul kernel ([seq x dim] times [dim x dim]), times BENCHMARK_ITERATIONS
// evaluations after warm-up, and prints one table row per length.
//
// Parameters:
//   dim        - matmul dimension used for both input and output channels.
//   seq_values - array of `num_seq` sequence lengths to test.
//   num_seq    - number of entries in seq_values and results.
//   results    - output array; results[i].success is false when compilation or
//                buffer allocation failed for seq_values[i].
//
// Scaling is reported relative to the FIRST SUCCESSFUL row. Previously the
// baseline was only taken from row 0, so a failure there left it at zero and
// every later row divided by zero.
static void run_long_sequence_sweep(int dim, const int *seq_values, int num_seq, SequenceSweepResult *results) {
    printf("\n");
    printf("┌──────────────────────────────────────────────────────────────────────────────┐\n");
    printf("│ BENCHMARK 2: Long-Sequence Sweep │\n");
    printf("├──────────────────────────────────────────────────────────────────────────────┤\n");
    printf("│ Configuration: dim=%d │\n", dim);
    printf("├──────────────────────────────────────────────────────────────────────────────┤\n");
    printf("│ SEQ │ Eval Time (ms) │ GFLOPS* │ Bandwidth (GB/s)* │ Scaling │\n");
    printf("├─────────┼──────────────────┼──────────┼────────────────────┼────────────────┤\n");

    double base_gflops = 0;
    int base_index = -1; // index of the first successful row, -1 if none yet

    for (int i = 0; i < num_seq; i++) {
        int seq = seq_values[i];
        memset(&results[i], 0, sizeof(SequenceSweepResult));
        results[i].dimension = dim;
        results[i].sequence_length = seq;

        size_t in_bytes = (size_t)seq * dim * sizeof(float);
        size_t weight_bytes = (size_t)dim * dim * sizeof(float);
        size_t out_bytes = (size_t)seq * dim * sizeof(float);

        NSString *mil = gen_dynamic_matmul_mil(dim, dim, seq);
        Kern *k = compile_kern_mil(mil, in_bytes, out_bytes, weight_bytes);
        if (!k) {
            printf("│ %5d │ COMPILATION FAILED │\n", seq);
            results[i].success = false;
            continue;
        }

        float *input_data = (float*)calloc(in_bytes / sizeof(float), sizeof(float));
        float *weight_data = (float*)calloc(weight_bytes / sizeof(float), sizeof(float));
        if (!input_data || !weight_data) {
            fprintf(stderr, "ERROR: out of memory allocating sweep buffers for seq=%d\n", seq);
            free(input_data);
            free(weight_data);
            free_kern(k);
            results[i].success = false;
            continue;
        }
        for (size_t j = 0; j < in_bytes / sizeof(float); j++) {
            input_data[j] = ((float)arc4random() / UINT32_MAX - 0.5f) * 0.1f;
        }
        for (size_t j = 0; j < weight_bytes / sizeof(float); j++) {
            weight_data[j] = ((float)arc4random() / UINT32_MAX - 0.5f) * 0.01f;
        }

        IOSurfaceLock(k->ioIn, IOSURFACE_LOCK_DEFAULT, NULL);
        memcpy(IOSurfaceGetBaseAddress(k->ioIn), input_data, in_bytes);
        IOSurfaceUnlock(k->ioIn, IOSURFACE_LOCK_DEFAULT, NULL);
        IOSurfaceLock(k->ioWeights, IOSURFACE_LOCK_DEFAULT, NULL);
        memcpy(IOSurfaceGetBaseAddress(k->ioWeights), weight_data, weight_bytes);
        IOSurfaceUnlock(k->ioWeights, IOSURFACE_LOCK_DEFAULT, NULL);

        for (uint32_t w = 0; w < WARMUP_ITERATIONS; w++) {
            suite_ane_eval_sync(k);
        }
        uint64_t t0 = mach_absolute_time();
        for (uint32_t iter = 0; iter < BENCHMARK_ITERATIONS; iter++) {
            suite_ane_eval_sync(k);
        }
        double eval_ms = tb_ms(mach_absolute_time() - t0) / BENCHMARK_ITERATIONS;

        long long flops_ll = 2LL * (long long)seq * (long long)dim * (long long)dim;
        double eval_time_seconds = eval_ms / 1000.0;
        double gflops = (double)flops_ll / (eval_time_seconds * 1e9);
        double total_bytes = (double)in_bytes + (double)out_bytes + (double)weight_bytes;
        double bandwidth = total_bytes / eval_time_seconds / BYTES_PER_GIGABYTE;

        if (base_index < 0) {
            base_index = i;
            base_gflops = gflops;
            results[i].scaling = 1.0;
        } else {
            results[i].scaling = gflops / base_gflops;
        }
        results[i].eval_ms = eval_ms;
        results[i].gflops = gflops;
        results[i].bandwidth_gbps = bandwidth;
        results[i].success = true;

        printf("│ %5d │ %8.3f │ %7.2f* │ %8.2f* │ %5.2fx │\n",
               seq, eval_ms, gflops, bandwidth, results[i].scaling);

        free(input_data);
        free(weight_data);
        free_kern(k);
    }

    printf("├──────────────────────────────────────────────────────────────────────────────┤\n");

    // Flag a >20% drop in relative throughput between consecutive SUCCESSFUL rows.
    bool linear_scaling = true;
    int prev_ok = -1;
    for (int i = 0; i < num_seq; i++) {
        if (!results[i].success) continue;
        if (prev_ok >= 0 && results[i].scaling < results[prev_ok].scaling * 0.8) {
            linear_scaling = false;
            break;
        }
        prev_ok = i;
    }

    // First sequence length whose throughput exceeds 1.5x the baseline row.
    int threshold_seq = -1;
    for (int i = base_index + 1; base_index >= 0 && i < num_seq; i++) {
        if (results[i].success && results[i].gflops > base_gflops * 1.5) {
            threshold_seq = seq_values[i];
            break;
        }
    }

    printf("│ Analysis: TFLOPS scales %-10s with sequence length │\n",
           linear_scaling ? "linearly" : "sub-linearly");
    if (threshold_seq > 0) {
        printf("│ Compute-bound threshold: SEQ >= %-5d │\n", threshold_seq);
    } else {
        printf("│ Compute-bound threshold: Not reached in tested range │\n");
    }
    printf("└──────────────────────────────────────────────────────────────────────────────┘\n");
    printf(" * SRAM: ANE internal cache bandwidth (exceeds system RAM limits)\n");
}
// BENCHMARK 3: simulates one training step over `layers` layers using a single
// compiled two-input matmul kernel. For each layer it times (a) copying that
// layer's weights into the weight surface and (b) one ANE evaluation, then
// reports the split between the two plus derived throughput figures.
//
// Parameters:
//   dim    - matmul dimension; must be > 0.
//   layers - number of simulated layers; must be > 0.
//   seq    - tokens per step; must be > 0.
//   result - filled in on return; result->success is false when arguments are
//            invalid, memory allocation fails, or compilation fails.
static void run_training_simulator(int dim, int layers, int seq, TrainingSimResult *result) {
    printf("\n");
    printf("┌──────────────────────────────────────────────────────────────────────────────┐\n");
    printf("│ BENCHMARK 3: End-to-End Training Throughput Simulator │\n");
    printf("├──────────────────────────────────────────────────────────────────────────────┤\n");
    printf("│ Configuration: │\n");
    printf("│ Dimension: %d │\n", dim);
    printf("│ Layers: %d │\n", layers);
    printf("│ Sequence: %d │\n", seq);
    printf("├──────────────────────────────────────────────────────────────────────────────┤\n");

    memset(result, 0, sizeof(TrainingSimResult));
    result->dimension = dim;
    result->num_layers = layers;
    result->sequence_length = seq;

    if (dim <= 0 || layers <= 0 || seq <= 0) {
        fprintf(stderr, "ERROR: training simulator requires dim, layers and seq > 0\n");
        printf("└──────────────────────────────────────────────────────────────────────────────┘\n");
        return;
    }

    // Element counts are computed in size_t so large dims cannot overflow int.
    const size_t weight_elems = (size_t)dim * (size_t)dim;
    const size_t in_bytes = (size_t)seq * dim * sizeof(float);
    const size_t in_elems = in_bytes / sizeof(float);
    const size_t weight_bytes = weight_elems * sizeof(float);
    const size_t out_bytes = (size_t)seq * dim * sizeof(float);

    NSString *mil = gen_dynamic_matmul_mil(dim, dim, seq);
    printf("│ [Compiling MIL program...] │\n");
    uint64_t t0 = mach_absolute_time();
    Kern *k = compile_kern_mil(mil, in_bytes, out_bytes, weight_bytes);
    double compile_ms = tb_ms(mach_absolute_time() - t0);
    if (!k) {
        printf("│ ✗ Compilation FAILED │\n");
        printf("└──────────────────────────────────────────────────────────────────────────────┘\n");
        result->success = false;
        return;
    }
    printf("│ ✓ Compiled in %.1f ms │\n", compile_ms);

    // Allocate every host buffer up front so one failure path cleans up all.
    float **weight_sets = (float**)calloc((size_t)layers, sizeof(float*));
    float *input_data = (float*)calloc(in_elems, sizeof(float));
    bool alloc_ok = (weight_sets != NULL) && (input_data != NULL);
    for (int layer = 0; alloc_ok && layer < layers; layer++) {
        weight_sets[layer] = (float*)calloc(weight_elems, sizeof(float));
        if (!weight_sets[layer]) {
            alloc_ok = false;
            break;
        }
        for (size_t i = 0; i < weight_elems; i++) {
            weight_sets[layer][i] = ((float)arc4random() / UINT32_MAX - 0.5f) * 0.01f;
        }
    }
    if (!alloc_ok) {
        fprintf(stderr, "ERROR: out of memory allocating training-simulator buffers\n");
        printf("└──────────────────────────────────────────────────────────────────────────────┘\n");
        if (weight_sets) {
            for (int layer = 0; layer < layers; layer++) {
                free(weight_sets[layer]);
            }
        }
        free(weight_sets);
        free(input_data);
        free_kern(k);
        return;
    }

    for (size_t i = 0; i < in_elems; i++) {
        input_data[i] = ((float)arc4random() / UINT32_MAX - 0.5f) * 0.1f;
    }
    IOSurfaceLock(k->ioIn, IOSURFACE_LOCK_DEFAULT, NULL);
    memcpy(IOSurfaceGetBaseAddress(k->ioIn), input_data, in_bytes);
    IOSurfaceUnlock(k->ioIn, IOSURFACE_LOCK_DEFAULT, NULL);

    printf("│ [Warming up...] │\n");
    for (uint32_t i = 0; i < WARMUP_ITERATIONS; i++) {
        IOSurfaceLock(k->ioWeights, IOSURFACE_LOCK_DEFAULT, NULL);
        memcpy(IOSurfaceGetBaseAddress(k->ioWeights), weight_sets[0], weight_bytes);
        IOSurfaceUnlock(k->ioWeights, IOSURFACE_LOCK_DEFAULT, NULL);
        suite_ane_eval_sync(k);
    }

    printf("│ [Simulating %d-layer training step...] │\n", layers);
    double total_update_us = 0;
    double total_forward_us = 0;
    for (int layer = 0; layer < layers; layer++) {
        // Phase 1: host -> weight surface copy ("weight update").
        uint64_t update_start = mach_absolute_time();
        IOSurfaceLock(k->ioWeights, IOSURFACE_LOCK_DEFAULT, NULL);
        memcpy(IOSurfaceGetBaseAddress(k->ioWeights), weight_sets[layer], weight_bytes);
        IOSurfaceUnlock(k->ioWeights, IOSURFACE_LOCK_DEFAULT, NULL);
        uint64_t update_end = mach_absolute_time();
        total_update_us += tb_us(update_end - update_start);

        // Phase 2: one evaluation on the ANE ("forward pass").
        uint64_t forward_start = mach_absolute_time();
        suite_ane_eval_sync(k);
        uint64_t forward_end = mach_absolute_time();
        total_forward_us += tb_us(forward_end - forward_start);
    }

    // Microseconds -> milliseconds.
    double total_update_ms = total_update_us / 1000.0;
    double total_forward_ms = total_forward_us / 1000.0;
    double total_step_ms = total_update_ms + total_forward_ms;
    double total_step_seconds = total_step_ms / 1000.0;
    double tps = (double)seq / total_step_seconds;
    // Guard the ratio so a zero forward time cannot produce inf/NaN.
    double memory_io_ratio = (total_forward_ms > 0.0) ? (total_update_ms / total_forward_ms) : 0.0;
    double compute_ratio = total_forward_ms / total_step_ms;
    double weight_update_bytes = (double)weight_bytes * (double)layers;
    double update_time_seconds = total_update_ms / 1000.0;
    double bandwidth_gbps = weight_update_bytes / update_time_seconds / BYTES_PER_GIGABYTE;
    long long flops_per_layer_ll = 2LL * (long long)seq * (long long)dim * (long long)dim;
    long long total_flops_ll = flops_per_layer_ll * (long long)layers;
    double total_gflops = (double)total_flops_ll / (total_step_seconds * 1e9);
    double per_layer_time_seconds = (total_forward_ms / (double)layers) / 1000.0;
    double per_layer_gflops = (double)flops_per_layer_ll / (per_layer_time_seconds * 1e9);

    result->weight_update_ms = total_update_ms;
    result->forward_pass_ms = total_forward_ms;
    result->total_step_ms = total_step_ms;
    result->tokens_per_second = tps;
    result->memory_io_ratio = memory_io_ratio;
    result->compute_ratio = compute_ratio;
    result->success = true;

    printf("├──────────────────────────────────────────────────────────────────────────────┤\n");
    printf("│ Timing Breakdown: │\n");
    printf("│ Weight Update (Memory I/O): %8.2f ms (%5.1f%%) │\n",
           total_update_ms, (total_update_ms / total_step_ms) * 100);
    printf("│ Forward Pass (ANE Compute): %8.2f ms (%5.1f%%) │\n",
           total_forward_ms, (total_forward_ms / total_step_ms) * 100);
    printf("│ Total Step Time: %8.2f ms │\n", total_step_ms);
    printf("├──────────────────────────────────────────────────────────────────────────────┤\n");
    printf("│ Throughput Metrics: │\n");
    printf("│ Tokens Per Second: %8.2f TPS │\n", tps);
    printf("│ Memory Bandwidth: %8.2f GB/s │\n", bandwidth_gbps);
    printf("│ Per-Layer Compute: %8.2f GFLOPS │\n", per_layer_gflops);
    if (total_gflops < 1.0) {
        printf("│ Total Pipeline Throughput: %8.4f GFLOPS │\n", total_gflops);
    } else if (total_gflops < 100.0) {
        printf("│ Total Pipeline Throughput: %8.2f GFLOPS │\n", total_gflops);
    } else {
        // Convert here so exactly 100 GFLOPS prints 0.1 TFLOPS rather than 0.
        printf("│ Total Pipeline Throughput: %8.4f TFLOPS │\n", total_gflops / 1000.0);
    }
    printf("│ Memory/Compute Ratio: %8.2f (%s) │\n",
           memory_io_ratio, memory_io_ratio > 1.0 ? "I/O bound" : "Compute bound");
    printf("└──────────────────────────────────────────────────────────────────────────────┘\n");

    for (int layer = 0; layer < layers; layer++) {
        free(weight_sets[layer]);
    }
    free(weight_sets);
    free(input_data);
    free_kern(k);
}
// Entry point: initialises the ANE runtime, runs the three benchmarks in order
// and prints a summary table.
//
// Returns 1 if the ANE framework or its classes could not be loaded (detected
// via the class globals, since suite_ane_init() has no return value), otherwise
// 0. Individual benchmark failures are shown in the summary and do not change
// the exit status.
int main(int argc, char *argv[]) {
    (void)argc;
    (void)argv;
    @autoreleasepool {
        suite_ane_init();
        if (!g_D || !g_I || !g_AR || !g_AIO) {
            // suite_ane_init() already printed the specific cause.
            fprintf(stderr, "ERROR: ANE runtime unavailable; aborting benchmark suite\n");
            return 1;
        }

        const char *chip_name = ane_get_chip_name();
        if (!chip_name) {
            // Passing NULL to a %s conversion is undefined behaviour.
            chip_name = "Unknown";
        }
        bool is_m5 = ane_supports_mil_1_5();
        const char *mil_version = MIL_VERSION_1_3.UTF8String;
        const char *ios_target = MIL_TARGET_IOS17.UTF8String;
        print_header(chip_name, mil_version, ios_target);

        LayerStressResult stress_result;
        SequenceSweepResult seq_results[3];
        TrainingSimResult train_result;

        print_section_header("BENCHMARK 1: 24-Layer Stress Test");
        run_layer_stress_test(STRESS_TEST_DIM, STRESS_TEST_LAYERS, is_m5, &stress_result);

        print_section_header("BENCHMARK 2: Long-Sequence Sweep");
        const int seq_values[] = {128, 512, 1024};
        run_long_sequence_sweep(LONG_SEQ_DIM, seq_values, 3, seq_results);

        print_section_header("BENCHMARK 3: Training Throughput Simulator");
        run_training_simulator(TRAINING_DIM, STRESS_TEST_LAYERS, TRAINING_SEQ, &train_result);

        printf("\n");
        printf("║ M5 PIPELINE SUITE SUMMARY ║\n");
        printf("╠══════════════════════════════════════════════════════════════════════════════╣\n");
        printf("║ Benchmark │ Key Metric │ Value ║\n");
        printf("╠═════════════════════════╪═══════════════════════╪════════════════════════════╣\n");

        if (stress_result.success) {
            printf("║ 24-Layer Stress │ Per-Layer GFLOPS │ %8.2f GFLOPS ║\n",
                   stress_result.cumulative_gflops);
        } else {
            printf("║ 24-Layer Stress │ Status │ FAILED ║\n");
        }

        // Report the longest sequence length that actually ran.
        if (seq_results[2].success) {
            printf("║ Long-Sequence (1024) │ Peak GFLOPS │ %8.2f GFLOPS ║\n",
                   seq_results[2].gflops);
        } else if (seq_results[1].success) {
            printf("║ Long-Sequence (512) │ Peak GFLOPS │ %8.2f GFLOPS ║\n",
                   seq_results[1].gflops);
        } else if (seq_results[0].success) {
            printf("║ Long-Sequence (128) │ Peak GFLOPS │ %8.2f GFLOPS ║\n",
                   seq_results[0].gflops);
        } else {
            printf("║ Long-Sequence │ Status │ FAILED ║\n");
        }

        if (train_result.success) {
            printf("║ Training Simulator │ Tokens/Second │ %8.2f TPS ║\n",
                   train_result.tokens_per_second);
        } else {
            printf("║ Training Simulator │ Status │ FAILED ║\n");
        }
        printf("╚══════════════════════════════════════════════════════════════════════════════╝\n");
        printf("\n");
        return 0;
    }
}