Add ANE probe tests and training telemetry for M5 optimization

Four standalone probe tests to characterize the M5 ANE:
- test_weight_reload: Can weights be hot-swapped via unload+load without recompilation?
- test_perf_stats: Enumerate _ANEPerformanceStats methods/properties and hardware counters
- test_qos_sweep: Measure compile/load/eval latency across QoS 0-63
- test_ane_advanced: Probe SharedEvents, weightsBuffer IOSurface, procedureIndex, VirtualClient

Training telemetry (train_large.m):
- JSON lines to stderr with per-step timing breakdown and per-batch TFLOPS metrics
- Enables external monitoring tools to visualize ANE utilization in real-time

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
m0at 2026-03-01 22:54:58 -08:00
parent 4d67db1bdb
commit 40d3f45631
6 changed files with 924 additions and 2 deletions

View File

@ -11,10 +11,26 @@ train: train.m ane_runtime.h ane_mil_gen.h model.h forward.h backward.h
train_large: train_large.m $(HEADERS_LARGE)
$(CC) $(CFLAGS) -o $@ train_large.m $(LDFLAGS) -framework Accelerate
# Standalone ANE probe programs. Each probe is built from the single .m file
# with the same name, using the shared CC/CFLAGS/LDFLAGS settings.
PROBES = test_weight_reload test_perf_stats test_qos_sweep test_ane_advanced
test_weight_reload: test_weight_reload.m
$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
test_perf_stats: test_perf_stats.m
$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
test_qos_sweep: test_qos_sweep.m
$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
test_ane_advanced: test_ane_advanced.m
$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
# Convenience target: build every probe listed in PROBES.
probes: $(PROBES)
tokenize:
python3 tokenize.py
clean:
rm -f train train_large
rm -f train train_large $(PROBES)
.PHONY: clean tokenize
.PHONY: clean tokenize probes

View File

@ -0,0 +1,240 @@
// test_ane_advanced.m: Probe advanced ANE interfaces
// SharedEvents, weightsBuffer, procedureIndex, VirtualClient, ChainingRequest
#import <Foundation/Foundation.h>
#import <objc/runtime.h>
#import <objc/message.h>
#import <dlfcn.h>
#import <IOSurface/IOSurface.h>
#import <mach/mach_time.h>
// Cached Mach timebase ratio; main() fills it once via mach_timebase_info().
static mach_timebase_info_data_t g_tb;

// Converts a mach_absolute_time() tick count (or tick delta) into milliseconds
// using the numer/denom ratio stored in g_tb.
static double tb_ms(uint64_t t) {
    double ticks = (double)t;
    double nanos = ticks * g_tb.numer / g_tb.denom;
    return nanos / 1e6;
}
// Prints the runtime-visible surface of the Objective-C class called `name`:
// its class methods, instance methods and declared properties, each with the
// runtime's type/attribute encoding. Prints "NOT FOUND" when no such class is
// registered. Only members declared on the class itself are listed (the
// class_copy* functions do not walk superclasses).
static void dump_class(const char *name) {
    Class target = NSClassFromString([NSString stringWithUTF8String:name]);
    if (target == Nil) {
        printf(" %s: NOT FOUND\n", name);
        return;
    }
    printf("\n=== %s ===\n", name);

    // Class methods are stored on the metaclass, which object_getClass()
    // returns when handed a Class.
    unsigned int n;
    Method *list = class_copyMethodList(object_getClass(target), &n);
    if (n != 0) printf(" Class methods:\n");
    for (unsigned int k = 0; k < n; k++) {
        const char *types = method_getTypeEncoding(list[k]);
        if (!types) types = "?";
        printf(" + %s [%s]\n", sel_getName(method_getName(list[k])), types);
    }
    free(list);

    list = class_copyMethodList(target, &n);
    if (n != 0) printf(" Instance methods:\n");
    for (unsigned int k = 0; k < n; k++) {
        const char *types = method_getTypeEncoding(list[k]);
        if (!types) types = "?";
        printf(" - %s [%s]\n", sel_getName(method_getName(list[k])), types);
    }
    free(list);

    unsigned int propTotal;
    objc_property_t *propList = class_copyPropertyList(target, &propTotal);
    if (propTotal != 0) printf(" Properties:\n");
    for (unsigned int k = 0; k < propTotal; k++) {
        const char *attrs = property_getAttributes(propList[k]);
        if (!attrs) attrs = "?";
        printf(" @property %s [%s]\n", property_getName(propList[k]), attrs);
    }
    free(propList);
}
// Creates a one-row IOSurface that is `bytes` wide with 1 byte per element,
// i.e. a flat byte buffer. The surface comes from IOSurfaceCreate(), so the
// caller owns it and releases it with CFRelease(). The result of
// IOSurfaceCreate() is returned unchecked.
static IOSurfaceRef make_surface(size_t bytes) {
    NSNumber *byteCount = @(bytes);
    NSDictionary *properties = @{
        (id)kIOSurfaceWidth:           byteCount,
        (id)kIOSurfaceHeight:          @1,
        (id)kIOSurfaceBytesPerElement: @1,
        (id)kIOSurfaceBytesPerRow:     byteCount,
        (id)kIOSurfaceAllocSize:       byteCount,
        (id)kIOSurfacePixelFormat:     @0,
    };
    return IOSurfaceCreate((__bridge CFDictionaryRef)properties);
}
// Returns a printf-safe C string describing `err`. Never returns NULL, so the
// result can always be passed to a "%s" conversion even when no NSError was set.
static const char *error_cstr(NSError *err) {
    const char *text = err ? [[err description] UTF8String] : NULL;
    return text ? text : "(no NSError returned)";
}

// Wraps an IOSurface in an _ANEIOSurfaceObject via +objectWithIOSurface:.
static id wrap_surface(Class ioClass, IOSurfaceRef surface) {
    return ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(ioClass, @selector(objectWithIOSurface:), surface);
}

// Builds a one-input/one-output _ANERequest with perfStats=nil.
// Returns nil when either surface could not be wrapped, because the @[...]
// literals below would throw on a nil element.
static id make_request(Class reqClass, Class ioClass, IOSurfaceRef ioIn, IOSurfaceRef ioOut,
                       id weightsBuffer, NSNumber *procIdx) {
    id wI = wrap_surface(ioClass, ioIn);
    id wO = wrap_surface(ioClass, ioOut);
    if (!wI || !wO) return nil;
    return ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(reqClass,
        @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
        @[wI], @[@0], @[wO], @[@0], weightsBuffer, nil, procIdx);
}

// Evaluates `req` on `mdl` at QoS 21. *outErr always reflects THIS call (nil on
// success or when the API reports no error), so a stale error from an earlier
// evaluation is never reported.
static BOOL evaluate(id mdl, id req, NSError **outErr) {
    NSError *err = nil;
    BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
        mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &err);
    if (outErr) *outErr = err;
    return ok;
}

// Runs the request-level probes against an already loaded model:
//   - baseline evaluation with weightsBuffer=nil, procedureIndex=0
//   - evaluation with a weightsBuffer IOSurface holding 3x-identity weights
//   - Part 4: procedureIndex sweep 0..15
// `ioIn`/`ioOut` must be non-NULL surfaces of at least IC*SP fp16 elements.
static void probe_requests(id mdl, Class g_AR, Class g_AIO,
                           IOSurfaceRef ioIn, IOSurfaceRef ioOut, int IC, int SP) {
    NSError *e = nil;

    // Write input: every channel holds the ramp 1, 2, ..., SP.
    IOSurfaceLock(ioIn, 0, NULL);
    _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn);
    for (int c = 0; c < IC; c++) for (int s = 0; s < SP; s++) inp[c*SP+s] = (_Float16)(s+1.0f);
    IOSurfaceUnlock(ioIn, 0, NULL);

    // Normal eval first (baseline)
    id req0 = make_request(g_AR, g_AIO, ioIn, ioOut, nil, @0);
    BOOL ok = req0 ? evaluate(mdl, req0, &e) : NO;
    printf(" Baseline eval (weightsBuffer=nil, procIdx=0): %s\n", ok ? "OK" : "FAIL");
    IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL);
    _Float16 *out0 = (_Float16*)IOSurfaceGetBaseAddress(ioOut);
    printf(" Output: [%.1f, %.1f, %.1f, %.1f, ...]\n",
           (float)out0[0], (float)out0[1], (float)out0[2], (float)out0[3]);
    IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL);

    // Test weightsBuffer: create IOSurface with weight data (3x identity, so a
    // working override turns the first output element from 1.0 into 3.0).
    printf("\n Testing weightsBuffer IOSurface...\n");
    _Float16 weights2[16];
    for (int i = 0; i < 16; i++) weights2[i] = (i/4 == i%4) ? (_Float16)3.0f : (_Float16)0.0f;
    IOSurfaceRef ioW = make_surface(sizeof(weights2));
    if (!ioW) {
        printf(" Request with weightsBuffer: %s\n", "nil");
    } else {
        IOSurfaceLock(ioW, 0, NULL);
        memcpy(IOSurfaceGetBaseAddress(ioW), weights2, sizeof(weights2));
        IOSurfaceUnlock(ioW, 0, NULL);
        id wW = wrap_surface(g_AIO, ioW);
        // Without a wrapped weights surface the request would silently equal the baseline.
        id req_wb = wW ? make_request(g_AR, g_AIO, ioIn, ioOut, wW, @0) : nil;
        printf(" Request with weightsBuffer: %s\n", req_wb ? "created" : "nil");
        if (req_wb) {
            ok = evaluate(mdl, req_wb, &e);
            printf(" Eval with weightsBuffer: %s\n", ok ? "OK" : error_cstr(e));
            if (ok) {
                IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL);
                _Float16 *outW = (_Float16*)IOSurfaceGetBaseAddress(ioOut);
                printf(" Output (3x identity via weightsBuffer): [%.1f, %.1f, %.1f, %.1f, ...]\n",
                       (float)outW[0], (float)outW[1], (float)outW[2], (float)outW[3]);
                bool is_3x = fabsf((float)outW[0] - 3.0f) < 0.1f;
                printf(" weightsBuffer override %s\n", is_3x ? "WORKS!" : "does NOT work (output unchanged)");
                IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL);
            }
        }
        CFRelease(ioW);
    }

    // === Part 4: procedureIndex sweep ===
    printf("\n--- Part 4: procedureIndex sweep (0-15) ---\n");
    for (int pi = 0; pi < 16; pi++) {
        id req_p = make_request(g_AR, g_AIO, ioIn, ioOut, nil, @(pi));
        if (!req_p) { printf(" procIdx %2d: request=nil\n", pi); continue; }
        ok = evaluate(mdl, req_p, &e);
        printf(" procIdx %2d: %s%s\n", pi, ok ? "OK" : "FAIL",
               !ok && e ? [NSString stringWithFormat:@" (%@)", [e localizedDescription]].UTF8String : "");
    }
}

// Part 3 setup/teardown: compiles and loads a 4x4 identity 1x1-conv kernel,
// hands it to probe_requests(), then unloads the model and removes the temp
// directory. Every failure is reported on stdout and aborts only this part,
// so the caller can continue with the remaining probes.
static void run_kernel_probes(void) {
    Class g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor");
    Class g_I = NSClassFromString(@"_ANEInMemoryModel");
    Class g_AR = NSClassFromString(@"_ANERequest");
    Class g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
    if (!g_D || !g_I || !g_AR || !g_AIO) {
        printf(" FAIL: ANE classes not found\n");
        return;
    }

    int IC = 4, OC = 4, SP = 4;
    _Float16 weights[16];
    for (int i = 0; i < 16; i++) weights[i] = (i/4 == i%4) ? (_Float16)1.0f : (_Float16)0.0f;

    // Weight blob: 128 header bytes followed by the fp16 payload. Bytes 64..67
    // spell 0xDEADBEEF in little-endian order and the MIL below references
    // offset 64. NOTE(review): the fields at 72 and 80 look like payload size
    // and payload offset; confirm against the blob format.
    int ws = 16*2, tot = 128+ws;
    uint8_t *blob = (uint8_t*)calloc(tot,1);
    if (!blob) { printf(" FAIL: calloc(%d) failed\n", tot); return; }
    blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1;
    *(uint32_t*)(blob+72)=ws; *(uint32_t*)(blob+80)=128;
    memcpy(blob+128, weights, ws);
    // wdata takes ownership of blob and frees it on dealloc.
    NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES];

    NSString *mil = [NSString stringWithFormat:
        @"program(1.3)\n"
        "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
        "{\"coremltools-version\", \"9.0\"}})]\n"
        "{\n"
        " func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
        " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n"
        " tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
        " tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
        " tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
        " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
        " tensor<fp16, [%d,%d,1,1]> W = const()[name=string(\"W\"), "
        "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n"
        " tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)"
        "[name=string(\"conv\")];\n"
        " } -> (y);\n"
        "}\n", IC, SP, OC, IC, OC, IC, OC, SP];
    NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];

    id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:),
        md, @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata}}, nil);
    if (!desc) { printf(" FAIL: desc=NULL\n"); return; }
    id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
    id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
    // A nil identifier would make stringByAppendingPathComponent: throw.
    if (!hx) { printf(" FAIL: model=NULL\n"); return; }

    NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
    NSFileManager *fm = [NSFileManager defaultManager];
    [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"]
        withIntermediateDirectories:YES attributes:nil error:nil];
    [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
    [wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];

    NSError *e = nil;
    BOOL compiled = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
        mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
    if (!compiled) {
        printf(" FAIL: compile: %s\n", error_cstr(e));
        [fm removeItemAtPath:td error:nil];
        return;
    }
    e = nil;
    BOOL loaded = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
        mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
    if (!loaded) {
        printf(" FAIL: load: %s\n", error_cstr(e));
        [fm removeItemAtPath:td error:nil];
        return;
    }

    IOSurfaceRef ioIn = make_surface(IC*SP*2);
    IOSurfaceRef ioOut = make_surface(OC*SP*2);
    if (ioIn && ioOut) {
        probe_requests(mdl, g_AR, g_AIO, ioIn, ioOut, IC, SP);
    } else {
        printf(" FAIL: IOSurfaceCreate returned NULL\n");
    }

    // Cleanup
    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
    [fm removeItemAtPath:td error:nil];
    if (ioIn) CFRelease(ioIn);
    if (ioOut) CFRelease(ioOut);
}

// Entry point of the advanced ANE interface probe. Output goes to unbuffered
// stdout. Parts:
//   1. dump and try to instantiate the event/sync classes
//   2. dump VirtualClient / chaining / multi / batch request classes
//   3. weightsBuffer IOSurface test, and 4. procedureIndex sweep
//      (both inside run_kernel_probes)
//   5. list every registered class whose name contains "ANE" or "ane"
// A failure in parts 3/4 is reported but does not stop part 5.
int main() {
    @autoreleasepool {
        setbuf(stdout, NULL);
        mach_timebase_info(&g_tb);
        if (!dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW)) {
            const char *why = dlerror();
            printf("WARNING: could not load AppleNeuralEngine.framework: %s\n", why ? why : "unknown error");
        }
        printf("=== ANE Advanced Interface Probe ===\n");

        // === Part 1: Event/Sync classes ===
        printf("\n--- Part 1: Event/Sync Classes ---\n");
        dump_class("_ANESharedEvents");
        dump_class("_ANESharedSignalEvent");
        dump_class("_ANESharedWaitEvent");
        dump_class("_ANEEvent");
        dump_class("_ANEFenceEvent");
        // Try instantiate
        const char *event_classes[] = {
            "_ANESharedEvents", "_ANESharedSignalEvent", "_ANESharedWaitEvent",
            "_ANEEvent", "_ANEFenceEvent", NULL
        };
        for (int i = 0; event_classes[i]; i++) {
            Class cls = NSClassFromString([NSString stringWithUTF8String:event_classes[i]]);
            if (!cls) continue;
            // Private classes may reject plain -init; report instead of aborting.
            @try {
                id obj = [[cls alloc] init];
                printf(" %s alloc/init: %s\n", event_classes[i],
                       obj ? [[obj description] UTF8String] : "nil");
            } @catch (NSException *ex) {
                printf(" %s alloc/init: EXCEPTION: %s\n", event_classes[i], [[ex reason] UTF8String]);
            }
        }

        // === Part 2: VirtualClient and ChainingRequest ===
        printf("\n--- Part 2: VirtualClient / ChainingRequest ---\n");
        dump_class("_ANEVirtualClient");
        dump_class("_ANEChainingRequest");
        dump_class("_ANEMultiRequest");
        dump_class("_ANEBatchRequest");

        // === Part 3: weightsBuffer parameter test (Part 4 runs inside as well) ===
        printf("\n--- Part 3: weightsBuffer IOSurface test ---\n");
        run_kernel_probes();

        // === Part 5: Scan all ANE classes ===
        printf("\n--- Part 5: All ANE-prefixed classes ---\n");
        unsigned int classCount = 0;
        Class *allClasses = objc_copyClassList(&classCount);
        for (unsigned int i = 0; i < classCount; i++) {
            const char *name = class_getName(allClasses[i]);
            if (strstr(name, "ANE") || strstr(name, "ane")) {
                printf(" %s\n", name);
            }
        }
        free(allClasses);

        printf("\nDone.\n");
    }
    return 0;
}

248
training/test_perf_stats.m Normal file
View File

@ -0,0 +1,248 @@
// test_perf_stats.m: What does _ANEPerformanceStats expose?
// Probe class methods, properties, instantiate, pass to request, read back.
#import <Foundation/Foundation.h>
#import <objc/runtime.h>
#import <objc/message.h>
#import <dlfcn.h>
#import <IOSurface/IOSurface.h>
#import <mach/mach_time.h>
// Mach timebase ratio, populated once by main() through mach_timebase_info().
static mach_timebase_info_data_t g_tb;

// Turns a mach_absolute_time() tick count into milliseconds by scaling with
// g_tb.numer / g_tb.denom (ticks -> nanoseconds) and dividing by 1e6.
static double tb_ms(uint64_t t) {
    double elapsed = (double)t;
    elapsed = elapsed * g_tb.numer / g_tb.denom;
    return elapsed / 1e6;
}
// Dumps what the Objective-C runtime knows about the class called `name`:
// class methods, instance methods, declared properties and adopted protocols.
// Prints "NOT FOUND" and returns when the class is not registered. Only items
// declared on the class itself are shown (the class_copy* calls do not include
// superclasses).
static void dump_class(const char *name) {
    Class target = NSClassFromString([NSString stringWithUTF8String:name]);
    if (target == Nil) {
        printf(" %s: NOT FOUND\n", name);
        return;
    }
    printf("\n=== %s ===\n", name);

    // Class methods: stored on the metaclass returned by object_getClass().
    unsigned int n;
    Method *list = class_copyMethodList(object_getClass(target), &n);
    if (n != 0) printf(" Class methods:\n");
    for (unsigned int k = 0; k < n; k++) {
        const char *types = method_getTypeEncoding(list[k]);
        if (!types) types = "?";
        printf(" + %s [%s]\n", sel_getName(method_getName(list[k])), types);
    }
    free(list);

    // Instance methods
    list = class_copyMethodList(target, &n);
    if (n != 0) printf(" Instance methods:\n");
    for (unsigned int k = 0; k < n; k++) {
        const char *types = method_getTypeEncoding(list[k]);
        if (!types) types = "?";
        printf(" - %s [%s]\n", sel_getName(method_getName(list[k])), types);
    }
    free(list);

    // Properties
    unsigned int propTotal;
    objc_property_t *propList = class_copyPropertyList(target, &propTotal);
    if (propTotal != 0) printf(" Properties:\n");
    for (unsigned int k = 0; k < propTotal; k++) {
        const char *attrs = property_getAttributes(propList[k]);
        if (!attrs) attrs = "?";
        printf(" @property %s [%s]\n", property_getName(propList[k]), attrs);
    }
    free(propList);

    // Protocols: printed on a single line, and only when at least one is adopted.
    unsigned int protoTotal;
    Protocol * __unsafe_unretained *protoList = class_copyProtocolList(target, &protoTotal);
    if (protoTotal != 0) {
        printf(" Protocols:");
        unsigned int k = 0;
        while (k < protoTotal) {
            printf(" %s", protocol_getName(protoList[k]));
            k++;
        }
        printf("\n");
    }
    free(protoList);
}
// Allocates a flat IOSurface of `bytes` bytes: one row, `bytes` elements wide,
// one byte per element, pixel format 0. The surface is returned straight from
// IOSurfaceCreate(); the caller is responsible for CFRelease().
static IOSurfaceRef make_surface(size_t bytes) {
    NSNumber *length = @(bytes);
    NSDictionary *spec = @{
        (id)kIOSurfaceWidth:           length,
        (id)kIOSurfaceHeight:          @1,
        (id)kIOSurfaceBytesPerElement: @1,
        (id)kIOSurfaceBytesPerRow:     length,
        (id)kIOSurfaceAllocSize:       length,
        (id)kIOSurfacePixelFormat:     @0,
    };
    IOSurfaceRef surface = IOSurfaceCreate((__bridge CFDictionaryRef)spec);
    return surface;
}
// Returns a printf-safe C string describing `err`. Never returns NULL, so it is
// always valid as a "%s" argument even when the API left the error nil.
static const char *error_cstr(NSError *err) {
    const char *text = err ? [[err description] UTF8String] : NULL;
    return text ? text : "(no NSError returned)";
}

// Prints "name = value" for every property declared on `cls`, reading each one
// from `obj` through KVC. A property whose getter throws is reported inline;
// `showReason` selects whether the exception reason is included.
static void print_property_values(id obj, Class cls, BOOL showReason) {
    unsigned int pcount = 0;
    objc_property_t *props = class_copyPropertyList(cls, &pcount);
    for (unsigned int i = 0; i < pcount; i++) {
        const char *pname = property_getName(props[i]);
        @try {
            id val = [obj valueForKey:[NSString stringWithUTF8String:pname]];
            printf(" %s = %s\n", pname, val ? [[val description] UTF8String] : "nil");
        } @catch (NSException *ex) {
            if (showReason) {
                const char *reason = [[ex reason] UTF8String];
                printf(" %s = <exception: %s>\n", pname, reason ? reason : "?");
            } else {
                printf(" %s = <exception>\n", pname);
            }
        }
    }
    free(props);
}

// Compiles and loads a 4x4 identity 1x1-conv kernel, then (when `perfClass` is
// available) builds a request carrying a fresh perfStats object, evaluates it
// once and then 100 more times, dumping the perfStats properties after each
// phase. All failures are reported on stdout; the model is unloaded and the
// temp directory removed before returning.
static void run_perf_stats_kernel_test(Class perfClass) {
    Class g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor");
    Class g_I = NSClassFromString(@"_ANEInMemoryModel");
    Class g_AR = NSClassFromString(@"_ANERequest");
    Class g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
    if (!g_D || !g_I || !g_AR || !g_AIO) {
        printf(" FAIL: ANE classes not found\n");
        return;
    }

    int IC = 4, OC = 4, SP = 4;
    _Float16 weights[16];
    for (int i = 0; i < 16; i++) weights[i] = (i/4 == i%4) ? (_Float16)1.0f : (_Float16)0.0f;

    // Weight blob: 128 header bytes followed by the fp16 payload. Bytes 64..67
    // spell 0xDEADBEEF in little-endian order and the MIL below references
    // offset 64. NOTE(review): the fields at 72 and 80 look like payload size
    // and payload offset; confirm against the blob format.
    int ws = 16*2, tot = 128+ws;
    uint8_t *blob = (uint8_t*)calloc(tot,1);
    if (!blob) { printf(" FAIL: calloc(%d) failed\n", tot); return; }
    blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1;
    *(uint32_t*)(blob+72)=ws; *(uint32_t*)(blob+80)=128;
    memcpy(blob+128, weights, ws);
    // wdata takes ownership of blob and frees it on dealloc.
    NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES];

    NSString *mil = [NSString stringWithFormat:
        @"program(1.3)\n"
        "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
        "{\"coremltools-version\", \"9.0\"}})]\n"
        "{\n"
        " func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
        " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n"
        " tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
        " tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
        " tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
        " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
        " tensor<fp16, [%d,%d,1,1]> W = const()[name=string(\"W\"), "
        "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n"
        " tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)"
        "[name=string(\"conv\")];\n"
        " } -> (y);\n"
        "}\n", IC, SP, OC, IC, OC, IC, OC, SP];
    NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];

    id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:),
        md, @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata}}, nil);
    if (!desc) { printf(" FAIL: desc=NULL\n"); return; }
    id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
    id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
    // A nil identifier would make stringByAppendingPathComponent: throw.
    if (!hx) { printf(" FAIL: model=NULL\n"); return; }

    NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
    NSFileManager *fm = [NSFileManager defaultManager];
    [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"]
        withIntermediateDirectories:YES attributes:nil error:nil];
    [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
    [wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];

    NSError *e = nil;
    BOOL compiled = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
        mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
    if (!compiled) {
        printf(" FAIL: compile: %s\n", error_cstr(e));
        [fm removeItemAtPath:td error:nil];
        return;
    }
    e = nil;
    BOOL loaded = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
        mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
    if (!loaded) {
        printf(" FAIL: load: %s\n", error_cstr(e));
        [fm removeItemAtPath:td error:nil];
        return;
    }

    IOSurfaceRef ioIn = make_surface(IC*SP*2);
    IOSurfaceRef ioOut = make_surface(OC*SP*2);
    id wI = ioIn ? ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn) : nil;
    id wO = ioOut ? ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut) : nil;

    if (!wI || !wO) {
        // @[wI] / @[wO] below would throw on nil, so stop here.
        printf(" FAIL: could not create or wrap IOSurfaces\n");
    } else if (perfClass) {
        // Try creating request WITH perfStats
        id perfStats = [[perfClass alloc] init];
        printf(" Creating request with perfStats=%s\n", perfStats ? "non-nil" : "nil");
        id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
            @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
            @[wI], @[@0], @[wO], @[@0], nil, perfStats, @0);
        printf(" Request: %s\n", req ? "created" : "nil");
        if (req) {
            // Write input
            IOSurfaceLock(ioIn, 0, NULL);
            _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn);
            for (int i = 0; i < IC*SP; i++) inp[i] = (_Float16)1.0f;
            IOSurfaceUnlock(ioIn, 0, NULL);

            // Eval
            e = nil;
            BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
                mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
            printf(" Eval: %s\n", ok ? "OK" : error_cstr(e));

            // Read perfStats after eval
            if (ok && perfStats) {
                printf(" PerfStats after eval:\n");
                print_property_values(perfStats, perfClass, NO);

                // Run 100 evals and check if counters accumulate
                printf("\n Running 100 evals...\n");
                for (int i = 0; i < 100; i++) {
                    ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
                        mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
                }
                printf(" PerfStats after 100 evals:\n");
                print_property_values(perfStats, perfClass, NO);
            }
        }
    }

    // Cleanup
    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
    [fm removeItemAtPath:td error:nil];
    if (ioIn) CFRelease(ioIn);
    if (ioOut) CFRelease(ioOut);
}

// Entry point of the _ANEPerformanceStats probe. Output goes to unbuffered
// stdout. Steps:
//   1. dump the runtime surface of the perf-related and core ANE classes
//   2. try to alloc/init _ANEPerformanceStats and read its properties via KVC
//   3. compile a small kernel and pass a perfStats object through a request
//   4. print a hint for inspecting the IORegistry manually
int main() {
    @autoreleasepool {
        setbuf(stdout, NULL);
        mach_timebase_info(&g_tb);
        if (!dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW)) {
            const char *why = dlerror();
            printf("WARNING: could not load AppleNeuralEngine.framework: %s\n", why ? why : "unknown error");
        }
        printf("=== ANE Performance Stats Probe ===\n");

        // Dump all ANE-related classes
        dump_class("_ANEPerformanceStats");
        dump_class("_ANEPerfRequest");
        dump_class("ANEPerfRequest");
        dump_class("_ANEPerformanceCounters");
        dump_class("_ANEDeviceInfo");
        dump_class("_ANEModel");
        dump_class("_ANEInMemoryModel");
        dump_class("_ANERequest");
        dump_class("_ANEIOSurfaceObject");
        dump_class("_ANEInMemoryModelDescriptor");
        dump_class("_ANEClient");
        dump_class("_ANEVirtualClient");

        // Try to instantiate _ANEPerformanceStats
        printf("\n=== Instantiation Tests ===\n");
        Class perfClass = NSClassFromString(@"_ANEPerformanceStats");
        if (perfClass) {
            // Try alloc/init; a private class may throw from -init or -description.
            @try {
                id perfStats = [[perfClass alloc] init];
                printf("_ANEPerformanceStats alloc/init: %s\n",
                       perfStats ? [[perfStats description] UTF8String] : "nil");
                // Try to read all properties via KVC
                if (perfStats) print_property_values(perfStats, perfClass, YES);
            } @catch (NSException *ex) {
                printf("Exception: %s\n", [[ex reason] UTF8String]);
            }
        }

        // === Compile a simple kernel and try passing perfStats to request ===
        printf("\n=== Compile kernel and test perfStats in request ===\n");
        run_perf_stats_kernel_test(perfClass);

        // Also probe IORegistry for ANE perf data
        printf("\n=== IORegistry ANE info ===\n");
        printf(" (run: ioreg -r -c H11ANEIn | head -100)\n");
    }
    return 0;
}

154
training/test_qos_sweep.m Normal file
View File

@ -0,0 +1,154 @@
// test_qos_sweep.m: Does QoS affect frequency/latency?
// Sweep QoS 0-63 on compile, load, eval of a simple kernel.
#import <Foundation/Foundation.h>
#import <objc/runtime.h>
#import <objc/message.h>
#import <dlfcn.h>
#import <IOSurface/IOSurface.h>
#import <mach/mach_time.h>
// Timebase ratio used by tb_ms(); main() initialises it with mach_timebase_info().
static mach_timebase_info_data_t g_tb;

// Converts mach_absolute_time() ticks to milliseconds: ticks * numer / denom
// yields nanoseconds, which are then divided by 1e6.
static double tb_ms(uint64_t t) {
    const double nsPerMs = 1e6;
    double scaled = (double)t * g_tb.numer;
    return scaled / g_tb.denom / nsPerMs;
}
// Returns a new IOSurface laid out as a single row of `bytes` one-byte
// elements (pixel format 0). Ownership follows IOSurfaceCreate(): the caller
// must CFRelease() the surface. No NULL check is performed here.
static IOSurfaceRef make_surface(size_t bytes) {
    NSNumber *size = @(bytes);
    NSNumber *one = @1;
    NSDictionary *attrs = @{
        (id)kIOSurfaceWidth:           size,
        (id)kIOSurfaceHeight:          one,
        (id)kIOSurfaceBytesPerElement: one,
        (id)kIOSurfaceBytesPerRow:     size,
        (id)kIOSurfaceAllocSize:       size,
        (id)kIOSurfacePixelFormat:     @0,
    };
    return IOSurfaceCreate((__bridge CFDictionaryRef)attrs);
}
// Entry point of the QoS sweep. For each QoS value in a fixed list it builds a
// fresh in-memory model of a 256x256 1x1-conv kernel (spatial=64), then times
// compile, load, one evaluation and the average of 10 further evaluations,
// printing one table row per QoS value. A failing stage ends that row with
// COMPILE_FAIL / LOAD_FAIL / SURFACE_FAIL / EVAL_FAIL and the sweep continues.
// Returns 1 when the private ANE classes or the weight buffer are unavailable.
int main() {
    @autoreleasepool {
        setbuf(stdout, NULL);
        mach_timebase_info(&g_tb);
        dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
        Class g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor");
        Class g_I = NSClassFromString(@"_ANEInMemoryModel");
        Class g_AR = NSClassFromString(@"_ANERequest");
        Class g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
        // Fail fast instead of printing a table full of COMPILE_FAIL rows.
        if (!g_D || !g_I || !g_AR || !g_AIO) {
            printf("FAIL: ANE classes not found\n");
            return 1;
        }

        // Larger kernel for measurable latency: 256x256 conv, spatial=64
        int IC = 256, OC = 256, SP = 64;

        // Weight blob: 128 header bytes followed by the fp16 payload. Bytes
        // 64..67 spell 0xDEADBEEF in little-endian order and the MIL below
        // references offset 64. NOTE(review): the fields at 72 and 80 look like
        // payload size and payload offset; confirm against the blob format.
        int ws = IC*OC*2, tot = 128+ws;
        uint8_t *blob = (uint8_t*)calloc(tot, 1);
        if (!blob) {
            printf("FAIL: calloc(%d) failed\n", tot);
            return 1;
        }
        blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1;
        *(uint32_t*)(blob+72)=ws; *(uint32_t*)(blob+80)=128;
        // Deterministic (not random) weights: values -0.50 .. 0.49 in steps of
        // 0.01, repeating every 100 elements.
        _Float16 *wp = (_Float16*)(blob+128);
        for (int i = 0; i < IC*OC; i++) wp[i] = (_Float16)(0.01f * (i % 100 - 50));
        // wdata takes ownership of blob and frees it on dealloc.
        NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES];

        NSString *mil = [NSString stringWithFormat:
            @"program(1.3)\n"
            "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
            "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
            "{\"coremltools-version\", \"9.0\"}})]\n"
            "{\n"
            " func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
            " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n"
            " tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
            " tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
            " tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
            " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
            " tensor<fp16, [%d,%d,1,1]> W = const()[name=string(\"W\"), "
            "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n"
            " tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)"
            "[name=string(\"conv\")];\n"
            " } -> (y);\n"
            "}\n", IC, SP, OC, IC, OC, IC, OC, SP];
        NSDictionary *weights = @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata}};
        NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding];
        NSFileManager *fm = [NSFileManager defaultManager];

        printf("=== QoS Sweep: compile/load/eval with QoS 0-63 ===\n");
        printf("Kernel: %dx%d conv, spatial=%d (%.1f MFLOPS)\n", IC, OC, SP, 2.0*IC*OC*SP/1e6);
        printf("%4s %10s %10s %10s %10s %s\n", "QoS", "Compile", "Load", "Eval(1)", "Eval(avg10)", "Status");

        unsigned int qos_values[] = {0, 1, 5, 10, 15, 17, 19, 21, 25, 31, 33, 40, 47, 50, 55, 60, 63};
        int n_qos = sizeof(qos_values)/sizeof(qos_values[0]);
        for (int qi = 0; qi < n_qos; qi++) {
            // Drain per-iteration temporaries (model, request, strings) so
            // they do not accumulate across the sweep.
            @autoreleasepool {
                unsigned int qos = qos_values[qi];
                NSError *e = nil;
                id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:),
                    milData, weights, nil);
                // With a nil descriptor keep mdl nil; the compile call below then
                // returns NO and the row is reported as COMPILE_FAIL.
                id mdl = desc ? ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc) : nil;
                id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
                NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:
                    [NSString stringWithFormat:@"qos_test_%u_%@", qos, hx]];
                [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"]
                    withIntermediateDirectories:YES attributes:nil error:nil];
                [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
                [wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];

                // Compile
                uint64_t t0 = mach_absolute_time();
                BOOL cok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
                    mdl, @selector(compileWithQoS:options:error:), qos, @{}, &e);
                double cms = tb_ms(mach_absolute_time() - t0);
                if (!cok) {
                    printf("%4u %10s %10s %10s %10s COMPILE_FAIL\n", qos, "-", "-", "-", "-");
                    [fm removeItemAtPath:td error:nil];
                    continue;
                }

                // Load
                t0 = mach_absolute_time();
                BOOL lok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
                    mdl, @selector(loadWithQoS:options:error:), qos, @{}, &e);
                double lms = tb_ms(mach_absolute_time() - t0);
                if (!lok) {
                    printf("%4u %8.1fms %10s %10s %10s LOAD_FAIL\n", qos, cms, "-", "-", "-");
                    [fm removeItemAtPath:td error:nil];
                    continue;
                }

                // Build request
                IOSurfaceRef ioIn = make_surface(IC*SP*2);
                IOSurfaceRef ioOut = make_surface(OC*SP*2);
                id wI = ioIn ? ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn) : nil;
                id wO = ioOut ? ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut) : nil;

                if (!wI || !wO) {
                    // @[wI] / @[wO] would throw on nil and the surface base
                    // address could not be written; report and move on.
                    printf("%4u %8.1fms %8.1fms %10s %10s SURFACE_FAIL\n", qos, cms, lms, "-", "-");
                } else {
                    id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
                        @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
                        @[wI], @[@0], @[wO], @[@0], nil, nil, @0);

                    // Write input
                    IOSurfaceLock(ioIn, 0, NULL);
                    _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn);
                    for (int i = 0; i < IC*SP; i++) inp[i] = (_Float16)0.5f;
                    IOSurfaceUnlock(ioIn, 0, NULL);

                    // Eval with same QoS
                    t0 = mach_absolute_time();
                    BOOL eok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
                        mdl, @selector(evaluateWithQoS:options:request:error:), qos, @{}, req, &e);
                    double ems1 = tb_ms(mach_absolute_time() - t0);
                    if (!eok) {
                        printf("%4u %8.1fms %8.1fms %10s %10s EVAL_FAIL\n", qos, cms, lms, "-", "-");
                    } else {
                        // Average over 10 evals
                        t0 = mach_absolute_time();
                        for (int i = 0; i < 10; i++) {
                            ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
                                mdl, @selector(evaluateWithQoS:options:request:error:), qos, @{}, req, &e);
                        }
                        double ems_avg = tb_ms(mach_absolute_time() - t0) / 10.0;
                        printf("%4u %8.1fms %8.1fms %8.2fms %8.2fms OK\n", qos, cms, lms, ems1, ems_avg);
                    }
                }

                // Cleanup
                // NOTE(review): unload always uses QoS 21 rather than the swept
                // value; confirm this is intentional.
                ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
                if (ioIn) CFRelease(ioIn);
                if (ioOut) CFRelease(ioOut);
                [fm removeItemAtPath:td error:nil];
            }
        }
        printf("\nDone.\n");
    }
    return 0;
}

View File

@ -0,0 +1,241 @@
// test_weight_reload.m: Can we skip recompilation by rewriting weight blobs on disk?
// Compile a conv kernel with weights A, eval, verify output.
// Overwrite weights/weight.bin in tmpDir with weights B.
// unloadWithQoS: then loadWithQoS: (no recompile).
// Eval again: if the output matches B @ x, the compilation bottleneck is eliminated.
#import <Foundation/Foundation.h>
#import <objc/runtime.h>
#import <objc/message.h>
#import <dlfcn.h>
#import <IOSurface/IOSurface.h>
#import <mach/mach_time.h>
#include <math.h>
// Mach timebase ratio; filled in by main() with mach_timebase_info() before
// any timing is taken.
static mach_timebase_info_data_t g_tb;

// Converts a mach_absolute_time() tick delta to milliseconds using g_tb.
static double tb_ms(uint64_t t) {
    double value = (double)t;
    value = value * g_tb.numer;   // apply the timebase numerator
    value = value / g_tb.denom;   // ...and denominator: now nanoseconds
    return value / 1e6;           // nanoseconds -> milliseconds
}
// Creates an IOSurface used as a plain byte buffer: width = `bytes`, height 1,
// one byte per element, pixel format 0. The caller owns the returned surface
// and must CFRelease() it; the IOSurfaceCreate() result is passed through
// unchecked.
static IOSurfaceRef make_surface(size_t bytes) {
    NSNumber *total = @(bytes);
    NSDictionary *layout = @{
        (id)kIOSurfaceWidth:           total,
        (id)kIOSurfaceHeight:          @1,
        (id)kIOSurfaceBytesPerElement: @1,
        (id)kIOSurfaceBytesPerRow:     total,
        (id)kIOSurfaceAllocSize:       total,
        (id)kIOSurfacePixelFormat:     @0,
    };
    CFDictionaryRef cfLayout = (__bridge CFDictionaryRef)layout;
    return IOSurfaceCreate(cfLayout);
}
int main() {
@autoreleasepool {
setbuf(stdout, NULL);
mach_timebase_info(&g_tb);
dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
Class g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor");
Class g_I = NSClassFromString(@"_ANEInMemoryModel");
Class g_AR = NSClassFromString(@"_ANERequest");
Class g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
if (!g_D || !g_I || !g_AR || !g_AIO) {
printf("FAIL: ANE classes not found\n");
return 1;
}
// Small test: 4x4 conv kernel, spatial=4
int IC = 4, OC = 4, SP = 4;
// Weight set A: identity matrix
_Float16 weightsA[16];
for (int i = 0; i < IC*OC; i++) weightsA[i] = (i / OC == i % OC) ? (_Float16)1.0f : (_Float16)0.0f;
// Weight set B: 2x identity
_Float16 weightsB[16];
for (int i = 0; i < IC*OC; i++) weightsB[i] = (i / OC == i % OC) ? (_Float16)2.0f : (_Float16)0.0f;
// Build weight blob for A
int ws = IC * OC * 2;
int tot = 128 + ws;
uint8_t *blob = (uint8_t*)calloc(tot, 1);
blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1;
*(uint32_t*)(blob+72) = ws;
*(uint32_t*)(blob+80) = 128;
memcpy(blob + 128, weightsA, ws);
NSData *wdataA = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES];
// MIL for a simple conv
NSString *mil = [NSString stringWithFormat:
@"program(1.3)\n"
"[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
"{\"coremltools-version\", \"9.0\"}})]\n"
"{\n"
" func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
" string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
" tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
" tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
" int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
" tensor<fp16, [%d,%d,1,1]> W = const()[name=string(\"W\"), "
"val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n"
" tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)"
"[name=string(\"conv\")];\n"
" } -> (y);\n"
"}\n", IC, SP, OC, IC, OC, IC, OC, SP];
NSDictionary *weights = @{
@"@model_path/weights/weight.bin": @{@"offset": @0, @"data": wdataA}
};
NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding];
// === Compile with weights A ===
// Creates the in-memory model from the MIL text and weight table, stages
// model.mil and weights/weight.bin under a temp directory named after the
// model's hex identifier, then compiles and loads the model at QoS 21.
// The reported time covers staging + compile + load.
// Every step is checked: a nil object or a failed write aborts with a message
// instead of surfacing later as an exception or a misleading compile error.
printf("=== Step 1: Compile with weights A (identity) ===\n");
uint64_t t0 = mach_absolute_time();
id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), milData, weights, nil);
if (!desc) { printf("FAIL: desc=NULL\n"); return 1; }
id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
if (!mdl) { printf("FAIL: mdl=NULL\n"); return 1; }
id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
// Without an identifier there is no temp directory name to build.
if (!hx) { printf("FAIL: hexStringIdentifier=NULL\n"); return 1; }
NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
NSFileManager *fm = [NSFileManager defaultManager];
NSError *e = nil;
if (![fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:&e]) {
    printf("FAIL: mkdir %s: %s\n", [td UTF8String], e ? [[e description] UTF8String] : "?");
    return 1;
}
if (![milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES] ||
    ![wdataA writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]) {
    printf("FAIL: could not stage model files in %s\n", [td UTF8String]);
    [fm removeItemAtPath:td error:nil];
    return 1;
}
e = nil;
BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
if (!ok) {
    // e may legitimately be nil on failure; never hand NULL to %s.
    printf("FAIL: compile: %s\n", e ? [[e description] UTF8String] : "?");
    [fm removeItemAtPath:td error:nil];
    return 1;
}
e = nil;
ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
if (!ok) {
    printf("FAIL: load: %s\n", e ? [[e description] UTF8String] : "?");
    [fm removeItemAtPath:td error:nil];
    return 1;
}
printf(" Compile+load: %.1fms\n", tb_ms(mach_absolute_time() - t0));
printf(" tmpDir: %s\n", [td UTF8String]);
// Build request and IOSurfaces
// One fp16 input surface (IC*SP values) and one fp16 output surface (OC*SP
// values), each wrapped in an ANE IOSurface object and bound to index 0 of the
// request. No weightsBuffer or perfStats are attached; procedureIndex is 0.
int inBytes = IC * SP * 2;
int outBytes = OC * SP * 2;
IOSurfaceRef ioIn = make_surface(inBytes);
IOSurfaceRef ioOut = make_surface(outBytes);
// Each stage is only attempted when the previous one produced an object: a nil
// wrapper inside an array literal would throw, and a NULL surface would be
// dereferenced when the input is written below.
id wI = ioIn ? ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn) : nil;
id wO = ioOut ? ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut) : nil;
id req = (wI && wO)
    ? ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
          @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
          @[wI], @[@0], @[wO], @[@0], nil, nil, @0)
    : nil;
if (!req) {
    const char *stage = (!ioIn || !ioOut) ? "IOSurface allocation"
                      : (!wI || !wO)     ? "IOSurface wrapper creation"
                                         : "request construction";
    printf("FAIL: %s\n", stage);
    // The model is loaded and the temp directory exists at this point; undo both.
    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
    [fm removeItemAtPath:td error:nil];
    if (ioIn) CFRelease(ioIn);
    if (ioOut) CFRelease(ioOut);
    return 1;
}
// Write input: [1, 2, 3, 4] repeated across channels
IOSurfaceLock(ioIn, 0, NULL);
_Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn);
for (int c = 0; c < IC; c++)
    for (int s = 0; s < SP; s++)
        inp[c * SP + s] = (_Float16)(s + 1.0f);
IOSurfaceUnlock(ioIn, 0, NULL);
// Step 2: evaluate once with weights A, print the result channel by channel and
// keep a private copy so it can be compared after the weight swap.
printf("\n=== Step 2: Eval with weights A ===\n");
ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
if (!ok) {
    printf("FAIL: eval: %s\n", e ? [[e description] UTF8String] : "?");
    return 1;
}
IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL);
_Float16 *outA = (_Float16*)IOSurfaceGetBaseAddress(ioOut);
printf(" Output A (identity @ [1,2,3,4]):");
// Flat walk over the OC*SP outputs; a bracket opens at the first spatial
// position of every channel and closes at the last one.
for (int i = 0; i < OC * SP; i++) {
    int s = i % SP;
    if (s == 0) printf(" [");
    printf("%.1f%s", (float)outA[i], (s < SP - 1) ? "," : "");
    if (s == SP - 1) printf("]");
}
printf("\n");
// Snapshot while the surface is still locked; the surface is reused for run B.
_Float16 outA_copy[64];
memcpy(outA_copy, outA, outBytes);
IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL);
// === Step 3: Overwrite weight file with B, unload+load ===
// Core experiment: replace weights/weight.bin in the model's temp directory
// with weight set B, then unload and load the model WITHOUT compiling again.
// If a plain load fails, a compile+load fallback is attempted and remembered,
// so the final verdict never credits hot-reload for what a re-compile did.
printf("\n=== Step 3: Overwrite weight.bin with B (2x identity), unload+load ===\n");
// Teardown shared by the early-exit paths below. modelLoaded tells whether an
// unload is still owed (it is not after a failed reload).
void (^abortCleanup)(BOOL) = ^(BOOL modelLoaded) {
    if (modelLoaded) {
        NSError *ignored = nil;
        ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &ignored);
    }
    [fm removeItemAtPath:td error:nil];
    CFRelease(ioIn); CFRelease(ioOut);
};
// Blob for B: same 128-byte header layout as blob A, payload = weightsB.
uint8_t *blobB = (uint8_t*)calloc(tot, 1);
blobB[0]=1; blobB[4]=2; blobB[64]=0xEF; blobB[65]=0xBE; blobB[66]=0xAD; blobB[67]=0xDE; blobB[68]=1;
*(uint32_t*)(blobB+72) = ws;
*(uint32_t*)(blobB+80) = 128;
memcpy(blobB + 128, weightsB, ws);
NSData *wdataB = [NSData dataWithBytesNoCopy:blobB length:tot freeWhenDone:YES];
NSString *weightPath = [td stringByAppendingPathComponent:@"weights/weight.bin"];
// If the overwrite itself fails the experiment says nothing about the ANE, so stop here.
if (![wdataB writeToFile:weightPath atomically:YES]) {
    printf("FAIL: could not overwrite %s\n", [weightPath UTF8String]);
    abortCleanup(YES);
    return 1;
}
printf(" Wrote new weight.bin (%d bytes)\n", tot);
// Unload (best effort: a failure is reported and the load is still attempted)
t0 = mach_absolute_time();
e = nil;
ok = ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
printf(" Unload: %s (%.2fms)\n", ok ? "OK" : "FAIL", tb_ms(mach_absolute_time() - t0));
// Reload (no compile!)
t0 = mach_absolute_time();
e = nil;
ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
printf(" Load: %s (%.2fms)\n", ok ? "OK" : (e ? [[e description] UTF8String] : "FAIL"), tb_ms(mach_absolute_time() - t0));
BOOL neededRecompile = NO;
if (!ok) {
    neededRecompile = YES;
    printf("\n*** Load-after-overwrite FAILED — trying compile+load ***\n");
    t0 = mach_absolute_time();
    e = nil;
    ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
    printf(" Re-compile: %s (%.2fms)\n", ok ? "OK" : "FAIL", tb_ms(mach_absolute_time() - t0));
    // The load is attempted even after a failed compile: whether it works is itself a data point.
    t0 = mach_absolute_time();
    e = nil;
    ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
    printf(" Re-load: %s (%.2fms)\n", ok ? "OK" : "FAIL", tb_ms(mach_absolute_time() - t0));
    if (!ok) {
        // Nothing is loaded any more; evaluating would only produce a secondary error.
        printf("FAIL: model could not be reloaded: %s\n", e ? [[e description] UTF8String] : "?");
        abortCleanup(NO);
        return 1;
    }
}
// Need new request with new IOSurface objects (re-use same surfaces)
wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn);
wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut);
req = (wI && wO)
    ? ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
          @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
          @[wI], @[@0], @[wO], @[@0], nil, nil, @0)
    : nil;
if (!req) {
    printf("FAIL: could not rebuild request after reload\n");
    abortCleanup(YES);
    return 1;
}
// Re-write same input
IOSurfaceLock(ioIn, 0, NULL);
inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn);
for (int c = 0; c < IC; c++)
    for (int s = 0; s < SP; s++)
        inp[c * SP + s] = (_Float16)(s + 1.0f);
IOSurfaceUnlock(ioIn, 0, NULL);
// Eval with (possibly reloaded) weights B
printf("\n=== Step 4: Eval after reload ===\n");
e = nil;
ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
if (!ok) {
    printf("FAIL: eval after reload: %s\n", e ? [[e description] UTF8String] : "?");
    abortCleanup(YES);
    return 1;
}
IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL);
_Float16 *outB = (_Float16*)IOSurfaceGetBaseAddress(ioOut);
printf(" Output B (2x identity @ [1,2,3,4]):");
for (int c = 0; c < OC; c++) {
    printf(" [");
    for (int s = 0; s < SP; s++) printf("%.1f%s", (float)outB[c*SP+s], s<SP-1?",":"");
    printf("]");
}
printf("\n");
// One pass over the outputs answers both questions:
//   changed - does any element differ from run A by more than 0.01?
//   correct - is every element within 0.1 of 2x the run-A value?
bool changed = false;
bool correct = true;
for (int i = 0; i < OC * SP; i++) {
    float before = (float)outA_copy[i];
    float after = (float)outB[i];
    if (fabsf(after - before) > 0.01f) changed = true;
    if (fabsf(after - before * 2.0f) > 0.1f) correct = false;
}
IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL);
printf("\n=== RESULT ===\n");
if (neededRecompile) {
    printf("NOTE: plain unload+load failed; the output above was produced after a re-compile.\n");
}
if (changed && correct && !neededRecompile) {
    printf("SUCCESS: Weight reload works! Output changed to match new weights.\n");
    printf(">>> Compilation bottleneck can be eliminated <<<\n");
} else if (changed && correct) {
    // New weights are live, but only because the model was compiled again.
    printf("FAIL: New weights took effect only after re-compile. Unload+load alone does not work.\n");
    printf(">>> Need alternative approach (weightsBuffer IOSurface or async recompile) <<<\n");
} else if (changed && !correct) {
    printf("PARTIAL: Output changed but doesn't match expected 2x. Weights may be partially updated.\n");
    printf(" Expected 2x of A, got different values.\n");
} else {
    printf("FAIL: Output did NOT change. Weight reload does not work.\n");
    printf(" Output is still the same as weights A. ANE cached the compiled model.\n");
    printf(">>> Need alternative approach (weightsBuffer IOSurface or async recompile) <<<\n");
}
// Teardown: unload the model (result ignored), delete the staged temp
// directory, and drop the references held on the two IOSurfaces.
{
    BOOL (*unloadModel)(id, SEL, unsigned int, NSError **) =
        (BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend;
    unloadModel(mdl, @selector(unloadWithQoS:error:), 21, &e);
}
[fm removeItemAtPath:td error:nil];
CFRelease(ioIn);
CFRelease(ioOut);
}
return 0;
}

View File

@ -581,6 +581,16 @@ int main(int argc, char *argv[]) {
steps_batch++;
if (step % 10 == 0 || step == start_step)
printf("step %-4d loss=%.4f\n", step, loss);
// JSON telemetry to stderr: one {"type":"step"} line per training step.
// Each timing field is the corresponding timer divided by steps_batch, i.e. an
// average over the steps taken so far in the current batch, not the duration
// of this single step.
// NOTE(review): presumably the t_* timers accumulate milliseconds across the
// batch (the console summary prints the same ratios as "ms/step") — confirm.
fprintf(stderr,
        "{\"type\":\"step\",\"step\":%d,\"loss\":%.6f,"
        "\"t_ane\":%.3f,\"t_io\":%.3f,\"t_cls\":%.3f,"
        "\"t_elem\":%.3f,\"t_rms\":%.3f,\"t_cblas_wait\":%.3f,"
        "\"compiles\":%d}\n",
        step, loss,
        (double)(t_ane/steps_batch),
        (double)(t_io/steps_batch),
        (double)(t_cls/steps_batch),
        (double)(t_elem/steps_batch),
        (double)(t_rms/steps_batch),
        (double)(t_cblas_wait/steps_batch),
        g_compile_count);
}
double tms = tb_ms(mach_absolute_time() - tt);
total_train_ms += tms;
@ -622,6 +632,19 @@ int main(int argc, char *argv[]) {
printf(" ane=%.1f io=%.1f cls=%.1f elem=%.1f rms=%.1f cblas_wait=%.1f ms/step\n",
t_ane/steps_batch, t_io/steps_batch, t_cls/steps_batch, t_elem/steps_batch,
t_rms/steps_batch, t_cblas_wait/steps_batch);
// JSON batch telemetry to stderr: one {"type":"batch"} line with wall-clock
// timings and one {"type":"perf"} line with the estimated throughput.
{
    // Reference peak used for the utilisation percentage.
    // NOTE(review): 15.8 is presumably the assumed fp16 TFLOPS peak of the
    // target ANE — confirm it matches the chip being profiled.
    const double kAssumedPeakTflops = 15.8;
    // Per-step FLOP estimates built from the model dimensions.
    // NOTE(review): bf looks like the projection/FFN matmul term and bs the
    // attention term, with bf*2 presumably covering forward + backward —
    // confirm against the efficiency report.
    double bf = NLAYERS * (4.0*2*DIM*DIM*SEQ + 2.0*2*DIM*HIDDEN*SEQ + 2.0*HIDDEN*DIM*SEQ);
    double bs = NLAYERS * 2.0*HEADS*5*SEQ*SEQ*HD;
    double ane_f_batch = (bf*2 + bs) * steps_batch;
    // tms is in milliseconds: FLOP / (ms * 1e9) == TFLOP/s.
    // Guard both divisions: printf renders inf/nan as bare tokens, which are
    // not valid JSON and would break line-oriented consumers.
    double ane_tflops = (tms > 0.0) ? ane_f_batch / (tms * 1e9) : 0.0;
    double ms_per_step = (steps_batch > 0) ? tms/steps_batch : 0.0;
    // NOTE(review): the "batch" field carries steps_batch (a step count), not a
    // batch index; the key is kept as-is for consumers already parsing it.
    fprintf(stderr, "{\"type\":\"batch\",\"batch\":%d,\"compile_ms\":%.1f,"
                    "\"train_ms\":%.1f,\"ms_per_step\":%.1f}\n",
            steps_batch, cms, tms, ms_per_step);
    fprintf(stderr, "{\"type\":\"perf\",\"ane_tflops\":%.3f,\"ane_util_pct\":%.2f}\n",
            ane_tflops, 100.0*ane_tflops/kAssumedPeakTflops);
}
}
// Efficiency report