From 40d3f4563160302f74f1f569406b3d80700a8fb4 Mon Sep 17 00:00:00 2001 From: m0at Date: Sun, 1 Mar 2026 22:54:58 -0800 Subject: [PATCH] Add ANE probe tests and training telemetry for M5 optimization Four standalone probe tests to characterize the M5 ANE: - test_weight_reload: Can weights be hot-swapped via unload+load without recompilation? - test_perf_stats: Enumerate _ANEPerformanceStats methods/properties and hardware counters - test_qos_sweep: Measure compile/load/eval latency across QoS 0-63 - test_ane_advanced: Probe SharedEvents, weightsBuffer IOSurface, procedureIndex, VirtualClient Training telemetry (train_large.m): - JSON lines to stderr with per-step timing breakdown and per-batch TFLOPS metrics - Enables external monitoring tools to visualize ANE utilization in real-time Co-Authored-By: Claude Opus 4.6 --- training/Makefile | 20 ++- training/test_ane_advanced.m | 240 ++++++++++++++++++++++++++++++++ training/test_perf_stats.m | 248 ++++++++++++++++++++++++++++++++++ training/test_qos_sweep.m | 154 +++++++++++++++++++++ training/test_weight_reload.m | 241 +++++++++++++++++++++++++++++++++ training/train_large.m | 23 ++++ 6 files changed, 924 insertions(+), 2 deletions(-) create mode 100644 training/test_ane_advanced.m create mode 100644 training/test_perf_stats.m create mode 100644 training/test_qos_sweep.m create mode 100644 training/test_weight_reload.m diff --git a/training/Makefile b/training/Makefile index 226bb39..9cc9e34 100644 --- a/training/Makefile +++ b/training/Makefile @@ -11,10 +11,26 @@ train: train.m ane_runtime.h ane_mil_gen.h model.h forward.h backward.h train_large: train_large.m $(HEADERS_LARGE) $(CC) $(CFLAGS) -o $@ train_large.m $(LDFLAGS) -framework Accelerate +PROBES = test_weight_reload test_perf_stats test_qos_sweep test_ane_advanced + +test_weight_reload: test_weight_reload.m + $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) + +test_perf_stats: test_perf_stats.m + $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) + +test_qos_sweep: test_qos_sweep.m + $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) + +test_ane_advanced: test_ane_advanced.m + $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) + +probes: $(PROBES) + tokenize: python3 tokenize.py clean: - rm -f train train_large + rm -f train train_large $(PROBES) -.PHONY: clean tokenize +.PHONY: clean tokenize probes diff --git a/training/test_ane_advanced.m b/training/test_ane_advanced.m new file mode 100644 index 0000000..4a92a04 --- /dev/null +++ b/training/test_ane_advanced.m @@ -0,0 +1,240 @@ +// test_ane_advanced.m — Probe advanced ANE interfaces +// SharedEvents, weightsBuffer, procedureIndex, VirtualClient, ChainingRequest +#import +#import +#import +#import +#import +#import + +static mach_timebase_info_data_t g_tb; +static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } + +static void dump_class(const char *name) { + Class cls = NSClassFromString([NSString stringWithUTF8String:name]); + if (!cls) { printf(" %s: NOT FOUND\n", name); return; } + printf("\n=== %s ===\n", name); + unsigned int count; + Method *methods = class_copyMethodList(object_getClass(cls), &count); + if (count) printf(" Class methods:\n"); + for (unsigned int i = 0; i < count; i++) { + SEL s = method_getName(methods[i]); + const char *enc = method_getTypeEncoding(methods[i]); + printf(" + %s [%s]\n", sel_getName(s), enc ? enc : "?"); + } + free(methods); + methods = class_copyMethodList(cls, &count); + if (count) printf(" Instance methods:\n"); + for (unsigned int i = 0; i < count; i++) { + SEL s = method_getName(methods[i]); + const char *enc = method_getTypeEncoding(methods[i]); + printf(" - %s [%s]\n", sel_getName(s), enc ? enc : "?"); + } + free(methods); + unsigned int pcount; + objc_property_t *props = class_copyPropertyList(cls, &pcount); + if (pcount) printf(" Properties:\n"); + for (unsigned int i = 0; i < pcount; i++) { + const char *pname = property_getName(props[i]); + const char *pattr = property_getAttributes(props[i]); + printf(" @property %s [%s]\n", pname, pattr ? pattr : "?"); + } + free(props); +} + +static IOSurfaceRef make_surface(size_t bytes) { + return IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); +} + +int main() { + @autoreleasepool { + setbuf(stdout, NULL); + mach_timebase_info(&g_tb); + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + + printf("=== ANE Advanced Interface Probe ===\n"); + + // === Part 1: Event/Sync classes === + printf("\n--- Part 1: Event/Sync Classes ---\n"); + dump_class("_ANESharedEvents"); + dump_class("_ANESharedSignalEvent"); + dump_class("_ANESharedWaitEvent"); + dump_class("_ANEEvent"); + dump_class("_ANEFenceEvent"); + + // Try instantiate + const char *event_classes[] = { + "_ANESharedEvents", "_ANESharedSignalEvent", "_ANESharedWaitEvent", + "_ANEEvent", "_ANEFenceEvent", NULL + }; + for (int i = 0; event_classes[i]; i++) { + Class cls = NSClassFromString([NSString stringWithUTF8String:event_classes[i]]); + if (!cls) continue; + @try { + id obj = [[cls alloc] init]; + printf(" %s alloc/init: %s\n", event_classes[i], + obj ? [[obj description] UTF8String] : "nil"); + } @catch (NSException *ex) { + printf(" %s alloc/init: EXCEPTION: %s\n", event_classes[i], [[ex reason] UTF8String]); + } + } + + // === Part 2: VirtualClient and ChainingRequest === + printf("\n--- Part 2: VirtualClient / ChainingRequest ---\n"); + dump_class("_ANEVirtualClient"); + dump_class("_ANEChainingRequest"); + dump_class("_ANEMultiRequest"); + dump_class("_ANEBatchRequest"); + + // === Part 3: weightsBuffer parameter test === + printf("\n--- Part 3: weightsBuffer IOSurface test ---\n"); + Class g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + Class g_I = NSClassFromString(@"_ANEInMemoryModel"); + Class g_AR = NSClassFromString(@"_ANERequest"); + Class g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); + + int IC = 4, OC = 4, SP = 4; + _Float16 weights[16]; + for (int i = 0; i < 16; i++) weights[i] = (i/4 == i%4) ? (_Float16)1.0f : (_Float16)0.0f; + + int ws = 16*2, tot = 128+ws; + uint8_t *blob = (uint8_t*)calloc(tot,1); + blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1; + *(uint32_t*)(blob+72)=ws; *(uint32_t*)(blob+80)=128; + memcpy(blob+128, weights, ws); + NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES]; + + NSString *mil = [NSString stringWithFormat: + @"program(1.3)\n" + "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " + "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"9.0\"}})]\n" + "{\n" + " func main(tensor x) {\n" + " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" + " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" + " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" + " tensor W = const()[name=string(\"W\"), " + "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" + " tensor y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)" + "[name=string(\"conv\")];\n" + " } -> (y);\n" + "}\n", IC, SP, OC, IC, OC, IC, OC, SP]; + + NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), + md, @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata}}, nil); + id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); + id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); + NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; + NSFileManager *fm = [NSFileManager defaultManager]; + [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] + withIntermediateDirectories:YES attributes:nil error:nil]; + [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + [wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; + + NSError *e = nil; + ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); + ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); + + IOSurfaceRef ioIn = make_surface(IC*SP*2); + IOSurfaceRef ioOut = make_surface(OC*SP*2); + + // Write input + IOSurfaceLock(ioIn, 0, NULL); + _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn); + for (int c = 0; c < IC; c++) for (int s = 0; s < SP; s++) inp[c*SP+s] = (_Float16)(s+1.0f); + IOSurfaceUnlock(ioIn, 0, NULL); + + // Normal eval first (baseline) + id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); + id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut); + id req0 = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wI], @[@0], @[wO], @[@0], nil, nil, @0); + BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req0, &e); + printf(" Baseline eval (weightsBuffer=nil, procIdx=0): %s\n", ok ? "OK" : "FAIL"); + + IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL); + _Float16 *out0 = (_Float16*)IOSurfaceGetBaseAddress(ioOut); + printf(" Output: [%.1f, %.1f, %.1f, %.1f, ...]\n", + (float)out0[0], (float)out0[1], (float)out0[2], (float)out0[3]); + IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL); + + // Test weightsBuffer: create IOSurface with weight data + printf("\n Testing weightsBuffer IOSurface...\n"); + _Float16 weights2[16]; + for (int i = 0; i < 16; i++) weights2[i] = (i/4 == i%4) ? (_Float16)3.0f : (_Float16)0.0f; + + IOSurfaceRef ioW = make_surface(ws); + IOSurfaceLock(ioW, 0, NULL); + memcpy(IOSurfaceGetBaseAddress(ioW), weights2, ws); + IOSurfaceUnlock(ioW, 0, NULL); + id wW = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioW); + + // Try with weightsBuffer + wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); + wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut); + id req_wb = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wI], @[@0], @[wO], @[@0], wW, nil, @0); + printf(" Request with weightsBuffer: %s\n", req_wb ? "created" : "nil"); + + if (req_wb) { + ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req_wb, &e); + printf(" Eval with weightsBuffer: %s\n", ok ? "OK" : [[e description] UTF8String]); + if (ok) { + IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL); + _Float16 *outW = (_Float16*)IOSurfaceGetBaseAddress(ioOut); + printf(" Output (3x identity via weightsBuffer): [%.1f, %.1f, %.1f, %.1f, ...]\n", + (float)outW[0], (float)outW[1], (float)outW[2], (float)outW[3]); + bool is_3x = fabsf((float)outW[0] - 3.0f) < 0.1f; + printf(" weightsBuffer override %s\n", is_3x ? "WORKS!" : "does NOT work (output unchanged)"); + IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL); + } + } + CFRelease(ioW); + + // === Part 4: procedureIndex sweep === + printf("\n--- Part 4: procedureIndex sweep (0-15) ---\n"); + for (int pi = 0; pi < 16; pi++) { + wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); + wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut); + id req_p = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wI], @[@0], @[wO], @[@0], nil, nil, @(pi)); + if (!req_p) { printf(" procIdx %2d: request=nil\n", pi); continue; } + ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req_p, &e); + printf(" procIdx %2d: %s%s\n", pi, ok ? "OK" : "FAIL", + !ok && e ? [NSString stringWithFormat:@" (%@)", [e localizedDescription]].UTF8String : ""); + } + + // === Part 5: Scan all ANE classes === + printf("\n--- Part 5: All ANE-prefixed classes ---\n"); + unsigned int classCount; + Class *allClasses = objc_copyClassList(&classCount); + for (unsigned int i = 0; i < classCount; i++) { + const char *name = class_getName(allClasses[i]); + if (strstr(name, "ANE") || strstr(name, "ane")) { + printf(" %s\n", name); + } + } + free(allClasses); + + // Cleanup + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); + [fm removeItemAtPath:td error:nil]; + CFRelease(ioIn); CFRelease(ioOut); + + printf("\nDone.\n"); + } + return 0; +} diff --git a/training/test_perf_stats.m b/training/test_perf_stats.m new file mode 100644 index 0000000..e1d94a6 --- /dev/null +++ b/training/test_perf_stats.m @@ -0,0 +1,248 @@ +// test_perf_stats.m — What does _ANEPerformanceStats expose? +// Probe class methods, properties, instantiate, pass to request, read back. +#import +#import +#import +#import +#import +#import + +static mach_timebase_info_data_t g_tb; +static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } + +static void dump_class(const char *name) { + Class cls = NSClassFromString([NSString stringWithUTF8String:name]); + if (!cls) { printf(" %s: NOT FOUND\n", name); return; } + printf("\n=== %s ===\n", name); + + // Class methods + unsigned int count; + Method *methods = class_copyMethodList(object_getClass(cls), &count); + if (count) printf(" Class methods:\n"); + for (unsigned int i = 0; i < count; i++) { + SEL s = method_getName(methods[i]); + const char *enc = method_getTypeEncoding(methods[i]); + printf(" + %s [%s]\n", sel_getName(s), enc ? enc : "?"); + } + free(methods); + + // Instance methods + methods = class_copyMethodList(cls, &count); + if (count) printf(" Instance methods:\n"); + for (unsigned int i = 0; i < count; i++) { + SEL s = method_getName(methods[i]); + const char *enc = method_getTypeEncoding(methods[i]); + printf(" - %s [%s]\n", sel_getName(s), enc ? enc : "?"); + } + free(methods); + + // Properties + unsigned int pcount; + objc_property_t *props = class_copyPropertyList(cls, &pcount); + if (pcount) printf(" Properties:\n"); + for (unsigned int i = 0; i < pcount; i++) { + const char *pname = property_getName(props[i]); + const char *pattr = property_getAttributes(props[i]); + printf(" @property %s [%s]\n", pname, pattr ? pattr : "?"); + } + free(props); + + // Protocols + unsigned int prcount; + Protocol * __unsafe_unretained *protos = class_copyProtocolList(cls, &prcount); + if (prcount) { + printf(" Protocols:"); + for (unsigned int i = 0; i < prcount; i++) + printf(" %s", protocol_getName(protos[i])); + printf("\n"); + } + free(protos); +} + +static IOSurfaceRef make_surface(size_t bytes) { + return IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); +} + +int main() { + @autoreleasepool { + setbuf(stdout, NULL); + mach_timebase_info(&g_tb); + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + + printf("=== ANE Performance Stats Probe ===\n"); + + // Dump all ANE-related classes + dump_class("_ANEPerformanceStats"); + dump_class("_ANEPerfRequest"); + dump_class("ANEPerfRequest"); + dump_class("_ANEPerformanceCounters"); + dump_class("_ANEDeviceInfo"); + dump_class("_ANEModel"); + dump_class("_ANEInMemoryModel"); + dump_class("_ANERequest"); + dump_class("_ANEIOSurfaceObject"); + dump_class("_ANEInMemoryModelDescriptor"); + dump_class("_ANEClient"); + dump_class("_ANEVirtualClient"); + + // Try to instantiate _ANEPerformanceStats + printf("\n=== Instantiation Tests ===\n"); + Class perfClass = NSClassFromString(@"_ANEPerformanceStats"); + if (perfClass) { + // Try alloc/init + @try { + id perfStats = [[perfClass alloc] init]; + printf("_ANEPerformanceStats alloc/init: %s\n", + perfStats ? [[perfStats description] UTF8String] : "nil"); + + // Try to read all properties via KVC + if (perfStats) { + unsigned int pcount; + objc_property_t *props = class_copyPropertyList(perfClass, &pcount); + for (unsigned int i = 0; i < pcount; i++) { + const char *pname = property_getName(props[i]); + @try { + id val = [perfStats valueForKey:[NSString stringWithUTF8String:pname]]; + printf(" %s = %s\n", pname, val ? [[val description] UTF8String] : "nil"); + } @catch (NSException *ex) { + printf(" %s = \n", pname, [[ex reason] UTF8String]); + } + } + free(props); + } + } @catch (NSException *ex) { + printf("Exception: %s\n", [[ex reason] UTF8String]); + } + } + + // === Compile a simple kernel and try passing perfStats to request === + printf("\n=== Compile kernel and test perfStats in request ===\n"); + Class g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + Class g_I = NSClassFromString(@"_ANEInMemoryModel"); + Class g_AR = NSClassFromString(@"_ANERequest"); + Class g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); + + int IC = 4, OC = 4, SP = 4; + _Float16 weights[16]; + for (int i = 0; i < 16; i++) weights[i] = (i/4 == i%4) ? (_Float16)1.0f : (_Float16)0.0f; + + int ws = 16*2, tot = 128+ws; + uint8_t *blob = (uint8_t*)calloc(tot,1); + blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1; + *(uint32_t*)(blob+72)=ws; *(uint32_t*)(blob+80)=128; + memcpy(blob+128, weights, ws); + NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES]; + + NSString *mil = [NSString stringWithFormat: + @"program(1.3)\n" + "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " + "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"9.0\"}})]\n" + "{\n" + " func main(tensor x) {\n" + " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" + " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" + " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" + " tensor W = const()[name=string(\"W\"), " + "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" + " tensor y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)" + "[name=string(\"conv\")];\n" + " } -> (y);\n" + "}\n", IC, SP, OC, IC, OC, IC, OC, SP]; + + NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), + md, @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata}}, nil); + id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); + id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); + NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; + [[NSFileManager defaultManager] createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] + withIntermediateDirectories:YES attributes:nil error:nil]; + [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + [wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; + + NSError *e = nil; + ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); + ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); + + IOSurfaceRef ioIn = make_surface(IC*SP*2); + IOSurfaceRef ioOut = make_surface(OC*SP*2); + id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); + id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut); + + // Try creating request WITH perfStats + if (perfClass) { + id perfStats = [[perfClass alloc] init]; + printf(" Creating request with perfStats=%s\n", perfStats ? "non-nil" : "nil"); + + id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wI], @[@0], @[wO], @[@0], nil, perfStats, @0); + printf(" Request: %s\n", req ? "created" : "nil"); + + if (req) { + // Write input + IOSurfaceLock(ioIn, 0, NULL); + _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn); + for (int i = 0; i < IC*SP; i++) inp[i] = (_Float16)1.0f; + IOSurfaceUnlock(ioIn, 0, NULL); + + // Eval + BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); + printf(" Eval: %s\n", ok ? "OK" : [[e description] UTF8String]); + + // Read perfStats after eval + if (ok && perfStats) { + printf(" PerfStats after eval:\n"); + unsigned int pcount; + objc_property_t *props = class_copyPropertyList(perfClass, &pcount); + for (unsigned int i = 0; i < pcount; i++) { + const char *pname = property_getName(props[i]); + @try { + id val = [perfStats valueForKey:[NSString stringWithUTF8String:pname]]; + printf(" %s = %s\n", pname, val ? [[val description] UTF8String] : "nil"); + } @catch (NSException *ex) { + printf(" %s = \n", pname); + } + } + free(props); + + // Run 100 evals and check if counters accumulate + printf("\n Running 100 evals...\n"); + for (int i = 0; i < 100; i++) { + ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); + } + printf(" PerfStats after 100 evals:\n"); + props = class_copyPropertyList(perfClass, &pcount); + for (unsigned int i = 0; i < pcount; i++) { + const char *pname = property_getName(props[i]); + @try { + id val = [perfStats valueForKey:[NSString stringWithUTF8String:pname]]; + printf(" %s = %s\n", pname, val ? [[val description] UTF8String] : "nil"); + } @catch (NSException *ex) { + printf(" %s = \n", pname); + } + } + free(props); + } + } + } + + // Also probe IORegistry for ANE perf data + printf("\n=== IORegistry ANE info ===\n"); + printf(" (run: ioreg -r -c H11ANEIn | head -100)\n"); + + // Cleanup + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); + [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; + CFRelease(ioIn); CFRelease(ioOut); + } + return 0; +} diff --git a/training/test_qos_sweep.m b/training/test_qos_sweep.m new file mode 100644 index 0000000..c009de5 --- /dev/null +++ b/training/test_qos_sweep.m @@ -0,0 +1,154 @@ +// test_qos_sweep.m — Does QoS affect frequency/latency? +// Sweep QoS 0-63 on compile, load, eval of a simple kernel. +#import +#import +#import +#import +#import +#import + +static mach_timebase_info_data_t g_tb; +static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } + +static IOSurfaceRef make_surface(size_t bytes) { + return IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); +} + +int main() { + @autoreleasepool { + setbuf(stdout, NULL); + mach_timebase_info(&g_tb); + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + + Class g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + Class g_I = NSClassFromString(@"_ANEInMemoryModel"); + Class g_AR = NSClassFromString(@"_ANERequest"); + Class g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); + + // Larger kernel for measurable latency: 256x256 conv, spatial=64 + int IC = 256, OC = 256, SP = 64; + int ws = IC*OC*2, tot = 128+ws; + uint8_t *blob = (uint8_t*)calloc(tot, 1); + blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1; + *(uint32_t*)(blob+72)=ws; *(uint32_t*)(blob+80)=128; + // Random weights + _Float16 *wp = (_Float16*)(blob+128); + for (int i = 0; i < IC*OC; i++) wp[i] = (_Float16)(0.01f * (i % 100 - 50)); + NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES]; + + NSString *mil = [NSString stringWithFormat: + @"program(1.3)\n" + "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " + "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"9.0\"}})]\n" + "{\n" + " func main(tensor x) {\n" + " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" + " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" + " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" + " tensor W = const()[name=string(\"W\"), " + "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" + " tensor y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)" + "[name=string(\"conv\")];\n" + " } -> (y);\n" + "}\n", IC, SP, OC, IC, OC, IC, OC, SP]; + + NSDictionary *weights = @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata}}; + NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding]; + NSFileManager *fm = [NSFileManager defaultManager]; + + printf("=== QoS Sweep: compile/load/eval with QoS 0-63 ===\n"); + printf("Kernel: %dx%d conv, spatial=%d (%.1f MFLOPS)\n", IC, OC, SP, 2.0*IC*OC*SP/1e6); + printf("%4s %10s %10s %10s %10s %s\n", "QoS", "Compile", "Load", "Eval(1)", "Eval(avg10)", "Status"); + + unsigned int qos_values[] = {0, 1, 5, 10, 15, 17, 19, 21, 25, 31, 33, 40, 47, 50, 55, 60, 63}; + int n_qos = sizeof(qos_values)/sizeof(qos_values[0]); + + for (int qi = 0; qi < n_qos; qi++) { + unsigned int qos = qos_values[qi]; + NSError *e = nil; + + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), + milData, weights, nil); + id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); + id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); + NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent: + [NSString stringWithFormat:@"qos_test_%u_%@", qos, hx]]; + [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] + withIntermediateDirectories:YES attributes:nil error:nil]; + [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + [wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; + + // Compile + uint64_t t0 = mach_absolute_time(); + BOOL cok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( + mdl, @selector(compileWithQoS:options:error:), qos, @{}, &e); + double cms = tb_ms(mach_absolute_time() - t0); + + if (!cok) { + printf("%4u %10s %10s %10s %10s COMPILE_FAIL\n", qos, "-", "-", "-", "-"); + [fm removeItemAtPath:td error:nil]; + continue; + } + + // Load + t0 = mach_absolute_time(); + BOOL lok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( + mdl, @selector(loadWithQoS:options:error:), qos, @{}, &e); + double lms = tb_ms(mach_absolute_time() - t0); + + if (!lok) { + printf("%4u %8.1fms %10s %10s %10s LOAD_FAIL\n", qos, cms, "-", "-", "-"); + [fm removeItemAtPath:td error:nil]; + continue; + } + + // Build request + IOSurfaceRef ioIn = make_surface(IC*SP*2); + IOSurfaceRef ioOut = make_surface(OC*SP*2); + id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); + id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut); + id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wI], @[@0], @[wO], @[@0], nil, nil, @0); + + // Write input + IOSurfaceLock(ioIn, 0, NULL); + _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn); + for (int i = 0; i < IC*SP; i++) inp[i] = (_Float16)0.5f; + IOSurfaceUnlock(ioIn, 0, NULL); + + // Eval with same QoS + t0 = mach_absolute_time(); + BOOL eok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + mdl, @selector(evaluateWithQoS:options:request:error:), qos, @{}, req, &e); + double ems1 = tb_ms(mach_absolute_time() - t0); + + if (!eok) { + printf("%4u %8.1fms %8.1fms %10s %10s EVAL_FAIL\n", qos, cms, lms, "-", "-"); + } else { + // Average over 10 evals + t0 = mach_absolute_time(); + for (int i = 0; i < 10; i++) { + ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + mdl, @selector(evaluateWithQoS:options:request:error:), qos, @{}, req, &e); + } + double ems_avg = tb_ms(mach_absolute_time() - t0) / 10.0; + printf("%4u %8.1fms %8.1fms %8.2fms %8.2fms OK\n", qos, cms, lms, ems1, ems_avg); + } + + // Cleanup + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); + CFRelease(ioIn); CFRelease(ioOut); + [fm removeItemAtPath:td error:nil]; + } + + printf("\nDone.\n"); + } + return 0; +} diff --git a/training/test_weight_reload.m b/training/test_weight_reload.m new file mode 100644 index 0000000..984d77a --- /dev/null +++ b/training/test_weight_reload.m @@ -0,0 +1,241 @@ +// test_weight_reload.m — Can we skip recompilation by rewriting weight blobs on disk? +// Compile a conv kernel with weights A, eval, verify output. +// Overwrite weights/weight.bin in tmpDir with weights B. +// unloadWithQoS: then loadWithQoS: (no recompile). +// Eval again — if output matches B @ x, compilation bottleneck is eliminated. +#import +#import +#import +#import +#import +#import +#include + +static mach_timebase_info_data_t g_tb; +static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } + +static IOSurfaceRef make_surface(size_t bytes) { + return IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); +} + +int main() { + @autoreleasepool { + setbuf(stdout, NULL); + mach_timebase_info(&g_tb); + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + + Class g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + Class g_I = NSClassFromString(@"_ANEInMemoryModel"); + Class g_AR = NSClassFromString(@"_ANERequest"); + Class g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); + + if (!g_D || !g_I || !g_AR || !g_AIO) { + printf("FAIL: ANE classes not found\n"); + return 1; + } + + // Small test: 4x4 conv kernel, spatial=4 + int IC = 4, OC = 4, SP = 4; + + // Weight set A: identity matrix + _Float16 weightsA[16]; + for (int i = 0; i < IC*OC; i++) weightsA[i] = (i / OC == i % OC) ? (_Float16)1.0f : (_Float16)0.0f; + + // Weight set B: 2x identity + _Float16 weightsB[16]; + for (int i = 0; i < IC*OC; i++) weightsB[i] = (i / OC == i % OC) ? (_Float16)2.0f : (_Float16)0.0f; + + // Build weight blob for A + int ws = IC * OC * 2; + int tot = 128 + ws; + uint8_t *blob = (uint8_t*)calloc(tot, 1); + blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1; + *(uint32_t*)(blob+72) = ws; + *(uint32_t*)(blob+80) = 128; + memcpy(blob + 128, weightsA, ws); + NSData *wdataA = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES]; + + // MIL for a simple conv + NSString *mil = [NSString stringWithFormat: + @"program(1.3)\n" + "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " + "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"9.0\"}})]\n" + "{\n" + " func main(tensor x) {\n" + " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" + " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" + " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" + " tensor W = const()[name=string(\"W\"), " + "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" + " tensor y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)" + "[name=string(\"conv\")];\n" + " } -> (y);\n" + "}\n", IC, SP, OC, IC, OC, IC, OC, SP]; + + NSDictionary *weights = @{ + @"@model_path/weights/weight.bin": @{@"offset": @0, @"data": wdataA} + }; + + NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding]; + + // === Compile with weights A === + printf("=== Step 1: Compile with weights A (identity) ===\n"); + uint64_t t0 = mach_absolute_time(); + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), milData, weights, nil); + if (!desc) { printf("FAIL: desc=NULL\n"); return 1; } + id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); + id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); + NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; + NSFileManager *fm = [NSFileManager defaultManager]; + [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil]; + [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + [wdataA writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; + + NSError *e = nil; + BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); + if (!ok) { printf("FAIL: compile: %s\n", [[e description] UTF8String]); return 1; } + ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); + if (!ok) { printf("FAIL: load: %s\n", [[e description] UTF8String]); return 1; } + printf(" Compile+load: %.1fms\n", tb_ms(mach_absolute_time() - t0)); + printf(" tmpDir: %s\n", [td UTF8String]); + + // Build request and IOSurfaces + int inBytes = IC * SP * 2; + int outBytes = OC * SP * 2; + IOSurfaceRef ioIn = make_surface(inBytes); + IOSurfaceRef ioOut = make_surface(outBytes); + id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); + id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut); + id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wI], @[@0], @[wO], @[@0], nil, nil, @0); + + // Write input: [1, 2, 3, 4] repeated across channels + IOSurfaceLock(ioIn, 0, NULL); + _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn); + for (int c = 0; c < IC; c++) + for (int s = 0; s < SP; s++) + inp[c * SP + s] = (_Float16)(s + 1.0f); + IOSurfaceUnlock(ioIn, 0, NULL); + + // Eval with weights A + printf("\n=== Step 2: Eval with weights A ===\n"); + ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); + if (!ok) { printf("FAIL: eval: %s\n", e ? [[e description] UTF8String] : "?"); return 1; } + + IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL); + _Float16 *outA = (_Float16*)IOSurfaceGetBaseAddress(ioOut); + printf(" Output A (identity @ [1,2,3,4]):"); + for (int c = 0; c < OC; c++) { + printf(" ["); + for (int s = 0; s < SP; s++) printf("%.1f%s", (float)outA[c*SP+s], s 0.01f) { changed = true; break; } + } + // Expected output B should be 2x output A if weight reload worked + bool correct = true; + for (int i = 0; i < OC * SP; i++) { + float expected = (float)outA_copy[i] * 2.0f; + if (fabsf((float)outB[i] - expected) > 0.1f) { correct = false; break; } + } + IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL); + + printf("\n=== RESULT ===\n"); + if (changed && correct) { + printf("SUCCESS: Weight reload works! Output changed to match new weights.\n"); + printf(">>> Compilation bottleneck can be eliminated <<<\n"); + } else if (changed && !correct) { + printf("PARTIAL: Output changed but doesn't match expected 2x. Weights may be partially updated.\n"); + printf(" Expected 2x of A, got different values.\n"); + } else { + printf("FAIL: Output did NOT change. Weight reload does not work.\n"); + printf(" Output is still the same as weights A. ANE cached the compiled model.\n"); + printf(">>> Need alternative approach (weightsBuffer IOSurface or async recompile) <<<\n"); + } + + // Cleanup + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); + [fm removeItemAtPath:td error:nil]; + CFRelease(ioIn); CFRelease(ioOut); + } + return 0; +} diff --git a/training/train_large.m b/training/train_large.m index ee7e860..e58ce08 100644 --- a/training/train_large.m +++ b/training/train_large.m @@ -581,6 +581,16 @@ int main(int argc, char *argv[]) { steps_batch++; if (step % 10 == 0 || step == start_step) printf("step %-4d loss=%.4f\n", step, loss); + + // JSON telemetry to stderr + double step_ane = t_ane/steps_batch, step_io = t_io/steps_batch; + double step_cls = t_cls/steps_batch, step_elem = t_elem/steps_batch; + double step_rms = t_rms/steps_batch, step_cbw = t_cblas_wait/steps_batch; + fprintf(stderr, "{\"type\":\"step\",\"step\":%d,\"loss\":%.6f," + "\"t_ane\":%.3f,\"t_io\":%.3f,\"t_cls\":%.3f," + "\"t_elem\":%.3f,\"t_rms\":%.3f,\"t_cblas_wait\":%.3f," + "\"compiles\":%d}\n", + step, loss, step_ane, step_io, step_cls, step_elem, step_rms, step_cbw, g_compile_count); } double tms = tb_ms(mach_absolute_time() - tt); total_train_ms += tms; @@ -622,6 +632,19 @@ int main(int argc, char *argv[]) { printf(" ane=%.1f io=%.1f cls=%.1f elem=%.1f rms=%.1f cblas_wait=%.1f ms/step\n", t_ane/steps_batch, t_io/steps_batch, t_cls/steps_batch, t_elem/steps_batch, t_rms/steps_batch, t_cblas_wait/steps_batch); + + // JSON batch telemetry to stderr + { + double bf = NLAYERS * (4.0*2*DIM*DIM*SEQ + 2.0*2*DIM*HIDDEN*SEQ + 2.0*HIDDEN*DIM*SEQ); + double bs = NLAYERS * 2.0*HEADS*5*SEQ*SEQ*HD; + double ane_f_batch = (bf*2 + bs) * steps_batch; + double ane_tflops = ane_f_batch / (tms * 1e9); + fprintf(stderr, "{\"type\":\"batch\",\"batch\":%d,\"compile_ms\":%.1f," + "\"train_ms\":%.1f,\"ms_per_step\":%.1f}\n", + steps_batch, cms, tms, tms/steps_batch); + fprintf(stderr, "{\"type\":\"perf\",\"ane_tflops\":%.3f,\"ane_util_pct\":%.2f}\n", + ane_tflops, 100.0*ane_tflops/15.8); + } } // Efficiency report