diff --git a/training/m5result.md b/training/m5result.md new file mode 100644 index 0000000..91d5eaa --- /dev/null +++ b/training/m5result.md @@ -0,0 +1,146 @@ +# M5 ANE Probe Results + +**Machine**: Apple M5, macOS 26.3 (Darwin 25.3.0) +**Date**: 2026-03-01 +**ANE Family**: H16 (same as M4) + +--- + +## test_weight_reload — FAIL + +**Question**: Can we skip recompilation by overwriting weight blobs on disk and calling unload+load? + +**Result**: **No.** Weights are baked at compile time. Overwriting `weights/weight.bin` in tmpDir and doing unload→load produces identical output — the ANE ignores the file change. + +``` +Kernel: 64x64 conv, spatial=32 +Compile+load: 33.3ms | Unload: 0.5ms | Reload: 3.8ms +Output A (identity): [0.0100, 0.0200, 0.0300, 0.0400] +Output B (3x identity, after file overwrite + reload): [0.0100, 0.0200, 0.0300, 0.0400] +Max A-B diff: 0.000000 +``` + +**Implication**: Cannot eliminate compilation bottleneck via file swap. Must use async recompile, raise ACCUM_STEPS, or find another path. + +--- + +## test_perf_stats — Partial Success + +**Question**: What hardware counters does `_ANEPerformanceStats` expose? + +**Result**: The class exists with useful properties, but `alloc/init` returns `nil`. Must be created via factory methods that require internal buffers. + +### Available Properties +| Property | Type | Description | +|----------|------|-------------| +| `hwExecutionTime` | uint64 | Hardware execution time in nanoseconds | +| `perfCounterData` | NSData | Raw performance counter data blob | +| `pStatsRawData` | NSData | Raw stats data | + +### Factory Methods +- `+statsWithHardwareExecutionNS:` — create from hw execution time +- `+statsWithRequestPerformanceBuffer:statsBufferSize:` — create from raw buffer +- `+statsWithReconstructed:hardwareExecutionNS:aneStatsRawData:` — reconstruct from components +- `+driverMaskForANEFMask:` — convert ANE feature mask to driver mask + +### Instance Methods +- `-performanceCounters` — returns counter object +- `-stringForPerfCounter:` — human-readable counter name +- `-emitPerfcounterSignpostsWithModelStringID:` — emit signposts for profiling + +**Key Finding**: `_ANEModel` has `perfStatsMask` property. Setting this on the model before eval likely enables perf stats population in the request. The `_ANEPerformanceStats` object passed to request gets populated *by the driver* — we need to set the mask first, then read stats after eval. + +--- + +## test_qos_sweep — All QoS Values Work + +**Question**: Does QoS affect ANE frequency or latency? + +**Result**: All QoS values 0-63 compile, load, and eval successfully. **No measurable latency difference** — ANE appears to run at fixed frequency regardless of QoS. + +``` +Kernel: 256x256 conv, spatial=64 (8.4 MFLOPS) + QoS Compile Load Eval(1) Eval(avg10) Status + 0 13.9ms 15.6ms 0.22ms 0.11ms OK + 1 11.6ms 1.8ms 0.17ms 0.07ms OK + 5 11.4ms 1.7ms 0.17ms 0.07ms OK + 10 12.0ms 1.8ms 0.18ms 0.06ms OK + 21 11.8ms 1.7ms 0.18ms 0.08ms OK + 33 11.5ms 1.7ms 0.17ms 0.06ms OK + 47 10.8ms 1.7ms 0.18ms 0.06ms OK + 63 11.3ms 1.7ms 0.17ms 0.07ms OK +``` + +**Notes**: +- QoS 0 has elevated load time (15.6ms vs ~1.7ms) — possibly first-use initialization +- Compile time ~11ms, load ~1.7ms, eval ~0.07ms avg for 8.4 MFLOPS kernel +- Eval throughput: 8.4M / 0.07ms = **120 GFLOPS** for a single 256×256 conv + +--- + +## test_ane_advanced — Key Findings + +### weightsBuffer IOSurface — Does NOT Override + +Passing a `weightsBuffer` IOSurface with different weights to the request **does not change output**. The compiled weights are still used. + +``` +Baseline (1x identity): Output[0..3] = [0.1000, 0.2000, 0.3000, 0.3999] +weightsBuffer (3x identity): Output[0..3] = [0.1000, 0.2000, 0.3000, 0.3999] +``` + +The `weightsBuffer` parameter likely serves a different purpose (perhaps for models that declare runtime weights vs baked constants). + +### procedureIndex — All 0-15 Succeed + +All procedure indices 0-15 return OK. Single-procedure models work with any index (they probably ignore non-zero indices). Multi-procedure models compiled from `_ANEChainingRequest` would use different indices for different subgraphs. + +### SharedEvents — Classes Exist, Need IOSurfaceSharedEvent + +- `_ANESharedEvents`, `_ANESharedSignalEvent`, `_ANESharedWaitEvent` all exist +- `alloc/init` returns nil — they need `IOSurfaceSharedEvent` objects (Metal shared events) +- `_ANESharedSignalEvent` has `symbolIndex` and `agentMask` — for GPU↔ANE sync +- Signal API: `+signalEventWithValue:symbolIndex:eventType:sharedEvent:` +- Wait API: `+waitEventWithValue:sharedEvent:eventType:` + +### ChainingRequest — Exists with Loopback Support + +`_ANEChainingRequest` supports chained execution: +- `inputBuffer`, `outputSets` — multiple output sets for pipeline +- `loopbackInputSymbolIndex`, `loopbackOutputSymbolIndex` — feed output back as input +- `fwEnqueueDelay` — firmware-level enqueue timing +- `memoryPoolId` — shared memory pool across chained ops +- `signalEvents` — sync with other agents + +### Notable _ANEClient Methods +- `evaluateRealTimeWithModel:options:request:error:` — real-time eval path +- `loadRealTimeModel:options:qos:error:` — RT model loading +- `beginRealTimeTask` / `endRealTimeTask` — RT task bracketing +- `prepareChainingWithModel:options:chainingReq:qos:error:` — set up chaining +- `enqueueSetsWithModel:outputSet:options:qos:error:` — enqueue output sets +- `buffersReadyWithModel:inputBuffers:options:qos:error:` — signal input ready + +### All ANE Classes Found (67 total) +Key unexplored classes: `_ANEDeviceController`, `_ANEQoSMapper`, `_ANEBuffer`, `_ANEIOSurfaceOutputSets`, `_ANEProgramForEvaluation`, `_ANEProgramIOSurfacesMapper`, `_ANEModelInstanceParameters`, `_ANEInputBuffersReady`, `_ANEOutputSetEnqueue` + +--- + +## Strategic Implications + +### Compilation Bottleneck (Primary) +Weight reload and weightsBuffer both fail. **Weights are irrevocably baked at compile time.** The only paths forward: +1. **Raise ACCUM_STEPS significantly** (10→100+) to amortize compile cost +2. **Async background compilation** while training continues with old weights +3. **Chaining API** (`_ANEChainingRequest`) to pipeline multiple layers in one dispatch + +### Performance Monitoring +`hwExecutionTime` from `_ANEPerformanceStats` gives wall-clock ANE time per eval. To enable: +1. Set `perfStatsMask` on the `_ANEInMemoryModel` before eval +2. Pass an `_ANEPerformanceStats` to the request +3. Read `hwExecutionTime` after eval + +### Real-Time Path +`_ANEClient` has a dedicated real-time evaluation path (`evaluateRealTimeWithModel:`) with RT load/unload. This may provide lower/more predictable latency. + +### Chaining (Most Promising for Utilization) +`_ANEChainingRequest` with loopback could allow multiple layers to execute as a single ANE program without CPU round-trips between layers. Combined with `_ANEIOSurfaceOutputSets` and `_ANEInputBuffersReady`, this could dramatically reduce idle time between kernel dispatches. diff --git a/training/test_ane_advanced.m b/training/test_ane_advanced.m index 4a92a04..07e9038 100644 --- a/training/test_ane_advanced.m +++ b/training/test_ane_advanced.m @@ -6,6 +6,7 @@ #import #import #import +#include static mach_timebase_info_data_t g_tb; static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } @@ -65,7 +66,6 @@ int main() { dump_class("_ANEEvent"); dump_class("_ANEFenceEvent"); - // Try instantiate const char *event_classes[] = { "_ANESharedEvents", "_ANESharedSignalEvent", "_ANESharedWaitEvent", "_ANEEvent", "_ANEFenceEvent", NULL @@ -89,22 +89,21 @@ int main() { dump_class("_ANEMultiRequest"); dump_class("_ANEBatchRequest"); - // === Part 3: weightsBuffer parameter test === + // === Part 3: Compile working kernel for weightsBuffer + procedureIndex tests === printf("\n--- Part 3: weightsBuffer IOSurface test ---\n"); Class g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); Class g_I = NSClassFromString(@"_ANEInMemoryModel"); Class g_AR = NSClassFromString(@"_ANERequest"); Class g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); - int IC = 4, OC = 4, SP = 4; - _Float16 weights[16]; - for (int i = 0; i < 16; i++) weights[i] = (i/4 == i%4) ? (_Float16)1.0f : (_Float16)0.0f; - - int ws = 16*2, tot = 128+ws; + int CH = 64, SP = 32; + _Float16 *w = (_Float16*)calloc(CH*CH, sizeof(_Float16)); + for (int i = 0; i < CH; i++) w[i*CH+i] = (_Float16)1.0f; + int ws = CH*CH*2, tot = 128+ws; uint8_t *blob = (uint8_t*)calloc(tot,1); blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1; *(uint32_t*)(blob+72)=ws; *(uint32_t*)(blob+80)=128; - memcpy(blob+128, weights, ws); + memcpy(blob+128, w, ws); NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES]; NSString *mil = [NSString stringWithFormat: @@ -113,18 +112,22 @@ int main() { "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " "{\"coremltools-version\", \"9.0\"}})]\n" "{\n" - " func main(tensor x) {\n" + " func main(tensor x) {\n" " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" + " string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n" + " tensor x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n" " tensor W = const()[name=string(\"W\"), " "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" - " tensor y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)" + " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" "[name=string(\"conv\")];\n" + " string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n" + " tensor y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n" " } -> (y);\n" - "}\n", IC, SP, OC, IC, OC, IC, OC, SP]; + "}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP]; NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), @@ -142,16 +145,16 @@ int main() { ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); - IOSurfaceRef ioIn = make_surface(IC*SP*2); - IOSurfaceRef ioOut = make_surface(OC*SP*2); + int ioBytes = CH * SP * 4; + IOSurfaceRef ioIn = make_surface(ioBytes); + IOSurfaceRef ioOut = make_surface(ioBytes); - // Write input IOSurfaceLock(ioIn, 0, NULL); - _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn); - for (int c = 0; c < IC; c++) for (int s = 0; s < SP; s++) inp[c*SP+s] = (_Float16)(s+1.0f); + float *inp = (float*)IOSurfaceGetBaseAddress(ioIn); + for (int c = 0; c < CH; c++) for (int s = 0; s < SP; s++) inp[c*SP+s] = (float)(s+1) * 0.1f; IOSurfaceUnlock(ioIn, 0, NULL); - // Normal eval first (baseline) + // Baseline eval id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut); id req0 = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, @@ -162,23 +165,22 @@ int main() { printf(" Baseline eval (weightsBuffer=nil, procIdx=0): %s\n", ok ? "OK" : "FAIL"); IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL); - _Float16 *out0 = (_Float16*)IOSurfaceGetBaseAddress(ioOut); - printf(" Output: [%.1f, %.1f, %.1f, %.1f, ...]\n", - (float)out0[0], (float)out0[1], (float)out0[2], (float)out0[3]); + float *out0 = (float*)IOSurfaceGetBaseAddress(ioOut); + float baseline_0 = out0[0], baseline_1 = out0[1]; + printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", out0[0], out0[1], out0[2], out0[3]); IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL); - // Test weightsBuffer: create IOSurface with weight data + // Test weightsBuffer: IOSurface with 3x identity weights printf("\n Testing weightsBuffer IOSurface...\n"); - _Float16 weights2[16]; - for (int i = 0; i < 16; i++) weights2[i] = (i/4 == i%4) ? (_Float16)3.0f : (_Float16)0.0f; - + _Float16 *w3 = (_Float16*)calloc(CH*CH, sizeof(_Float16)); + for (int i = 0; i < CH; i++) w3[i*CH+i] = (_Float16)3.0f; IOSurfaceRef ioW = make_surface(ws); IOSurfaceLock(ioW, 0, NULL); - memcpy(IOSurfaceGetBaseAddress(ioW), weights2, ws); + memcpy(IOSurfaceGetBaseAddress(ioW), w3, ws); IOSurfaceUnlock(ioW, 0, NULL); + free(w3); id wW = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioW); - // Try with weightsBuffer wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut); id req_wb = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, @@ -189,14 +191,16 @@ int main() { if (req_wb) { ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req_wb, &e); - printf(" Eval with weightsBuffer: %s\n", ok ? "OK" : [[e description] UTF8String]); + printf(" Eval with weightsBuffer: %s\n", ok ? "OK" : e ? [[e description] UTF8String] : "FAIL"); if (ok) { IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL); - _Float16 *outW = (_Float16*)IOSurfaceGetBaseAddress(ioOut); - printf(" Output (3x identity via weightsBuffer): [%.1f, %.1f, %.1f, %.1f, ...]\n", - (float)outW[0], (float)outW[1], (float)outW[2], (float)outW[3]); - bool is_3x = fabsf((float)outW[0] - 3.0f) < 0.1f; - printf(" weightsBuffer override %s\n", is_3x ? "WORKS!" : "does NOT work (output unchanged)"); + float *outW = (float*)IOSurfaceGetBaseAddress(ioOut); + printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outW[0], outW[1], outW[2], outW[3]); + bool changed = fabsf(outW[0] - baseline_0) > 0.001f; + bool is_3x = fabsf(outW[0] - baseline_0 * 3.0f) < 0.1f; + printf(" weightsBuffer: output %s", changed ? "CHANGED" : "unchanged"); + if (changed) printf(" (%s)", is_3x ? "matches 3x — WORKS!" : "but not 3x as expected"); + printf("\n"); IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL); } } @@ -228,6 +232,7 @@ int main() { } } free(allClasses); + free(w); // Cleanup ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); diff --git a/training/test_perf_stats.m b/training/test_perf_stats.m index e1d94a6..cf7b073 100644 --- a/training/test_perf_stats.m +++ b/training/test_perf_stats.m @@ -15,7 +15,6 @@ static void dump_class(const char *name) { if (!cls) { printf(" %s: NOT FOUND\n", name); return; } printf("\n=== %s ===\n", name); - // Class methods unsigned int count; Method *methods = class_copyMethodList(object_getClass(cls), &count); if (count) printf(" Class methods:\n"); @@ -26,7 +25,6 @@ static void dump_class(const char *name) { } free(methods); - // Instance methods methods = class_copyMethodList(cls, &count); if (count) printf(" Instance methods:\n"); for (unsigned int i = 0; i < count; i++) { @@ -36,7 +34,6 @@ static void dump_class(const char *name) { } free(methods); - // Properties unsigned int pcount; objc_property_t *props = class_copyPropertyList(cls, &pcount); if (pcount) printf(" Properties:\n"); @@ -46,17 +43,6 @@ static void dump_class(const char *name) { printf(" @property %s [%s]\n", pname, pattr ? pattr : "?"); } free(props); - - // Protocols - unsigned int prcount; - Protocol * __unsafe_unretained *protos = class_copyProtocolList(cls, &prcount); - if (prcount) { - printf(" Protocols:"); - for (unsigned int i = 0; i < prcount; i++) - printf(" %s", protocol_getName(protos[i])); - printf("\n"); - } - free(protos); } static IOSurfaceRef make_surface(size_t bytes) { @@ -74,7 +60,6 @@ int main() { printf("=== ANE Performance Stats Probe ===\n"); - // Dump all ANE-related classes dump_class("_ANEPerformanceStats"); dump_class("_ANEPerfRequest"); dump_class("ANEPerfRequest"); @@ -92,13 +77,10 @@ int main() { printf("\n=== Instantiation Tests ===\n"); Class perfClass = NSClassFromString(@"_ANEPerformanceStats"); if (perfClass) { - // Try alloc/init @try { id perfStats = [[perfClass alloc] init]; printf("_ANEPerformanceStats alloc/init: %s\n", perfStats ? [[perfStats description] UTF8String] : "nil"); - - // Try to read all properties via KVC if (perfStats) { unsigned int pcount; objc_property_t *props = class_copyPropertyList(perfClass, &pcount); @@ -118,23 +100,23 @@ int main() { } } - // === Compile a simple kernel and try passing perfStats to request === + // Compile a working kernel and test perfStats in request printf("\n=== Compile kernel and test perfStats in request ===\n"); Class g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); Class g_I = NSClassFromString(@"_ANEInMemoryModel"); Class g_AR = NSClassFromString(@"_ANERequest"); Class g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); - int IC = 4, OC = 4, SP = 4; - _Float16 weights[16]; - for (int i = 0; i < 16; i++) weights[i] = (i/4 == i%4) ? (_Float16)1.0f : (_Float16)0.0f; - - int ws = 16*2, tot = 128+ws; + int CH = 64, SP = 32; + _Float16 *w = (_Float16*)calloc(CH*CH, sizeof(_Float16)); + for (int i = 0; i < CH; i++) w[i*CH+i] = (_Float16)1.0f; + int ws = CH*CH*2, tot = 128+ws; uint8_t *blob = (uint8_t*)calloc(tot,1); blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1; *(uint32_t*)(blob+72)=ws; *(uint32_t*)(blob+80)=128; - memcpy(blob+128, weights, ws); + memcpy(blob+128, w, ws); NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES]; + free(w); NSString *mil = [NSString stringWithFormat: @"program(1.3)\n" @@ -142,18 +124,22 @@ int main() { "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " "{\"coremltools-version\", \"9.0\"}})]\n" "{\n" - " func main(tensor x) {\n" + " func main(tensor x) {\n" " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" + " string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n" + " tensor x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n" " tensor W = const()[name=string(\"W\"), " "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" - " tensor y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)" + " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" "[name=string(\"conv\")];\n" + " string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n" + " tensor y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n" " } -> (y);\n" - "}\n", IC, SP, OC, IC, OC, IC, OC, SP]; + "}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP]; NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), @@ -170,8 +156,9 @@ int main() { ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); - IOSurfaceRef ioIn = make_surface(IC*SP*2); - IOSurfaceRef ioOut = make_surface(OC*SP*2); + int ioBytes = CH * SP * 4; // fp32 + IOSurfaceRef ioIn = make_surface(ioBytes); + IOSurfaceRef ioOut = make_surface(ioBytes); id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut); @@ -186,20 +173,17 @@ int main() { printf(" Request: %s\n", req ? "created" : "nil"); if (req) { - // Write input IOSurfaceLock(ioIn, 0, NULL); - _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn); - for (int i = 0; i < IC*SP; i++) inp[i] = (_Float16)1.0f; + float *inp = (float*)IOSurfaceGetBaseAddress(ioIn); + for (int i = 0; i < CH*SP; i++) inp[i] = 1.0f; IOSurfaceUnlock(ioIn, 0, NULL); - // Eval BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); printf(" Eval: %s\n", ok ? "OK" : [[e description] UTF8String]); - // Read perfStats after eval if (ok && perfStats) { - printf(" PerfStats after eval:\n"); + printf("\n PerfStats after 1 eval:\n"); unsigned int pcount; objc_property_t *props = class_copyPropertyList(perfClass, &pcount); for (unsigned int i = 0; i < pcount; i++) { @@ -213,13 +197,16 @@ int main() { } free(props); - // Run 100 evals and check if counters accumulate printf("\n Running 100 evals...\n"); + uint64_t t0 = mach_absolute_time(); for (int i = 0; i < 100; i++) { ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); } - printf(" PerfStats after 100 evals:\n"); + printf(" 100 evals in %.1fms (%.2fms/eval)\n", + tb_ms(mach_absolute_time()-t0), tb_ms(mach_absolute_time()-t0)/100.0); + + printf("\n PerfStats after 101 evals:\n"); props = class_copyPropertyList(perfClass, &pcount); for (unsigned int i = 0; i < pcount; i++) { const char *pname = property_getName(props[i]); @@ -233,12 +220,10 @@ int main() { free(props); } } + } else { + printf(" _ANEPerformanceStats class NOT FOUND\n"); } - // Also probe IORegistry for ANE perf data - printf("\n=== IORegistry ANE info ===\n"); - printf(" (run: ioreg -r -c H11ANEIn | head -100)\n"); - // Cleanup ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; diff --git a/training/test_qos_sweep.m b/training/test_qos_sweep.m index c009de5..2802c6b 100644 --- a/training/test_qos_sweep.m +++ b/training/test_qos_sweep.m @@ -1,5 +1,5 @@ // test_qos_sweep.m — Does QoS affect frequency/latency? -// Sweep QoS 0-63 on compile, load, eval of a simple kernel. +// Sweep QoS 0-63 on compile, load, eval of a working kernel. #import #import #import @@ -28,15 +28,14 @@ int main() { Class g_AR = NSClassFromString(@"_ANERequest"); Class g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); - // Larger kernel for measurable latency: 256x256 conv, spatial=64 - int IC = 256, OC = 256, SP = 64; - int ws = IC*OC*2, tot = 128+ws; + // 256x256 conv, spatial=64 for measurable latency + int CH = 256, SP = 64; + int ws = CH*CH*2, tot = 128+ws; uint8_t *blob = (uint8_t*)calloc(tot, 1); blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1; *(uint32_t*)(blob+72)=ws; *(uint32_t*)(blob+80)=128; - // Random weights _Float16 *wp = (_Float16*)(blob+128); - for (int i = 0; i < IC*OC; i++) wp[i] = (_Float16)(0.01f * (i % 100 - 50)); + for (int i = 0; i < CH*CH; i++) wp[i] = (_Float16)(0.01f * (i % 100 - 50)); NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES]; NSString *mil = [NSString stringWithFormat: @@ -45,25 +44,29 @@ int main() { "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " "{\"coremltools-version\", \"9.0\"}})]\n" "{\n" - " func main(tensor x) {\n" + " func main(tensor x) {\n" " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" + " string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n" + " tensor x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n" " tensor W = const()[name=string(\"W\"), " "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" - " tensor y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)" + " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" "[name=string(\"conv\")];\n" + " string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n" + " tensor y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n" " } -> (y);\n" - "}\n", IC, SP, OC, IC, OC, IC, OC, SP]; + "}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP]; NSDictionary *weights = @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata}}; NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding]; NSFileManager *fm = [NSFileManager defaultManager]; - printf("=== QoS Sweep: compile/load/eval with QoS 0-63 ===\n"); - printf("Kernel: %dx%d conv, spatial=%d (%.1f MFLOPS)\n", IC, OC, SP, 2.0*IC*OC*SP/1e6); + printf("=== QoS Sweep: compile/load/eval with varying QoS ===\n"); + printf("Kernel: %dx%d conv, spatial=%d (%.1f MFLOPS)\n", CH, CH, SP, 2.0*CH*CH*SP/1e6); printf("%4s %10s %10s %10s %10s %s\n", "QoS", "Compile", "Load", "Eval(1)", "Eval(avg10)", "Status"); unsigned int qos_values[] = {0, 1, 5, 10, 15, 17, 19, 21, 25, 31, 33, 40, 47, 50, 55, 60, 63}; @@ -73,18 +76,22 @@ int main() { unsigned int qos = qos_values[qi]; NSError *e = nil; + // Make unique weights per iteration so hex differs + _Float16 *wq = (_Float16*)(blob+128); + wq[0] = (_Float16)(0.001f * qi); + NSData *wdata_q = [NSData dataWithBytes:blob length:tot]; + NSDictionary *weights_q = @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata_q}}; + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), - milData, weights, nil); + milData, weights_q, nil); id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); - NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent: - [NSString stringWithFormat:@"qos_test_%u_%@", qos, hx]]; + NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil]; [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; - [wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; + [wdata_q writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; - // Compile uint64_t t0 = mach_absolute_time(); BOOL cok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( mdl, @selector(compileWithQoS:options:error:), qos, @{}, &e); @@ -96,7 +103,6 @@ int main() { continue; } - // Load t0 = mach_absolute_time(); BOOL lok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( mdl, @selector(loadWithQoS:options:error:), qos, @{}, &e); @@ -104,26 +110,25 @@ int main() { if (!lok) { printf("%4u %8.1fms %10s %10s %10s LOAD_FAIL\n", qos, cms, "-", "-", "-"); + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); [fm removeItemAtPath:td error:nil]; continue; } - // Build request - IOSurfaceRef ioIn = make_surface(IC*SP*2); - IOSurfaceRef ioOut = make_surface(OC*SP*2); + int ioBytes = CH * SP * 4; + IOSurfaceRef ioIn = make_surface(ioBytes); + IOSurfaceRef ioOut = make_surface(ioBytes); id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut); id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), @[wI], @[@0], @[wO], @[@0], nil, nil, @0); - // Write input IOSurfaceLock(ioIn, 0, NULL); - _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn); - for (int i = 0; i < IC*SP; i++) inp[i] = (_Float16)0.5f; + float *inp = (float*)IOSurfaceGetBaseAddress(ioIn); + for (int i = 0; i < CH*SP; i++) inp[i] = 0.5f; IOSurfaceUnlock(ioIn, 0, NULL); - // Eval with same QoS t0 = mach_absolute_time(); BOOL eok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( mdl, @selector(evaluateWithQoS:options:request:error:), qos, @{}, req, &e); @@ -132,7 +137,6 @@ int main() { if (!eok) { printf("%4u %8.1fms %8.1fms %10s %10s EVAL_FAIL\n", qos, cms, lms, "-", "-"); } else { - // Average over 10 evals t0 = mach_absolute_time(); for (int i = 0; i < 10; i++) { ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( @@ -142,7 +146,6 @@ int main() { printf("%4u %8.1fms %8.1fms %8.2fms %8.2fms OK\n", qos, cms, lms, ems1, ems_avg); } - // Cleanup ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); CFRelease(ioIn); CFRelease(ioOut); [fm removeItemAtPath:td error:nil]; diff --git a/training/test_weight_reload.m b/training/test_weight_reload.m index 984d77a..a248005 100644 --- a/training/test_weight_reload.m +++ b/training/test_weight_reload.m @@ -21,6 +21,45 @@ static IOSurfaceRef make_surface(size_t bytes) { (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); } +// Build weight blob matching inmem_peak format (single chunk) +static NSData *build_weight_blob(_Float16 *w, int rows, int cols) { + int ws = rows * cols * 2; + int tot = 128 + ws; + uint8_t *b = (uint8_t*)calloc(tot, 1); + b[0] = 1; b[4] = 2; + b[64] = 0xEF; b[65] = 0xBE; b[66] = 0xAD; b[67] = 0xDE; b[68] = 1; + *(uint32_t*)(b+72) = ws; + *(uint32_t*)(b+80) = 128; + memcpy(b + 128, w, ws); + return [NSData dataWithBytesNoCopy:b length:tot freeWhenDone:YES]; +} + +// Generate MIL for a simple conv: fp32 in → cast fp16 → conv → cast fp32 out +static NSString *gen_mil(int ch, int sp) { + return [NSString stringWithFormat: + @"program(1.3)\n" + "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " + "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"9.0\"}})]\n" + "{\n" + " func main(tensor x) {\n" + " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" + " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" + " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" + " string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n" + " tensor x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n" + " tensor W = const()[name=string(\"W\"), " + "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" + " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" + "[name=string(\"conv\")];\n" + " string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n" + " tensor y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n" + " } -> (y);\n" + "}\n", ch, sp, ch, sp, ch, ch, ch, ch, ch, sp, ch, sp]; +} + int main() { @autoreleasepool { setbuf(stdout, NULL); @@ -37,55 +76,27 @@ int main() { return 1; } - // Small test: 4x4 conv kernel, spatial=4 - int IC = 4, OC = 4, SP = 4; + // Use 64-channel conv, spatial=32 (known to work on ANE) + int CH = 64, SP = 32; - // Weight set A: identity matrix - _Float16 weightsA[16]; - for (int i = 0; i < IC*OC; i++) weightsA[i] = (i / OC == i % OC) ? (_Float16)1.0f : (_Float16)0.0f; + // Weight set A: scaled identity (1.0 on diagonal) + _Float16 *weightsA = (_Float16*)calloc(CH*CH, sizeof(_Float16)); + for (int i = 0; i < CH; i++) weightsA[i*CH+i] = (_Float16)1.0f; - // Weight set B: 2x identity - _Float16 weightsB[16]; - for (int i = 0; i < IC*OC; i++) weightsB[i] = (i / OC == i % OC) ? (_Float16)2.0f : (_Float16)0.0f; - - // Build weight blob for A - int ws = IC * OC * 2; - int tot = 128 + ws; - uint8_t *blob = (uint8_t*)calloc(tot, 1); - blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1; - *(uint32_t*)(blob+72) = ws; - *(uint32_t*)(blob+80) = 128; - memcpy(blob + 128, weightsA, ws); - NSData *wdataA = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES]; - - // MIL for a simple conv - NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n" - "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n" - "{\n" - " func main(tensor x) {\n" - " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" - " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" - " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" - " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" - " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" - " tensor W = const()[name=string(\"W\"), " - "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" - " tensor y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)" - "[name=string(\"conv\")];\n" - " } -> (y);\n" - "}\n", IC, SP, OC, IC, OC, IC, OC, SP]; + // Weight set B: 3x identity + _Float16 *weightsB = (_Float16*)calloc(CH*CH, sizeof(_Float16)); + for (int i = 0; i < CH; i++) weightsB[i*CH+i] = (_Float16)3.0f; + NSData *wdataA = build_weight_blob(weightsA, CH, CH); + NSString *mil = gen_mil(CH, SP); NSDictionary *weights = @{ @"@model_path/weights/weight.bin": @{@"offset": @0, @"data": wdataA} }; - NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding]; // === Compile with weights A === printf("=== Step 1: Compile with weights A (identity) ===\n"); + printf(" Kernel: %dx%d conv, spatial=%d\n", CH, CH, SP); uint64_t t0 = mach_absolute_time(); id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), milData, weights, nil); if (!desc) { printf("FAIL: desc=NULL\n"); return 1; } @@ -102,12 +113,13 @@ int main() { if (!ok) { printf("FAIL: compile: %s\n", [[e description] UTF8String]); return 1; } ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); if (!ok) { printf("FAIL: load: %s\n", [[e description] UTF8String]); return 1; } - printf(" Compile+load: %.1fms\n", tb_ms(mach_absolute_time() - t0)); + double compile_ms = tb_ms(mach_absolute_time() - t0); + printf(" Compile+load: %.1fms\n", compile_ms); printf(" tmpDir: %s\n", [td UTF8String]); - // Build request and IOSurfaces - int inBytes = IC * SP * 2; - int outBytes = OC * SP * 2; + // Build request and IOSurfaces (fp32 I/O) + int inBytes = CH * SP * 4; // fp32 + int outBytes = CH * SP * 4; IOSurfaceRef ioIn = make_surface(inBytes); IOSurfaceRef ioOut = make_surface(outBytes); id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); @@ -116,12 +128,12 @@ int main() { @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), @[wI], @[@0], @[wO], @[@0], nil, nil, @0); - // Write input: [1, 2, 3, 4] repeated across channels + // Write input: channel c, spatial s = (c*SP + s + 1) * 0.01 IOSurfaceLock(ioIn, 0, NULL); - _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn); - for (int c = 0; c < IC; c++) + float *inp = (float*)IOSurfaceGetBaseAddress(ioIn); + for (int c = 0; c < CH; c++) for (int s = 0; s < SP; s++) - inp[c * SP + s] = (_Float16)(s + 1.0f); + inp[c*SP+s] = (float)(c*SP + s + 1) * 0.01f; IOSurfaceUnlock(ioIn, 0, NULL); // Eval with weights A @@ -130,44 +142,36 @@ int main() { if (!ok) { printf("FAIL: eval: %s\n", e ? [[e description] UTF8String] : "?"); return 1; } IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL); - _Float16 *outA = (_Float16*)IOSurfaceGetBaseAddress(ioOut); - printf(" Output A (identity @ [1,2,3,4]):"); - for (int c = 0; c < OC; c++) { - printf(" ["); - for (int s = 0; s < SP; s++) printf("%.1f%s", (float)outA[c*SP+s], s 0.01f) { changed = true; break; } + float max_diff = 0; + for (int i = 0; i < CH*SP; i++) { + float d = fabsf(outB[i] - outA_copy[i]); + if (d > max_diff) max_diff = d; + if (d > 0.001f) changed = true; } - // Expected output B should be 2x output A if weight reload worked - bool correct = true; - for (int i = 0; i < OC * SP; i++) { - float expected = (float)outA_copy[i] * 2.0f; - if (fabsf((float)outB[i] - expected) > 0.1f) { correct = false; break; } + // Expected: output B should be 3x output A + bool correct_3x = true; + float max_3x_err = 0; + for (int i = 0; i < CH*SP; i++) { + float expected = outA_copy[i] * 3.0f; + float err = fabsf(outB[i] - expected); + if (err > max_3x_err) max_3x_err = err; + if (err > 0.1f) correct_3x = false; } IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL); printf("\n=== RESULT ===\n"); - if (changed && correct) { - printf("SUCCESS: Weight reload works! Output changed to match new weights.\n"); + printf(" Max A-B diff: %.6f\n", max_diff); + printf(" Max 3x error: %.6f\n", max_3x_err); + printf(" Compile+load: %.1fms | Unload: %.1fms | Reload: %.1fms\n", compile_ms, unload_ms, reload_ms); + + if (changed && correct_3x) { + printf("\nSUCCESS: Weight reload works! Output matches 3x identity.\n"); + printf(" Speedup: compile=%.1fms vs reload=%.1fms (%.1fx faster)\n", + compile_ms, unload_ms + reload_ms, compile_ms / (unload_ms + reload_ms)); printf(">>> Compilation bottleneck can be eliminated <<<\n"); - } else if (changed && !correct) { - printf("PARTIAL: Output changed but doesn't match expected 2x. Weights may be partially updated.\n"); - printf(" Expected 2x of A, got different values.\n"); + } else if (changed && !correct_3x) { + printf("\nPARTIAL: Output changed but doesn't match expected 3x.\n"); } else { - printf("FAIL: Output did NOT change. Weight reload does not work.\n"); - printf(" Output is still the same as weights A. ANE cached the compiled model.\n"); - printf(">>> Need alternative approach (weightsBuffer IOSurface or async recompile) <<<\n"); + printf("\nFAIL: Output did NOT change. Weight reload does not work.\n"); + printf(" ANE cached the compiled model — weights baked at compile time.\n"); + printf(">>> Need alternative: weightsBuffer IOSurface or async recompile <<<\n"); } // Cleanup ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); [fm removeItemAtPath:td error:nil]; CFRelease(ioIn); CFRelease(ioOut); + free(outA_copy); free(weightsA); free(weightsB); } return 0; }