commit f213c8db68e0a328ac300544cc12b9c41bac9f05 Author: maderix Date: Sat Feb 28 00:22:06 2026 -0800 Initial release diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..2e0bef4 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 maderix + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..d2c7bb2 --- /dev/null +++ b/README.md @@ -0,0 +1,111 @@ +# ANE Training — Backpropagation on Apple Neural Engine + +Training neural networks directly on Apple's Neural Engine (ANE) via reverse-engineered private APIs. No CoreML training APIs, no Metal, no GPU — pure ANE compute. + +## What This Is + +A from-scratch implementation of transformer training (forward + backward pass) running on the ANE in Apple Silicon. The ANE is a 15.8 TFLOPS (M4) inference accelerator that Apple does not expose for training. This project reverse-engineers the `_ANEClient` / `_ANECompiler` private APIs and the MIL (Model Intermediate Language) format to run custom compute graphs — including backpropagation — directly on ANE hardware. + +**Current results (M4, single transformer layer, dim=768, seq=512):** +- 9.3 ms/step, 11.2% ANE utilization (1.78 TFLOPS sustained) +- 6 ANE kernel dispatches per training step +- All forward and backward dx passes on ANE, dW gradients on CPU (Accelerate cblas) +- Adam optimizer, gradient accumulation, checkpoint/resume + +## Architecture + +The training loop uses 6 ANE kernels per step: + +| Kernel | Function | Weights | +|--------|----------|---------| +| `kFwdAttn` | RMSNorm + QKV projection + SDPA + output projection | Wq, Wk, Wv, Wo, rms1, mask | +| `kFwdFFN` | RMSNorm + SwiGLU FFN (W1, W3, SiLU, W2) | W1, W2, W3, rms2 | +| `kFFNBwd` | FFN backward (W2^T + SiLU_bwd + W1^T + W3^T) | W2^T, W1^T, W3^T | +| `kSdpaBwd1` | Wo^T + SDPA backward part 1 (dV, probs, dp) | Wo^T, mask | +| `kSdpaBwd2` | SDPA backward part 2 (softmax grad, dQ, dK) | — | +| `kQKVb` | QKV backward (Wq^T + Wk^T + Wv^T → dx) | Wq^T, Wk^T, Wv^T | + +CPU handles: RMSNorm backward, residual connections, loss computation, dW gradient accumulation (cblas_sgemm), Adam optimizer updates. + +Key optimizations: +- **Channel-first CPU layout** — matches ANE IOSurface `[1,C,1,S]` format, eliminates all transpose overhead +- **vDSP vectorized RMSNorm** — 10x faster than naive (6.7ms → 0.7ms) +- **GCD async cblas overlap** — dW gradient sgemms run in parallel with ANE evals on a serial dispatch queue +- **Deferred cblas wait** — wait pushed into next step's forward pass for maximum overlap +- **ANE RMSNorm fusion** — RMSNorm folded into forward kernels as MIL ops (reduce_sum + pow + mul) +- **Wo^T fusion** — output projection backward merged into SDPA backward kernel +- **Forward taps** — Q, K, V, attention scores, hidden states exposed via concat outputs, avoiding CPU recompute +- **exec() restart** — bypasses ~119 ANE compile limit per process + +## File Structure + +``` +├── api_exploration.m # Initial ANE API discovery +├── inmem_basic.m # In-memory MIL compilation proof-of-concept +├── inmem_bench.m # ANE dispatch latency benchmarks +├── inmem_peak.m # Peak TFLOPS measurement (2048x2048 matmul) +├── sram_bench.m # ANE SRAM bandwidth probing +├── sram_probe.m # SRAM size/layout exploration +└── training/ + ├── ane_runtime.h # ANE private API wrapper (compile, eval, IOSurface) + ├── ane_mil_gen.h # MIL program generation helpers + ├── model.h # Model weight initialization and blob builders + ├── forward.h # Forward pass MIL generators + ├── backward.h # Backward pass MIL generators + ├── train.m # Minimal training loop (early prototype) + ├── tiny_train.m # 2-layer tiny model training + ├── train_large.m # Main: single-layer dim=768 training (optimized) + ├── test_*.m # Unit tests for individual kernels + └── Makefile +``` + +## Building + +Requires macOS 15+ on Apple Silicon (tested on M4). + +```bash +# Build the main training program +xcrun clang -O2 -framework Foundation -framework IOSurface \ + -framework CoreML -framework Accelerate -ldl -lobjc \ + -o train_large training/train_large.m + +# Run +./train_large +``` + +No external dependencies. Uses only system frameworks + private ANE APIs resolved at runtime via `objc_msgSend`. + +## How It Works + +1. **MIL generation** — Objective-C code constructs MIL program text at runtime, specifying convolutions (for linear layers), matmul (for attention), softmax, element-wise ops +2. **In-memory compilation** — `_ANEInMemoryModelDescriptor` compiles MIL text + weight blobs directly to ANE programs, no disk mlmodelc needed +3. **IOSurface I/O** — Input/output tensors passed via IOSurface shared memory in `[1, channels, 1, spatial]` format (fp16) +4. **Weight embedding** — Weights baked into ANE programs as BLOBFILE constants; recompiled each batch when weights change +5. **Gradient flow** — Forward taps expose intermediates needed for backward; backward kernels compute dx (input gradients) on ANE; dW (weight gradients) computed on CPU via cblas + +## Limitations + +- **SDPA causal masking** — ANE hardware ignores `attn_mask` in SDPA ops; causal attention is decomposed into separate Q@K^T (ANE) → mask+softmax (ANE via add+softmax) → scores@V (ANE) +- **~119 compile limit** — ANE compiler leaks resources; worked around via `exec()` restart with checkpoint +- **Single layer** — Currently trains one transformer layer; multi-layer would need pipeline scheduling +- **Synthetic data** — Currently uses random data for benchmarking; real tokenized data support is WIP + +## Performance History + +| Optimization | ms/step | ANE util | +|---|---|---| +| Baseline (vDSP transpose) | 33.5 | 3.1% | +| Channel-first layout | 20.3 | 5.2% | +| vDSP vectorized RMSNorm | 14.2 | 7.4% | +| GCD async cblas overlap | 11.4 | 9.2% | +| ANE RMSNorm fusion | 11.4 | 9.2% | +| Wo^T fusion (7→6 kernels) | 11.4 | 9.2% | +| Deferred cblas wait | **9.3** | **11.2%** | + +## Disclaimer + +This project is independent research into Apple Neural Engine architecture. It uses undocumented APIs discovered through runtime introspection for research and educational purposes under fair use and interoperability provisions (see *Sega v. Accolade*, 1992; DMCA §1201(f)). No Apple proprietary code or binaries are included in this repository. This project is not affiliated with or endorsed by Apple Inc. Use at your own risk. + +## License + +MIT — see [LICENSE](LICENSE) diff --git a/api_exploration.m b/api_exploration.m new file mode 100644 index 0000000..a778def --- /dev/null +++ b/api_exploration.m @@ -0,0 +1,167 @@ +#import +#import +#import +#import +#import +#import +#import + +static mach_timebase_info_data_t g_tb; +static double ticksToMs(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } + +int main() { + @autoreleasepool { + mach_timebase_info(&g_tb); + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + + // === Approach 1: MLModelAsset from compiled .mlmodelc data === + // First compile a known-working model to .mlmodelc + printf("=== Approach 1: MLModelAsset in-memory ===\n"); + NSError *e = nil; + NSURL *src = [NSURL fileURLWithPath:@"/tmp/ane_sram_1024ch_64sp.mlpackage"]; + NSURL *compiled = [MLModel compileModelAtURL:src error:&e]; + if (e) { printf("Compile failed: %s\n", [[e description] UTF8String]); return 1; } + printf("Compiled to: %s\n", [[compiled path] UTF8String]); + + // Read the model.mlmodel spec from the compiled bundle + // The spec is typically in coremldata.bin or model.mlmodel + NSFileManager *fm = [NSFileManager defaultManager]; + NSArray *files = [fm contentsOfDirectoryAtPath:[compiled path] error:nil]; + printf("Files in .mlmodelc:\n"); + for (NSString *f in files) printf(" %s\n", [f UTF8String]); + + // Try loading with MLModelAsset + // MLModelAsset has modelAssetWithURL: on macOS 15 + if (@available(macOS 14.0, *)) { + // Read the spec data + NSString *specPath = [[compiled path] stringByAppendingPathComponent:@"coremldata.bin"]; + if (![fm fileExistsAtPath:specPath]) { + specPath = [[compiled path] stringByAppendingPathComponent:@"model.mlmodel"]; + } + NSData *specData = [NSData dataWithContentsOfFile:specPath]; + printf("Spec data: %lu bytes from %s\n", (unsigned long)[specData length], + [[specPath lastPathComponent] UTF8String]); + + // Try MLModelAsset + Class assetClass = NSClassFromString(@"MLModelAsset"); + if (assetClass) { + printf("MLModelAsset class found\n"); + // List methods + unsigned int count; + Method *methods = class_copyMethodList(object_getClass(assetClass), &count); + for (unsigned int i = 0; i < count; i++) + printf(" + %s\n", sel_getName(method_getName(methods[i]))); + free(methods); + } + } + + // === Approach 2: Read a .mlmodelc, extract MIL, feed to _ANEInMemoryModelDescriptor === + printf("\n=== Approach 2: Inspect MIL in compiled model ===\n"); + // Look for model.mil or any MIL file + NSDirectoryEnumerator *en = [fm enumeratorAtPath:[compiled path]]; + NSString *f; + while ((f = [en nextObject])) { + NSString *full = [[compiled path] stringByAppendingPathComponent:f]; + BOOL isDir; + [fm fileExistsAtPath:full isDirectory:&isDir]; + if (!isDir) { + NSDictionary *attrs = [fm attributesOfItemAtPath:full error:nil]; + printf(" %s (%llu bytes)\n", [f UTF8String], + [[attrs objectForKey:NSFileSize] unsignedLongLongValue]); + } + } + + // Try to find and read model.mil + NSString *milPath = [[compiled path] stringByAppendingPathComponent:@"model.mil"]; + if ([fm fileExistsAtPath:milPath]) { + NSString *milText = [NSString stringWithContentsOfFile:milPath encoding:NSUTF8StringEncoding error:nil]; + printf("\n=== model.mil contents (first 2000 chars) ===\n"); + printf("%s\n", [[milText substringToIndex:MIN(2000, [milText length])] UTF8String]); + } + + // Also check for mlmodelc structure + NSString *aneDir = nil; + en = [fm enumeratorAtPath:[compiled path]]; + while ((f = [en nextObject])) { + if ([f hasSuffix:@".espresso.net"] || [f hasSuffix:@".hwx"] || [f hasSuffix:@".mil"]) { + printf(" FOUND: %s\n", [f UTF8String]); + } + } + + // === Approach 3: Try _ANEInMemoryModelDescriptor with actual MIL from compiled model === + printf("\n=== Approach 3: _ANEInMemoryModelDescriptor ===\n"); + Class ANEInMemDesc = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + if (ANEInMemDesc) { + printf("Class exists. Methods:\n"); + unsigned int count; + Method *methods = class_copyMethodList(object_getClass(ANEInMemDesc), &count); + for (unsigned int i = 0; i < count; i++) { + SEL s = method_getName(methods[i]); + printf(" + %s (args: %d)\n", sel_getName(s), method_getNumberOfArguments(methods[i])); + } + free(methods); + methods = class_copyMethodList(ANEInMemDesc, &count); + printf("Instance methods:\n"); + for (unsigned int i = 0; i < count; i++) { + SEL s = method_getName(methods[i]); + const char *enc = method_getTypeEncoding(methods[i]); + printf(" - %s [%s]\n", sel_getName(s), enc ? enc : "?"); + } + free(methods); + + // If model.mil exists, try feeding it + if ([fm fileExistsAtPath:milPath]) { + NSString *milText = [NSString stringWithContentsOfFile:milPath encoding:NSUTF8StringEncoding error:nil]; + printf("\nTrying modelWithMILText: with actual model.mil...\n"); + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)( + ANEInMemDesc, @selector(modelWithMILText:weights:optionsPlist:), + milText, nil, nil); + printf("Result: %s\n", desc ? [[desc description] UTF8String] : "nil"); + + // Try with NSData + NSData *milData = [milText dataUsingEncoding:NSUTF8StringEncoding]; + desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)( + ANEInMemDesc, @selector(modelWithMILText:weights:optionsPlist:), + milData, nil, nil); + printf("Result (NSData): %s\n", desc ? [[desc description] UTF8String] : "nil"); + } + } else { + printf("_ANEInMemoryModelDescriptor NOT FOUND\n"); + } + + // === Approach 4: Hook into what CoreML actually sends to ANE === + printf("\n=== Approach 4: Trace CoreML -> ANE path ===\n"); + // Load the model the normal working way and inspect the _ANEModel + MLModelConfiguration *config = [[MLModelConfiguration alloc] init]; + config.computeUnits = MLComputeUnitsAll; + MLModel *model = [MLModel modelWithContentsOfURL:compiled configuration:config error:&e]; + if (e) { printf("MLModel load failed: %s\n", [[e description] UTF8String]); return 1; } + + // Try to get internal model object + printf("MLModel: %s\n", [[model description] UTF8String]); + + // Check if we can access the ANE model through the MLModel + // Try KVC for internal properties + @try { + id engine = [model valueForKey:@"engine"]; + printf("engine: %s\n", engine ? [[engine description] UTF8String] : "nil"); + } @catch(NSException *ex) { + printf("No 'engine' key\n"); + } + @try { + id proxy = [model valueForKey:@"proxy"]; + printf("proxy: %s\n", proxy ? [NSStringFromClass([proxy class]) UTF8String] : "nil"); + } @catch(NSException *ex) { + printf("No 'proxy' key\n"); + } + + // Check MLNeuralNetworkEngine or MLANEEngine + Class aneEngine = NSClassFromString(@"MLANEEngine"); + Class nnEngine = NSClassFromString(@"MLNeuralNetworkEngine"); + Class milEngine = NSClassFromString(@"MLMILComputeEngine"); + printf("MLANEEngine: %s\n", aneEngine ? "exists" : "not found"); + printf("MLNeuralNetworkEngine: %s\n", nnEngine ? "exists" : "not found"); + printf("MLMILComputeEngine: %s\n", milEngine ? "exists" : "not found"); + } + return 0; +} diff --git a/inmem_basic.m b/inmem_basic.m new file mode 100644 index 0000000..2a54505 --- /dev/null +++ b/inmem_basic.m @@ -0,0 +1,129 @@ +#import +#import +#import +#import +#import +#import +#import + +static mach_timebase_info_data_t g_tb; +static double ticksToMs(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } + +int main() { + @autoreleasepool { + mach_timebase_info(&g_tb); + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + + NSError *e = nil; + int ch = 256, sp = 64; + + // Get MIL and weights from a compiled model + NSURL *compiled = [MLModel compileModelAtURL: + [NSURL fileURLWithPath:@"/tmp/ane_sram_256ch_64sp.mlpackage"] error:&e]; + if (e) { printf("Compile failed\n"); return 1; } + + NSData *milData = [[NSString stringWithContentsOfFile: + [[compiled path] stringByAppendingPathComponent:@"model.mil"] + encoding:NSUTF8StringEncoding error:nil] dataUsingEncoding:NSUTF8StringEncoding]; + NSData *weightBlob = [NSData dataWithContentsOfFile: + [[compiled path] stringByAppendingPathComponent:@"weights/weight.bin"]]; + + Class Desc = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + Class IMM = NSClassFromString(@"_ANEInMemoryModel"); + Class AR = NSClassFromString(@"_ANERequest"); + Class AIO = NSClassFromString(@"_ANEIOSurfaceObject"); + + NSDictionary *wdict = @{ + @"@model_path/weights/weight.bin": @{@"offset": @64, @"data": weightBlob} + }; + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)( + Desc, @selector(modelWithMILText:weights:optionsPlist:), + milData, wdict, nil); + id model = ((id(*)(Class,SEL,id))objc_msgSend)(IMM, @selector(inMemoryModelWithDescriptor:), desc); + + // Get the hex identifier to pre-populate the temp dir + id hexId = ((id(*)(id,SEL))objc_msgSend)(model, @selector(hexStringIdentifier)); + NSString *tmpDir = [NSTemporaryDirectory() stringByAppendingPathComponent:hexId]; + NSFileManager *fm = [NSFileManager defaultManager]; + + // Pre-create dir with MIL and weights + [fm createDirectoryAtPath:[tmpDir stringByAppendingPathComponent:@"weights"] + withIntermediateDirectories:YES attributes:nil error:nil]; + [milData writeToFile:[tmpDir stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + [weightBlob writeToFile:[tmpDir stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; + printf("Pre-created: %s\n", [tmpDir UTF8String]); + + // Compile + printf("Compiling...\n"); + BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( + model, @selector(compileWithQoS:options:error:), 21, @{}, &e); + printf("compile: %s\n", ok ? "YES" : "NO"); + if (e) { printf(" err: %s\n", [[e description] UTF8String]); e=nil; } + if (!ok) return 1; + + // Load + printf("Loading...\n"); + ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( + model, @selector(loadWithQoS:options:error:), 21, @{}, &e); + printf("load: %s\n", ok ? "YES" : "NO"); + if (e) { printf(" err: %s\n", [[e description] UTF8String]); e=nil; } + if (!ok) return 1; + + printf("state: %lu\n", ((NSUInteger(*)(id,SEL))objc_msgSend)(model, @selector(state))); + + // Create IO surfaces + NSUInteger bytes = ch * sp * 4; + IOSurfaceRef ioIn = IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0}); + IOSurfaceRef ioOut = IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0}); + id wIn = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO, @selector(objectWithIOSurface:), ioIn); + id wOut = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO, @selector(objectWithIOSurface:), ioOut); + id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wIn], @[@0], @[wOut], @[@0], nil, nil, @0); + + // Evaluate + printf("Evaluating...\n"); + ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + model, @selector(evaluateWithQoS:options:request:error:), + 21, @{}, req, &e); + printf("evaluate: %s\n", ok ? "YES" : "NO"); + if (e) { printf(" err: %s\n", [[e description] UTF8String]); e=nil; } + + if (ok) { + // Warmup + for (int i = 0; i < 10; i++) + ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + model, @selector(evaluateWithQoS:options:request:error:), + 21, @{}, req, &e); + + // Benchmark + int iters = 100; + uint64_t t0 = mach_absolute_time(); + for (int i = 0; i < iters; i++) + ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + model, @selector(evaluateWithQoS:options:request:error:), + 21, @{}, req, &e); + double ms = ticksToMs(mach_absolute_time() - t0) / iters; + double gf = 2.0*ch*ch*sp/1e9; + double tflops = gf / ms; + + printf("\n========================================\n"); + printf("IN-MEMORY ANE EXECUTION SUCCESSFUL!\n"); + printf(" %.3f ms/eval, %.2f TFLOPS\n", ms, tflops); + printf("========================================\n"); + } + + // Cleanup + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)( + model, @selector(unloadWithQoS:error:), 21, &e); + CFRelease(ioIn); CFRelease(ioOut); + [fm removeItemAtPath:tmpDir error:nil]; + } + return 0; +} diff --git a/inmem_bench.m b/inmem_bench.m new file mode 100644 index 0000000..8a5af33 --- /dev/null +++ b/inmem_bench.m @@ -0,0 +1,111 @@ +#import +#import +#import +#import +#import +#import +#import + +static mach_timebase_info_data_t g_tb; +static double ticksToMs(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } + +double benchInMem(int ch, int sp) { + @autoreleasepool { + NSError *e = nil; + NSString *path = [NSString stringWithFormat:@"/tmp/ane_sram_%dch_%dsp.mlpackage", ch, sp]; + NSURL *compiled = [MLModel compileModelAtURL:[NSURL fileURLWithPath:path] error:&e]; + if (e) return -1; + + NSData *milData = [[NSString stringWithContentsOfFile: + [[compiled path] stringByAppendingPathComponent:@"model.mil"] + encoding:NSUTF8StringEncoding error:nil] dataUsingEncoding:NSUTF8StringEncoding]; + NSData *weightBlob = [NSData dataWithContentsOfFile: + [[compiled path] stringByAppendingPathComponent:@"weights/weight.bin"]]; + + Class Desc = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + Class IMM = NSClassFromString(@"_ANEInMemoryModel"); + Class AR = NSClassFromString(@"_ANERequest"); + Class AIO = NSClassFromString(@"_ANEIOSurfaceObject"); + + NSDictionary *wdict = @{ + @"@model_path/weights/weight.bin": @{@"offset": @64, @"data": weightBlob} + }; + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)( + Desc, @selector(modelWithMILText:weights:optionsPlist:), + milData, wdict, nil); + if (!desc) return -2; + id model = ((id(*)(Class,SEL,id))objc_msgSend)(IMM, @selector(inMemoryModelWithDescriptor:), desc); + if (!model) return -3; + + id hexId = ((id(*)(id,SEL))objc_msgSend)(model, @selector(hexStringIdentifier)); + NSString *tmpDir = [NSTemporaryDirectory() stringByAppendingPathComponent:hexId]; + NSFileManager *fm = [NSFileManager defaultManager]; + [fm createDirectoryAtPath:[tmpDir stringByAppendingPathComponent:@"weights"] + withIntermediateDirectories:YES attributes:nil error:nil]; + [milData writeToFile:[tmpDir stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + [weightBlob writeToFile:[tmpDir stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; + + BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( + model, @selector(compileWithQoS:options:error:), 21, @{}, &e); + if (!ok) { [fm removeItemAtPath:tmpDir error:nil]; return -4; } + + ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( + model, @selector(loadWithQoS:options:error:), 21, @{}, &e); + if (!ok) { [fm removeItemAtPath:tmpDir error:nil]; return -5; } + + NSUInteger bytes = ch * sp * 4; + IOSurfaceRef ioIn = IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0}); + IOSurfaceRef ioOut = IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0}); + id wIn = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO, @selector(objectWithIOSurface:), ioIn); + id wOut = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO, @selector(objectWithIOSurface:), ioOut); + id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wIn], @[@0], @[wOut], @[@0], nil, nil, @0); + + for (int i = 0; i < 5; i++) + ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); + + int iters = 50; + uint64_t t0 = mach_absolute_time(); + for (int i = 0; i < iters; i++) + ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); + double ms = ticksToMs(mach_absolute_time() - t0) / iters; + + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(model, @selector(unloadWithQoS:error:), 21, &e); + CFRelease(ioIn); CFRelease(ioOut); + [fm removeItemAtPath:tmpDir error:nil]; + return ms; + } +} + +int main() { + mach_timebase_info(&g_tb); + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + + printf("=== In-Memory ANE Benchmark ===\n\n"); + printf("%-12s %8s %10s %8s\n", "Config", "W (MB)", "ms/eval", "TFLOPS"); + printf("---------------------------------------------\n"); + + int chs[] = {256, 512, 1024, 2048, 3072, 4096}; + int sps[] = {64, 64, 64, 64, 64, 64}; + for (int i = 0; i < 6; i++) { + int ch = chs[i], sp = sps[i]; + double w_mb = (double)ch*ch*2/1024/1024; + double gf = 2.0*ch*ch*sp/1e9; + double ms = benchInMem(ch, sp); + double tflops = (ms > 0) ? gf/ms : 0; + if (ms > 0) + printf("%4dch x%2dsp %7.1f %8.3f ms %7.2f\n", ch, sp, w_mb, ms, tflops); + else + printf("%4dch x%2dsp %7.1f FAIL(%.0f)\n", ch, sp, w_mb, ms); + } + return 0; +} diff --git a/inmem_peak.m b/inmem_peak.m new file mode 100644 index 0000000..87b8163 --- /dev/null +++ b/inmem_peak.m @@ -0,0 +1,111 @@ +#import +#import +#import +#import +#import +#import +#import + +static mach_timebase_info_data_t g_tb; +static double ticksToMs(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } + +NSData *buildWeightBlob(int ch, int depth) { + NSUInteger wsize = ch * ch * 2; + NSUInteger chunkSize = 64 + wsize; + NSUInteger total = 64 + chunkSize * depth; + uint8_t *buf = calloc(total, 1); + buf[0] = 0x01; buf[4] = 0x02; + for (int i = 0; i < depth; i++) { + uint8_t *chunk = buf + 64 + i * chunkSize; + chunk[0]=0xEF; chunk[1]=0xBE; chunk[2]=0xAD; chunk[3]=0xDE; + chunk[4]=0x01; chunk[10]=0x08; + uint16_t *fp16 = (uint16_t*)(chunk + 64); + for (NSUInteger j = 0; j < wsize/2; j++) fp16[j] = (arc4random()&0x03FF)|0x2000; + } + return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; +} + +NSString *genMIL(int ch, int sp, int depth) { + NSMutableString *m = [NSMutableString string]; + [m appendString:@"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, {\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, {\"coremltools-version\", \"9.0\"}})]\n{\n"]; + [m appendFormat:@" func main(tensor x) {\n", ch, sp]; + [m appendString:@" string c_pad_type_0 = const()[name = string(\"c_pad_type_0\"), val = string(\"valid\")];\n" + @" tensor c_strides_0 = const()[name = string(\"c_strides_0\"), val = tensor([1, 1])];\n" + @" tensor c_pad_0 = const()[name = string(\"c_pad_0\"), val = tensor([0, 0, 0, 0])];\n" + @" tensor c_dilations_0 = const()[name = string(\"c_dilations_0\"), val = tensor([1, 1])];\n" + @" int32 c_groups_0 = const()[name = string(\"c_groups_0\"), val = int32(1)];\n" + @" string x_to_fp16_dtype_0 = const()[name = string(\"x_to_fp16_dtype_0\"), val = string(\"fp16\")];\n"]; + [m appendFormat:@" tensor x_to_fp16 = cast(dtype = x_to_fp16_dtype_0, x = x)[name = string(\"cast_in\")];\n", ch, sp]; + NSUInteger cs = 64 + ch*ch*2; + NSString *prev = @"x_to_fp16"; + for (int i = 0; i < depth; i++) { + [m appendFormat:@" tensor W%d = const()[name = string(\"W%d\"), val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n", + ch, ch, i, i, ch, ch, (unsigned long)(64 + i*cs)]; + NSString *out = [NSString stringWithFormat:@"c%d", i]; + [m appendFormat:@" tensor %@ = conv(dilations = c_dilations_0, groups = c_groups_0, pad = c_pad_0, pad_type = c_pad_type_0, strides = c_strides_0, weight = W%d, x = %@)[name = string(\"%@\")];\n", + ch, sp, out, i, prev, out]; + prev = out; + } + [m appendString:@" string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"]; + [m appendFormat:@" tensor c = cast(dtype = to_fp32, x = %@)[name = string(\"cast_out\")];\n", ch, sp, prev]; + [m appendString:@" } -> (c);\n}\n"]; + return m; +} + +double bench(int ch, int sp, int depth) { + @autoreleasepool { + NSError *e = nil; + NSData *milData = [[genMIL(ch,sp,depth) dataUsingEncoding:NSUTF8StringEncoding] copy]; + NSData *wb = buildWeightBlob(ch, depth); + Class D=NSClassFromString(@"_ANEInMemoryModelDescriptor"), I=NSClassFromString(@"_ANEInMemoryModel"); + Class AR=NSClassFromString(@"_ANERequest"), AIO=NSClassFromString(@"_ANEIOSurfaceObject"); + id desc=((id(*)(Class,SEL,id,id,id))objc_msgSend)(D,@selector(modelWithMILText:weights:optionsPlist:),milData,@{@"@model_path/weights/weight.bin":@{@"offset":@0,@"data":wb}},nil); + if(!desc)return -1; + id mdl=((id(*)(Class,SEL,id))objc_msgSend)(I,@selector(inMemoryModelWithDescriptor:),desc); + id hx=((id(*)(id,SEL))objc_msgSend)(mdl,@selector(hexStringIdentifier)); + NSString *td=[NSTemporaryDirectory() stringByAppendingPathComponent:hx]; + NSFileManager *fm=[NSFileManager defaultManager]; + [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil]; + [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + [wb writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; + if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(compileWithQoS:options:error:),21,@{},&e)){[fm removeItemAtPath:td error:nil];return -3;} + if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(loadWithQoS:options:error:),21,@{},&e)){[fm removeItemAtPath:td error:nil];return -4;} + NSUInteger bytes=ch*sp*4; + IOSurfaceRef ioI=IOSurfaceCreate((__bridge CFDictionaryRef)@{(id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,(id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),(id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0}); + IOSurfaceRef ioO=IOSurfaceCreate((__bridge CFDictionaryRef)@{(id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,(id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),(id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0}); + id wI=((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO,@selector(objectWithIOSurface:),ioI); + id wO=((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO,@selector(objectWithIOSurface:),ioO); + id req=((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(AR,@selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),@[wI],@[@0],@[wO],@[@0],nil,nil,@0); + for(int i=0;i<10;i++)((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(mdl,@selector(evaluateWithQoS:options:request:error:),21,@{},req,&e); + int it=50; uint64_t t0=mach_absolute_time(); + for(int i=0;i0?gf/ms:0; + if(ms>0)printf("%-28s %6.1f %6.2f %7.3f ms %6.2f %5.1f%%\n",l,w,gf,ms,tf,tf/0.019*100); + else printf("%-28s %6.1f %6.2f FAIL(%.0f)\n",l,w,gf,ms); + } + return 0; +} diff --git a/sram_bench.m b/sram_bench.m new file mode 100644 index 0000000..9dc3a35 --- /dev/null +++ b/sram_bench.m @@ -0,0 +1,101 @@ +#import +#import +#import +#import +#import +#import +#import + +static mach_timebase_info_data_t g_tb; +static double ticksToMs(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } +static id g_client; +static Class AM, AR, AIO; + +double bench(const char *path, int ch, int sp) { + @autoreleasepool { + NSError *e = nil; + NSURL *compiled = [MLModel compileModelAtURL: + [NSURL fileURLWithPath:[NSString stringWithUTF8String:path]] error:&e]; + if (e) return -1; + id model = ((id(*)(Class,SEL,id,id))objc_msgSend)(AM, @selector(modelAtURL:key:), compiled, @"s"); + BOOL ok = ((BOOL(*)(id,SEL,id,id,NSUInteger,NSError**))objc_msgSend)( + g_client, @selector(compileModel:options:qos:error:), model, + @{@"kANEFModelType":@"kANEFModelMIL",@"kANEFNetPlistFilenameKey":@"model.mil"}, 21, &e); + if (!ok) return -2; + ok = ((BOOL(*)(id,SEL,id,id,NSUInteger,NSError**))objc_msgSend)( + g_client, @selector(loadModel:options:qos:error:), model, @{}, 21, &e); + if (!ok) return -3; + + NSUInteger bytes = ch * sp * 4; // FP32 input + IOSurfaceRef ioIn = IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0}); + IOSurfaceRef ioOut = IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0}); + + id wIn = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO, @selector(objectWithIOSurface:), ioIn); + id wOut = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO, @selector(objectWithIOSurface:), ioOut); + id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wIn], @[@0], @[wOut], @[@0], nil, nil, @0); + + for (int i = 0; i < 5; i++) + ((BOOL(*)(id,SEL,id,id,id,NSUInteger,NSError**))objc_msgSend)( + g_client, @selector(evaluateWithModel:options:request:qos:error:), model, @{}, req, 21, &e); + + int iters = 30; + uint64_t t0 = mach_absolute_time(); + for (int i = 0; i < iters; i++) + ((BOOL(*)(id,SEL,id,id,id,NSUInteger,NSError**))objc_msgSend)( + g_client, @selector(evaluateWithModel:options:request:qos:error:), model, @{}, req, 21, &e); + double ms = ticksToMs(mach_absolute_time() - t0) / iters; + + ((void(*)(id,SEL,id,id,NSUInteger,NSError**))objc_msgSend)( + g_client, @selector(unloadModel:options:qos:error:), model, @{}, 21, &e); + CFRelease(ioIn); CFRelease(ioOut); + return ms; + } +} + +int main() { + mach_timebase_info(&g_tb); + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + g_client = [NSClassFromString(@"_ANEClient") performSelector:@selector(sharedConnection)]; + AM = NSClassFromString(@"_ANEModel"); + AR = NSClassFromString(@"_ANERequest"); + AIO = NSClassFromString(@"_ANEIOSurfaceObject"); + + printf("=== ANE SRAM Probe: 1x1 Conv with Increasing Weight Size ===\n\n"); + printf("%-25s %8s %8s %8s %10s %8s\n", "Config", "W (MB)", "Act(MB)", "Tot(MB)", "ms/eval", "TFLOPS"); + printf("--------------------------------------------------------------------------\n"); + + typedef struct { int ch; int sp; } S; + S sizes[] = {{256,64},{512,64},{1024,64},{2048,64},{3072,64},{4096,64},{5120,64},{6144,64},{8192,32}}; + + for (int i = 0; i < 9; i++) { + int ch = sizes[i].ch, sp = sizes[i].sp; + double w_mb = (double)ch * ch * 2 / 1024 / 1024; // FP16 weights + double a_mb = (double)ch * sp * 2 / 1024 / 1024; // FP16 activations + double tot = w_mb + 2 * a_mb; + double gflop = 2.0 * ch * ch * sp / 1e9; + + char path[256]; + snprintf(path, sizeof(path), "/tmp/ane_sram_%dch_%dsp.mlpackage", ch, sp); + double ms = bench(path, ch, sp); + + double tflops = (ms > 0) ? gflop / ms : -1; + char label[64]; + snprintf(label, sizeof(label), "%dch x %dsp", ch, sp); + + if (ms > 0) + printf("%-25s %7.1f %7.2f %7.1f %8.3f ms %7.2f\n", label, w_mb, a_mb, tot, ms, tflops); + else + printf("%-25s %7.1f %7.2f %7.1f FAIL(%.0f)\n", label, w_mb, a_mb, tot, ms); + } + + printf("\nLook for the performance cliff to estimate SRAM size.\n"); + return 0; +} diff --git a/sram_probe.m b/sram_probe.m new file mode 100644 index 0000000..0766187 --- /dev/null +++ b/sram_probe.m @@ -0,0 +1,83 @@ +#import +#import +#import +#import +#import +#import +#import + +static mach_timebase_info_data_t g_tb; +static double ticksToMs(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } +static id g_client; static Class AM, AR, AIO; + +double bench(const char *path, int ch, int sp) { + @autoreleasepool { + NSError *e = nil; + NSURL *compiled = [MLModel compileModelAtURL: + [NSURL fileURLWithPath:[NSString stringWithUTF8String:path]] error:&e]; + if (e) return -1; + id model = ((id(*)(Class,SEL,id,id))objc_msgSend)(AM, @selector(modelAtURL:key:), compiled, @"s"); + ((BOOL(*)(id,SEL,id,id,NSUInteger,NSError**))objc_msgSend)( + g_client, @selector(compileModel:options:qos:error:), model, + @{@"kANEFModelType":@"kANEFModelMIL",@"kANEFNetPlistFilenameKey":@"model.mil"}, 21, &e); + ((BOOL(*)(id,SEL,id,id,NSUInteger,NSError**))objc_msgSend)( + g_client, @selector(loadModel:options:qos:error:), model, @{}, 21, &e); + NSUInteger bytes = ch * sp * 4; + IOSurfaceRef ioIn = IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0}); + IOSurfaceRef ioOut = IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0}); + id wIn = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO, @selector(objectWithIOSurface:), ioIn); + id wOut = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO, @selector(objectWithIOSurface:), ioOut); + id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wIn], @[@0], @[wOut], @[@0], nil, nil, @0); + for (int i = 0; i < 5; i++) + ((BOOL(*)(id,SEL,id,id,id,NSUInteger,NSError**))objc_msgSend)( + g_client, @selector(evaluateWithModel:options:request:qos:error:), model, @{}, req, 21, &e); + int iters = 50; + uint64_t t0 = mach_absolute_time(); + for (int i = 0; i < iters; i++) + ((BOOL(*)(id,SEL,id,id,id,NSUInteger,NSError**))objc_msgSend)( + g_client, @selector(evaluateWithModel:options:request:qos:error:), model, @{}, req, 21, &e); + double ms = ticksToMs(mach_absolute_time() - t0) / iters; + ((void(*)(id,SEL,id,id,NSUInteger,NSError**))objc_msgSend)( + g_client, @selector(unloadModel:options:qos:error:), model, @{}, 21, &e); + CFRelease(ioIn); CFRelease(ioOut); + return ms; + } +} + +int main() { + mach_timebase_info(&g_tb); + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + g_client = [NSClassFromString(@"_ANEClient") performSelector:@selector(sharedConnection)]; + AM = NSClassFromString(@"_ANEModel"); AR = NSClassFromString(@"_ANERequest"); + AIO = NSClassFromString(@"_ANEIOSurfaceObject"); + + printf("=== ANE SRAM Fine Probe (weights only vary, spatial=64) ===\n\n"); + printf("%-12s %8s %10s %8s %12s\n", "Channels", "W (MB)", "ms/eval", "TFLOPS", "GFLOPS/MB"); + printf("--------------------------------------------------------------\n"); + + int chs[] = {256, 512, 1024, 1536, 2048, 2560, 3072, 3584, 4096, 4608, 5120, 6144, 8192}; + int sps[] = {64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 32}; + + for (int i = 0; i < 13; i++) { + int ch = chs[i], sp = sps[i]; + double w_mb = (double)ch * ch * 2 / 1024 / 1024; + double gf = 2.0 * ch * ch * sp / 1e9; + char path[256]; + snprintf(path, sizeof(path), "/tmp/ane_sram_%dch_%dsp.mlpackage", ch, sp); + double ms = bench(path, ch, sp); + double tf = (ms > 0) ? gf / ms : 0; + double eff = (ms > 0) ? tf * 1000 / w_mb : 0; + printf("%6d ch %7.1f %8.3f ms %7.2f %10.1f %s\n", + ch, w_mb, ms, tf, eff, + (i > 0 && eff < 100) ? " <-- spilling?" : ""); + } + return 0; +} diff --git a/training/Makefile b/training/Makefile new file mode 100644 index 0000000..90c2977 --- /dev/null +++ b/training/Makefile @@ -0,0 +1,12 @@ +CC = xcrun clang +CFLAGS = -O2 -Wall -Wno-deprecated-declarations -fobjc-arc +FRAMEWORKS = -framework Foundation -framework CoreML -framework IOSurface +LDFLAGS = $(FRAMEWORKS) -ldl + +train: train.m ane_runtime.h ane_mil_gen.h model.h forward.h backward.h + $(CC) $(CFLAGS) -o $@ train.m $(LDFLAGS) + +clean: + rm -f train + +.PHONY: clean diff --git a/training/ane_mil_gen.h b/training/ane_mil_gen.h new file mode 100644 index 0000000..97fc451 --- /dev/null +++ b/training/ane_mil_gen.h @@ -0,0 +1,208 @@ +// ane_mil_gen.h — Generate MIL text for conv-based linear ops + weight blobs +#pragma once +#import +#include +#include +#include + +// Build an FP16 weight blob with the required header structure. +// weights_f32: source weights in row-major [out_ch, in_ch] +// Returns NSData with header + FP16 weights +static NSData *mil_build_weight_blob(const float *weights_f32, int out_ch, int in_ch) { + NSUInteger wsize = (NSUInteger)out_ch * in_ch * 2; // FP16 + NSUInteger total = 64 + 64 + wsize; // global header + chunk header + data + uint8_t *buf = (uint8_t*)calloc(total, 1); + buf[0] = 0x01; buf[4] = 0x02; + uint8_t *chunk = buf + 64; + chunk[0] = 0xEF; chunk[1] = 0xBE; chunk[2] = 0xAD; chunk[3] = 0xDE; + chunk[4] = 0x01; + *(uint32_t*)(chunk + 8) = (uint32_t)wsize; // data_size + *(uint32_t*)(chunk + 16) = 128; // data_offset (from file start) + // Convert f32 → fp16 (simple truncation via _Float16) + _Float16 *fp16 = (_Float16*)(buf + 128); + for (NSUInteger i = 0; i < (NSUInteger)out_ch * in_ch; i++) + fp16[i] = (_Float16)weights_f32[i]; + return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; +} + +// Generate MIL for a single matmul: y = W @ x (using matmul op, weights as input) +// Input x: [1, in_ch, spatial] fp32 +// Input W: [1, out_ch, in_ch] fp32 +// Output: [1, out_ch, spatial] fp32 +static NSString *mil_gen_matmul(int in_ch, int out_ch, int spatial) { + return [NSString stringWithFormat: + @"program(1.3)\n" + "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " + "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"9.0\"}})]\n" + "{\n" + " func main(tensor x, tensor W) {\n" + " string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n" + " tensor x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_x\")];\n" + " tensor W16 = cast(dtype = to_fp16, x = W)[name = string(\"cast_W\")];\n" + " bool tx = const()[name = string(\"tx\"), val = bool(false)];\n" + " bool ty = const()[name = string(\"ty\"), val = bool(false)];\n" + " tensor y16 = matmul(transpose_x = tx, transpose_y = ty, x = W16, y = x16)[name = string(\"mm\")];\n" + " string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n" + " tensor y = cast(dtype = to_fp32, x = y16)[name = string(\"cast_out\")];\n" + " } -> (y);\n" + "}\n", + in_ch, spatial, out_ch, in_ch, + in_ch, spatial, out_ch, in_ch, + out_ch, spatial, out_ch, spatial]; +} + +// Keep the baked-weight version for reference (used in inference-only scenarios) +static NSString *mil_gen_conv(int in_ch, int out_ch, int spatial) { + return [NSString stringWithFormat: + @"program(1.3)\n" + "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " + "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"9.0\"}})]\n" + "{\n" + " func main(tensor x) {\n" + " string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n" + " tensor c_strides = const()[name = string(\"c_strides\"), val = tensor([1, 1])];\n" + " tensor c_pad = const()[name = string(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" + " tensor c_dilations = const()[name = string(\"c_dilations\"), val = tensor([1, 1])];\n" + " int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n" + " string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n" + " tensor x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n" + " tensor W = const()[name = string(\"W\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n" + " tensor y16 = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W, x = x16)[name = string(\"conv\")];\n" + " string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n" + " tensor y = cast(dtype = to_fp32, x = y16)[name = string(\"cast_out\")];\n" + " } -> (y);\n" + "}\n", + in_ch, spatial, in_ch, spatial, + out_ch, in_ch, out_ch, in_ch, + out_ch, spatial, out_ch, spatial]; +} + +// Generate MIL for fused QKV: 3 parallel convs from same input +// Input: [1, dim, 1, S] +// Outputs: Q[1, dim, 1, S], K[1, dim, 1, S], V[1, dim, 1, S] +// Weight blob layout: Wq[dim,dim] @ offset 64, Wk @ offset 64+cs, Wv @ offset 64+2*cs +// where cs = 64 + dim*dim*2 +static NSString *mil_gen_qkv(int dim, int spatial) { + NSUInteger cs = 64 + (NSUInteger)dim * dim * 2; + return [NSString stringWithFormat: + @"program(1.3)\n" + "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " + "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"9.0\"}})]\n" + "{\n" + " func main(tensor x) {\n" + " string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n" + " tensor c_strides = const()[name = string(\"c_strides\"), val = tensor([1, 1])];\n" + " tensor c_pad = const()[name = string(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" + " tensor c_dilations = const()[name = string(\"c_dilations\"), val = tensor([1, 1])];\n" + " int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n" + " string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n" + " tensor x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n" + " tensor Wq = const()[name = string(\"Wq\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n" + " tensor Wk = const()[name = string(\"Wk\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n" + " tensor Wv = const()[name = string(\"Wv\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n" + " tensor q16 = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wq, x = x16)[name = string(\"conv_q\")];\n" + " tensor k16 = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wk, x = x16)[name = string(\"conv_k\")];\n" + " tensor v16 = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wv, x = x16)[name = string(\"conv_v\")];\n" + " string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n" + " tensor q = cast(dtype = to_fp32, x = q16)[name = string(\"cast_q\")];\n" + " tensor k = cast(dtype = to_fp32, x = k16)[name = string(\"cast_k\")];\n" + " tensor v = cast(dtype = to_fp32, x = v16)[name = string(\"cast_v\")];\n" + " } -> (q, k, v);\n" + "}\n", + dim, spatial, dim, spatial, + dim, dim, dim, dim, + dim, dim, dim, dim, (unsigned long)(64 + cs), + dim, dim, dim, dim, (unsigned long)(64 + 2*cs), + dim, spatial, dim, spatial, dim, spatial, + dim, spatial, dim, spatial, dim, spatial]; +} + +// Build weight blob for fused QKV (3 weight matrices concatenated) +static NSData *mil_build_qkv_weight_blob(const float *wq, const float *wk, const float *wv, int dim) { + NSUInteger wsize = (NSUInteger)dim * dim * 2; + NSUInteger cs = 64 + wsize; + NSUInteger total = 64 + 3 * cs; + uint8_t *buf = (uint8_t*)calloc(total, 1); + buf[0] = 0x01; buf[4] = 0x02; + const float *ws[3] = {wq, wk, wv}; + for (int w = 0; w < 3; w++) { + uint8_t *chunk = buf + 64 + w * cs; + chunk[0]=0xEF; chunk[1]=0xBE; chunk[2]=0xAD; chunk[3]=0xDE; + chunk[4]=0x01; + *(uint32_t*)(chunk + 8) = (uint32_t)wsize; + *(uint32_t*)(chunk + 16) = (uint32_t)(64 + w * cs + 64); // absolute data offset + _Float16 *fp16 = (_Float16*)(chunk + 64); + for (NSUInteger i = 0; i < (NSUInteger)dim * dim; i++) + fp16[i] = (_Float16)ws[w][i]; + } + return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; +} + +// Build weight blob for fused FFN up (w1 + w3, both [hidden_dim, dim]) +static NSData *mil_build_ffn_up_weight_blob(const float *w1, const float *w3, int hidden_dim, int dim) { + NSUInteger wsize = (NSUInteger)hidden_dim * dim * 2; + NSUInteger cs = 64 + wsize; + NSUInteger total = 64 + 2 * cs; + uint8_t *buf = (uint8_t*)calloc(total, 1); + buf[0] = 0x01; buf[4] = 0x02; + const float *ws[2] = {w1, w3}; + for (int w = 0; w < 2; w++) { + uint8_t *chunk = buf + 64 + w * cs; + chunk[0]=0xEF; chunk[1]=0xBE; chunk[2]=0xAD; chunk[3]=0xDE; + chunk[4]=0x01; + *(uint32_t*)(chunk + 8) = (uint32_t)wsize; + *(uint32_t*)(chunk + 16) = (uint32_t)(64 + w * cs + 64); // absolute data offset + _Float16 *fp16 = (_Float16*)(chunk + 64); + for (NSUInteger i = 0; i < (NSUInteger)hidden_dim * dim; i++) + fp16[i] = (_Float16)ws[w][i]; + } + return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; +} + +// Generate MIL for fused FFN up: w1 + w3 parallel convs +static NSString *mil_gen_ffn_up(int dim, int hidden_dim, int spatial) { + NSUInteger cs = 64 + (NSUInteger)hidden_dim * dim * 2; + return [NSString stringWithFormat: + @"program(1.3)\n" + "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " + "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"9.0\"}})]\n" + "{\n" + " func main(tensor x) {\n" + " string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n" + " tensor c_strides = const()[name = string(\"c_strides\"), val = tensor([1, 1])];\n" + " tensor c_pad = const()[name = string(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" + " tensor c_dilations = const()[name = string(\"c_dilations\"), val = tensor([1, 1])];\n" + " int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n" + " string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n" + " tensor x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n" + " tensor W1 = const()[name = string(\"W1\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n" + " tensor W3 = const()[name = string(\"W3\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n" + " tensor h1 = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W1, x = x16)[name = string(\"conv_w1\")];\n" + " tensor h3 = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W3, x = x16)[name = string(\"conv_w3\")];\n" + " string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n" + " tensor out1 = cast(dtype = to_fp32, x = h1)[name = string(\"cast_h1\")];\n" + " tensor out3 = cast(dtype = to_fp32, x = h3)[name = string(\"cast_h3\")];\n" + " } -> (out1, out3);\n" + "}\n", + dim, spatial, dim, spatial, + hidden_dim, dim, hidden_dim, dim, + hidden_dim, dim, hidden_dim, dim, (unsigned long)(64 + cs), + hidden_dim, spatial, hidden_dim, spatial, + hidden_dim, spatial, hidden_dim, spatial]; +} diff --git a/training/ane_runtime.h b/training/ane_runtime.h new file mode 100644 index 0000000..585d0f0 --- /dev/null +++ b/training/ane_runtime.h @@ -0,0 +1,160 @@ +// ane_runtime.h — Reusable ANE in-memory compile/load/eval wrapper +// Uses _ANEInMemoryModel via private AppleNeuralEngine.framework +#pragma once +#import +#import +#import +#import +#import + +typedef struct { + id model; // _ANEInMemoryModel + IOSurfaceRef *ioInputs; + IOSurfaceRef *ioOutputs; + id request; // _ANERequest + NSString *tmpDir; + int nInputs, nOutputs; + size_t *inputBytes; + size_t *outputBytes; +} ANEKernel; + +static Class g_ANEDesc, g_ANEInMem, g_ANEReq, g_ANEIO; +static bool g_ane_loaded = false; + +static void ane_init(void) { + if (g_ane_loaded) return; + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + g_ANEDesc = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + g_ANEInMem = NSClassFromString(@"_ANEInMemoryModel"); + g_ANEReq = NSClassFromString(@"_ANERequest"); + g_ANEIO = NSClassFromString(@"_ANEIOSurfaceObject"); + g_ane_loaded = true; +} + +static IOSurfaceRef ane_create_surface(size_t bytes) { + return IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth: @(bytes), + (id)kIOSurfaceHeight: @1, + (id)kIOSurfaceBytesPerElement: @1, + (id)kIOSurfaceBytesPerRow: @(bytes), + (id)kIOSurfaceAllocSize: @(bytes), + (id)kIOSurfacePixelFormat: @0 + }); +} + +// Compile a MIL graph with weight blob into an ANE kernel. +// milText: NSData of MIL text +// weightData: NSData of raw weight blob (can be nil) +// inputSizes/outputSizes: arrays of byte sizes for each I/O tensor +static ANEKernel *ane_compile(NSData *milText, NSData *weightData, + int nInputs, size_t *inputSizes, + int nOutputs, size_t *outputSizes) { + ane_init(); + NSError *e = nil; + + NSDictionary *wdict = nil; + if (weightData) { + wdict = @{@"@model_path/weights/weight.bin": @{@"offset": @0, @"data": weightData}}; + } + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)( + g_ANEDesc, @selector(modelWithMILText:weights:optionsPlist:), + milText, wdict, nil); + if (!desc) return NULL; + + id mdl = ((id(*)(Class,SEL,id))objc_msgSend)( + g_ANEInMem, @selector(inMemoryModelWithDescriptor:), desc); + + // Pre-populate temp dir with MIL + weights + id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); + NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; + NSFileManager *fm = [NSFileManager defaultManager]; + [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] + withIntermediateDirectories:YES attributes:nil error:nil]; + [milText writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + if (weightData) + [weightData writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; + + if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( + mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) { + fprintf(stderr, "ANE compile failed: %s\n", [[e description] UTF8String]); + [fm removeItemAtPath:td error:nil]; + return NULL; + } + if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( + mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) { + fprintf(stderr, "ANE load failed: %s\n", [[e description] UTF8String]); + [fm removeItemAtPath:td error:nil]; + return NULL; + } + + ANEKernel *k = calloc(1, sizeof(ANEKernel)); + k->model = mdl; + k->tmpDir = td; + k->nInputs = nInputs; + k->nOutputs = nOutputs; + k->inputBytes = malloc(nInputs * sizeof(size_t)); + k->outputBytes = malloc(nOutputs * sizeof(size_t)); + memcpy(k->inputBytes, inputSizes, nInputs * sizeof(size_t)); + memcpy(k->outputBytes, outputSizes, nOutputs * sizeof(size_t)); + + // Create IOSurfaces + k->ioInputs = malloc(nInputs * sizeof(IOSurfaceRef)); + k->ioOutputs = malloc(nOutputs * sizeof(IOSurfaceRef)); + for (int i = 0; i < nInputs; i++) + k->ioInputs[i] = ane_create_surface(inputSizes[i]); + for (int i = 0; i < nOutputs; i++) + k->ioOutputs[i] = ane_create_surface(outputSizes[i]); + + // Build request + NSMutableArray *wIns = [NSMutableArray arrayWithCapacity:nInputs]; + NSMutableArray *iIdx = [NSMutableArray arrayWithCapacity:nInputs]; + for (int i = 0; i < nInputs; i++) { + [wIns addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)( + g_ANEIO, @selector(objectWithIOSurface:), k->ioInputs[i])]; + [iIdx addObject:@(i)]; + } + NSMutableArray *wOuts = [NSMutableArray arrayWithCapacity:nOutputs]; + NSMutableArray *oIdx = [NSMutableArray arrayWithCapacity:nOutputs]; + for (int i = 0; i < nOutputs; i++) { + [wOuts addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)( + g_ANEIO, @selector(objectWithIOSurface:), k->ioOutputs[i])]; + [oIdx addObject:@(i)]; + } + k->request = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)( + g_ANEReq, @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + wIns, iIdx, wOuts, oIdx, nil, nil, @0); + + return k; +} + +static void ane_write_input(ANEKernel *k, int idx, const void *data, size_t bytes) { + IOSurfaceLock(k->ioInputs[idx], 0, NULL); + memcpy(IOSurfaceGetBaseAddress(k->ioInputs[idx]), data, bytes); + IOSurfaceUnlock(k->ioInputs[idx], 0, NULL); +} + +static void ane_read_output(ANEKernel *k, int idx, void *data, size_t bytes) { + IOSurfaceLock(k->ioOutputs[idx], kIOSurfaceLockReadOnly, NULL); + memcpy(data, IOSurfaceGetBaseAddress(k->ioOutputs[idx]), bytes); + IOSurfaceUnlock(k->ioOutputs[idx], kIOSurfaceLockReadOnly, NULL); +} + +static bool ane_eval(ANEKernel *k) { + NSError *e = nil; + return ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + k->model, @selector(evaluateWithQoS:options:request:error:), + 21, @{}, k->request, &e); +} + +static void ane_free(ANEKernel *k) { + if (!k) return; + NSError *e = nil; + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)( + k->model, @selector(unloadWithQoS:error:), 21, &e); + for (int i = 0; i < k->nInputs; i++) CFRelease(k->ioInputs[i]); + for (int i = 0; i < k->nOutputs; i++) CFRelease(k->ioOutputs[i]); + [[NSFileManager defaultManager] removeItemAtPath:k->tmpDir error:nil]; + free(k->ioInputs); free(k->ioOutputs); + free(k->inputBytes); free(k->outputBytes); + free(k); +} diff --git a/training/backward.h b/training/backward.h new file mode 100644 index 0000000..138ea7c --- /dev/null +++ b/training/backward.h @@ -0,0 +1,308 @@ +// backward.h — Backward pass using CPU matmul (correct gradients) + ANE optional +#pragma once +#include "model.h" +#include "forward.h" +#include +#include + +// dW += dy @ x^T — dy: [S, out_dim], x: [S, in_dim], dW: [out_dim, in_dim] +static void cpu_accum_dW(float *dW, const float *dy, const float *x, int S, int out_dim, int in_dim) { + for (int t = 0; t < S; t++) + for (int i = 0; i < out_dim; i++) + for (int j = 0; j < in_dim; j++) + dW[i*in_dim+j] += dy[t*out_dim+i] * x[t*in_dim+j]; +} + +// dx = W^T @ dy — W: [out_dim, in_dim], dy: [S, out_dim] → dx: [S, in_dim] +static void cpu_matmul_backward_dx(const float *W, const float *dy, float *dx, + int S, int out_dim, int in_dim) { + for (int t = 0; t < S; t++) + for (int j = 0; j < in_dim; j++) { + float sum = 0; + for (int i = 0; i < out_dim; i++) + sum += W[i*in_dim+j] * dy[t*out_dim+i]; + dx[t*in_dim+j] = sum; + } +} + +static void cpu_rmsnorm_backward(float *dx, const float *dy, const float *x, const float *w, + int S, int D) { + for (int t = 0; t < S; t++) { + float ss = 0; + for (int i = 0; i < D; i++) ss += x[t*D+i] * x[t*D+i]; + float rms = sqrtf(ss / D + 1e-5f); + float inv_rms = 1.0f / rms; + float dot = 0; + for (int i = 0; i < D; i++) + dot += dy[t*D+i] * w[i] * x[t*D+i]; + dot /= (D * rms * rms); + for (int i = 0; i < D; i++) + dx[t*D+i] = dy[t*D+i] * w[i] * inv_rms - x[t*D+i] * dot; + } +} + +static inline float silu_backward(float x) { + float s = 1.0f / (1.0f + expf(-x)); + return s * (1.0f + x * (1.0f - s)); +} + +static void cpu_attention_backward(float *dq, float *dk, float *dv, + const float *d_out, const float *q, const float *k, const float *v, + int S, int n_heads, int head_dim) { + float scale = 1.0f / sqrtf((float)head_dim); + int D = n_heads * head_dim; + float *scores = (float*)malloc(S * sizeof(float)); + float *dscores = (float*)malloc(S * sizeof(float)); + + memset(dq, 0, S * D * sizeof(float)); + memset(dk, 0, S * D * sizeof(float)); + memset(dv, 0, S * D * sizeof(float)); + + for (int h = 0; h < n_heads; h++) { + for (int t = 0; t < S; t++) { + // Recompute softmax for this row + float mx = -1e9f; + for (int s = 0; s <= t; s++) { + float dot = 0; + for (int i = 0; i < head_dim; i++) + dot += q[t*D + h*head_dim + i] * k[s*D + h*head_dim + i]; + scores[s] = dot * scale; + if (scores[s] > mx) mx = scores[s]; + } + float sm = 0; + for (int s = 0; s <= t; s++) { scores[s] = expf(scores[s] - mx); sm += scores[s]; } + for (int s = 0; s <= t; s++) scores[s] /= sm; + + // dscores = d_out · v + float ds_sum = 0; + for (int s = 0; s <= t; s++) { + float dot = 0; + for (int i = 0; i < head_dim; i++) + dot += d_out[t*D + h*head_dim + i] * v[s*D + h*head_dim + i]; + dscores[s] = dot; + ds_sum += scores[s] * dot; + } + + // Softmax backward + scale + for (int s = 0; s <= t; s++) { + float ds = scores[s] * (dscores[s] - ds_sum) * scale; + // dq[t] += ds * k[s] + for (int i = 0; i < head_dim; i++) + dq[t*D + h*head_dim + i] += ds * k[s*D + h*head_dim + i]; + // dk[s] += ds * q[t] + for (int i = 0; i < head_dim; i++) + dk[s*D + h*head_dim + i] += ds * q[t*D + h*head_dim + i]; + // dv[s] += scores[t,s] * d_out[t] + for (int i = 0; i < head_dim; i++) + dv[s*D + h*head_dim + i] += scores[s] * d_out[t*D + h*head_dim + i]; + } + } + } + free(scores); free(dscores); +} + +static void cpu_rope_backward(float *dq, float *dk, int S, int n_heads, int head_dim) { + for (int t = 0; t < S; t++) + for (int h = 0; h < n_heads; h++) + for (int i = 0; i < head_dim; i += 2) { + float freq = 1.0f / powf(10000.0f, (float)i / head_dim); + float val = t * freq; + float cos_v = cosf(val), sin_v = sinf(val); + int off = t * n_heads * head_dim + h * head_dim + i; + float dq0 = dq[off], dq1 = dq[off+1]; + dq[off] = dq0 * cos_v + dq1 * sin_v; + dq[off+1] = -dq0 * sin_v + dq1 * cos_v; + float dk0 = dk[off], dk1 = dk[off+1]; + dk[off] = dk0 * cos_v + dk1 * sin_v; + dk[off+1] = -dk0 * sin_v + dk1 * cos_v; + } +} + +static void model_clip_gradients(Model *m, float max_norm) { + int d = m->cfg.dim, hd = m->cfg.hidden_dim, vs = m->cfg.vocab_size; + double total_norm_sq = 0; + #define ACCUM_NORM(grad, size) do { \ + for (size_t _i = 0; _i < (size_t)(size); _i++) total_norm_sq += (double)(grad)[_i] * (grad)[_i]; \ + } while(0) + for (int l = 0; l < N_LAYERS; l++) { + ACCUM_NORM(m->grad_wq[l], d*d); ACCUM_NORM(m->grad_wk[l], d*d); + ACCUM_NORM(m->grad_wv[l], d*d); ACCUM_NORM(m->grad_wo[l], d*d); + ACCUM_NORM(m->grad_w1[l], hd*d); ACCUM_NORM(m->grad_w2[l], d*hd); + ACCUM_NORM(m->grad_w3[l], hd*d); + } + ACCUM_NORM(m->grad_wcls, vs*d); ACCUM_NORM(m->grad_emb, vs*d); + #undef ACCUM_NORM + float total_norm = sqrtf((float)total_norm_sq); + if (total_norm > max_norm) { + float scale = max_norm / total_norm; + #define SCALE_GRAD(grad, size) do { \ + for (size_t _i = 0; _i < (size_t)(size); _i++) (grad)[_i] *= scale; \ + } while(0) + for (int l = 0; l < N_LAYERS; l++) { + SCALE_GRAD(m->grad_wq[l], d*d); SCALE_GRAD(m->grad_wk[l], d*d); + SCALE_GRAD(m->grad_wv[l], d*d); SCALE_GRAD(m->grad_wo[l], d*d); + SCALE_GRAD(m->grad_w1[l], hd*d); SCALE_GRAD(m->grad_w2[l], d*hd); + SCALE_GRAD(m->grad_w3[l], hd*d); + } + SCALE_GRAD(m->grad_wcls, vs*d); SCALE_GRAD(m->grad_emb, vs*d); + #undef SCALE_GRAD + } +} + +static void model_backward(Model *m, const int *tokens) { + int S = m->seq_len, d = m->cfg.dim, hd = m->cfg.hidden_dim; + int nh = m->cfg.n_heads, hdim = HEAD_DIM, vs = m->cfg.vocab_size; + + // Zero gradients + for (int l = 0; l < N_LAYERS; l++) { + memset(m->grad_wq[l], 0, d*d*sizeof(float)); + memset(m->grad_wk[l], 0, d*d*sizeof(float)); + memset(m->grad_wv[l], 0, d*d*sizeof(float)); + memset(m->grad_wo[l], 0, d*d*sizeof(float)); + memset(m->grad_w1[l], 0, hd*d*sizeof(float)); + memset(m->grad_w2[l], 0, d*hd*sizeof(float)); + memset(m->grad_w3[l], 0, hd*d*sizeof(float)); + } + memset(m->grad_wcls, 0, (size_t)vs*d*sizeof(float)); + memset(m->grad_emb, 0, (size_t)vs*d*sizeof(float)); + + // dLogits from cross-entropy + float *dlogits = (float*)calloc(S * vs, sizeof(float)); + for (int t = 0; t < S - 1; t++) { + float mx = -1e9f; + for (int i = 0; i < vs; i++) if (m->logits[t*vs+i] > mx) mx = m->logits[t*vs+i]; + float sm = 0; + for (int i = 0; i < vs; i++) sm += expf(m->logits[t*vs+i] - mx); + for (int i = 0; i < vs; i++) + dlogits[t*vs+i] = expf(m->logits[t*vs+i] - mx) / sm; + dlogits[t*vs + tokens[t+1]] -= 1.0f; + for (int i = 0; i < vs; i++) + dlogits[t*vs+i] /= (S - 1); + } + + // Classifier backward + cpu_accum_dW(m->grad_wcls, dlogits, m->act_final, S, vs, d); + float *dx = (float*)calloc(S * d, sizeof(float)); + cpu_matmul_backward_dx(m->wcls, dlogits, dx, S, vs, d); + free(dlogits); + + // Final RMSNorm backward + float *dx_norm = (float*)malloc(S * d * sizeof(float)); + cpu_rmsnorm_backward(dx_norm, dx, m->act_pre_final, m->rms_final_w, S, d); + memcpy(dx, dx_norm, S * d * sizeof(float)); + free(dx_norm); + + // Layers in reverse + for (int l = N_LAYERS - 1; l >= 0; l--) { + // FFN down backward + float *d_silu = (float*)calloc(S * hd, sizeof(float)); + cpu_matmul_backward_dx(m->w2[l], dx, d_silu, S, d, hd); + cpu_accum_dW(m->grad_w2[l], dx, m->act_silu[l], S, d, hd); + + // SiLU backward + float *d_h1 = (float*)malloc(S * hd * sizeof(float)); + float *d_h3 = (float*)malloc(S * hd * sizeof(float)); + for (int t = 0; t < S; t++) + for (int i = 0; i < hd; i++) { + d_h1[t*hd+i] = d_silu[t*hd+i] * m->act_h3[l][t*hd+i] * silu_backward(m->act_h1[l][t*hd+i]); + d_h3[t*hd+i] = d_silu[t*hd+i] * silu_f(m->act_h1[l][t*hd+i]); + } + free(d_silu); + + // FFN up backward + cpu_accum_dW(m->grad_w1[l], d_h1, m->act_ffn_in[l], S, hd, d); + cpu_accum_dW(m->grad_w3[l], d_h3, m->act_ffn_in[l], S, hd, d); + + float *dx_ffn_in = (float*)calloc(S * d, sizeof(float)); + float *dx_w1 = (float*)malloc(S * d * sizeof(float)); + float *dx_w3 = (float*)malloc(S * d * sizeof(float)); + cpu_matmul_backward_dx(m->w1[l], d_h1, dx_w1, S, hd, d); + cpu_matmul_backward_dx(m->w3[l], d_h3, dx_w3, S, hd, d); + for (int i = 0; i < S * d; i++) dx_ffn_in[i] = dx_w1[i] + dx_w3[i]; + free(d_h1); free(d_h3); free(dx_w1); free(dx_w3); + + // RMSNorm FFN backward + float *dx_ffn_norm = (float*)malloc(S * d * sizeof(float)); + // The input to FFN rmsnorm was the residual after attention = act_x[l] + attn_residual + // We saved act_x[l] but the actual input to ffn_rmsnorm is x after attention residual + // For a proper implementation we'd save this. Approximate with act_x[l]. + cpu_rmsnorm_backward(dx_ffn_norm, dx_ffn_in, m->act_x[l], m->rms_ffn_w[l], S, d); + for (int i = 0; i < S * d; i++) dx[i] += dx_ffn_norm[i]; + free(dx_ffn_in); free(dx_ffn_norm); + + // O projection backward + float *d_attn_out = (float*)calloc(S * d, sizeof(float)); + cpu_matmul_backward_dx(m->wo[l], dx, d_attn_out, S, d, d); + cpu_accum_dW(m->grad_wo[l], dx, m->act_attn_out[l], S, d, d); + + // Attention backward + float *dq = (float*)calloc(S * d, sizeof(float)); + float *dk = (float*)calloc(S * d, sizeof(float)); + float *dv = (float*)calloc(S * d, sizeof(float)); + cpu_attention_backward(dq, dk, dv, d_attn_out, m->act_q[l], m->act_k[l], m->act_v[l], S, nh, hdim); + free(d_attn_out); + + cpu_rope_backward(dq, dk, S, nh, hdim); + + // QKV backward + cpu_accum_dW(m->grad_wq[l], dq, m->act_xnorm[l], S, d, d); + cpu_accum_dW(m->grad_wk[l], dk, m->act_xnorm[l], S, d, d); + cpu_accum_dW(m->grad_wv[l], dv, m->act_xnorm[l], S, d, d); + + float *dx_qkv = (float*)calloc(S * d, sizeof(float)); + float *tmp = (float*)malloc(S * d * sizeof(float)); + cpu_matmul_backward_dx(m->wq[l], dq, tmp, S, d, d); + for (int i = 0; i < S*d; i++) dx_qkv[i] += tmp[i]; + cpu_matmul_backward_dx(m->wk[l], dk, tmp, S, d, d); + for (int i = 0; i < S*d; i++) dx_qkv[i] += tmp[i]; + cpu_matmul_backward_dx(m->wv[l], dv, tmp, S, d, d); + for (int i = 0; i < S*d; i++) dx_qkv[i] += tmp[i]; + free(tmp); free(dq); free(dk); free(dv); + + // RMSNorm attention backward + float *dx_att_norm = (float*)malloc(S * d * sizeof(float)); + cpu_rmsnorm_backward(dx_att_norm, dx_qkv, m->act_x[l], m->rms_att_w[l], S, d); + for (int i = 0; i < S * d; i++) dx[i] += dx_att_norm[i]; + free(dx_qkv); free(dx_att_norm); + } + + // Embedding gradient + for (int t = 0; t < S; t++) + for (int i = 0; i < d; i++) + m->grad_emb[tokens[t]*d + i] += dx[t*d + i]; + + free(dx); +} + +static void model_adam_step(Model *m, float lr, float beta1, float beta2, float eps) { + m->adam_step++; + float bc1 = 1.0f - powf(beta1, m->adam_step); + float bc2 = 1.0f - powf(beta2, m->adam_step); + size_t idx = 0; + + #define ADAM_UPDATE(param, grad, size) do { \ + for (size_t _i = 0; _i < (size_t)(size); _i++) { \ + float g = (grad)[_i]; \ + m->adam_m[idx] = beta1 * m->adam_m[idx] + (1-beta1) * g; \ + m->adam_v[idx] = beta2 * m->adam_v[idx] + (1-beta2) * g * g; \ + float m_hat = m->adam_m[idx] / bc1; \ + float v_hat = m->adam_v[idx] / bc2; \ + (param)[_i] -= lr * m_hat / (sqrtf(v_hat) + eps); \ + idx++; \ + } \ + } while(0) + + int d = m->cfg.dim, hd = m->cfg.hidden_dim, vs = m->cfg.vocab_size; + for (int l = 0; l < N_LAYERS; l++) { + ADAM_UPDATE(m->wq[l], m->grad_wq[l], d*d); + ADAM_UPDATE(m->wk[l], m->grad_wk[l], d*d); + ADAM_UPDATE(m->wv[l], m->grad_wv[l], d*d); + ADAM_UPDATE(m->wo[l], m->grad_wo[l], d*d); + ADAM_UPDATE(m->w1[l], m->grad_w1[l], hd*d); + ADAM_UPDATE(m->w2[l], m->grad_w2[l], d*hd); + ADAM_UPDATE(m->w3[l], m->grad_w3[l], hd*d); + } + ADAM_UPDATE(m->wcls, m->grad_wcls, vs*d); + ADAM_UPDATE(m->token_embedding, m->grad_emb, vs*d); + #undef ADAM_UPDATE +} diff --git a/training/forward.h b/training/forward.h new file mode 100644 index 0000000..adcf898 --- /dev/null +++ b/training/forward.h @@ -0,0 +1,179 @@ +// forward.h — Forward pass: ANE baked-weight conv for linears, CPU for element-wise +#pragma once +#include "model.h" +#include +#include + +// ANE conv eval: input [S, in_dim] row-major → transpose to [in_dim, S] channels-first +// ANE computes conv(W, x) with baked W → output [out_dim, S] +// Transpose back to [S, out_dim] row-major +static void ane_conv_eval(ANEKernel *kernel, const float *x, float *y, + int S, int in_dim, int out_dim) { + float *x_t = (float*)malloc(S * in_dim * sizeof(float)); + for (int t = 0; t < S; t++) + for (int i = 0; i < in_dim; i++) + x_t[i*S + t] = x[t*in_dim + i]; + + ane_write_input(kernel, 0, x_t, S * in_dim * sizeof(float)); + ane_eval(kernel); + + float *y_t = (float*)malloc(S * out_dim * sizeof(float)); + ane_read_output(kernel, 0, y_t, S * out_dim * sizeof(float)); + + for (int t = 0; t < S; t++) + for (int i = 0; i < out_dim; i++) + y[t*out_dim + i] = y_t[i*S + t]; + + free(x_t); free(y_t); +} + +// CPU matmul fallback: y = W @ x, W[out_dim, in_dim], x[S, in_dim] → y[S, out_dim] +static void cpu_matmul(const float *W, const float *x, float *y, int S, int in_dim, int out_dim) { + for (int t = 0; t < S; t++) + for (int i = 0; i < out_dim; i++) { + float sum = 0; + for (int j = 0; j < in_dim; j++) + sum += W[i*in_dim + j] * x[t*in_dim + j]; + y[t*out_dim + i] = sum; + } +} + +static void cpu_rmsnorm(float *out, const float *x, const float *w, int S, int D) { + for (int t = 0; t < S; t++) { + float ss = 0; + for (int i = 0; i < D; i++) ss += x[t*D+i] * x[t*D+i]; + ss = 1.0f / sqrtf(ss / D + 1e-5f); + for (int i = 0; i < D; i++) out[t*D+i] = x[t*D+i] * ss * w[i]; + } +} + +static void cpu_rope(float *q, float *k, int S, int n_heads, int head_dim) { + for (int t = 0; t < S; t++) + for (int h = 0; h < n_heads; h++) + for (int i = 0; i < head_dim; i += 2) { + float freq = 1.0f / powf(10000.0f, (float)i / head_dim); + float val = t * freq; + float cos_v = cosf(val), sin_v = sinf(val); + int off = t * n_heads * head_dim + h * head_dim + i; + float q0 = q[off], q1 = q[off+1]; + q[off] = q0 * cos_v - q1 * sin_v; + q[off+1] = q0 * sin_v + q1 * cos_v; + float k0 = k[off], k1 = k[off+1]; + k[off] = k0 * cos_v - k1 * sin_v; + k[off+1] = k0 * sin_v + k1 * cos_v; + } +} + +static void cpu_attention(float *out, const float *q, const float *k, const float *v, + int S, int n_heads, int head_dim) { + float scale = 1.0f / sqrtf((float)head_dim); + float *scores = (float*)malloc(S * S * sizeof(float)); + for (int h = 0; h < n_heads; h++) { + int D = n_heads * head_dim; + for (int t = 0; t < S; t++) { + float mx = -1e9f; + for (int s = 0; s <= t; s++) { + float dot = 0; + for (int i = 0; i < head_dim; i++) + dot += q[t*D + h*head_dim + i] * k[s*D + h*head_dim + i]; + scores[s] = dot * scale; + if (scores[s] > mx) mx = scores[s]; + } + float sm = 0; + for (int s = 0; s <= t; s++) { scores[s] = expf(scores[s] - mx); sm += scores[s]; } + for (int s = 0; s <= t; s++) scores[s] /= sm; + for (int i = 0; i < head_dim; i++) { + float val = 0; + for (int s = 0; s <= t; s++) + val += scores[s] * v[s*D + h*head_dim + i]; + out[t*D + h*head_dim + i] = val; + } + } + } + free(scores); +} + +static inline float silu_f(float x) { return x / (1.0f + expf(-x)); } + +// Forward pass — returns loss. Saves activations for backward. +static float model_forward(Model *m, const int *tokens, bool use_ane) { + int S = m->seq_len, d = m->cfg.dim, hd = m->cfg.hidden_dim; + int nh = m->cfg.n_heads, hdim = HEAD_DIM, vs = m->cfg.vocab_size; + + float *x = (float*)malloc(S * d * sizeof(float)); + for (int t = 0; t < S; t++) + memcpy(x + t*d, m->token_embedding + tokens[t]*d, d * sizeof(float)); + + for (int l = 0; l < N_LAYERS; l++) { + memcpy(m->act_x[l], x, S * d * sizeof(float)); + + cpu_rmsnorm(m->act_xnorm[l], x, m->rms_att_w[l], S, d); + + if (use_ane) { + ane_conv_eval(m->kern_q[l], m->act_xnorm[l], m->act_q[l], S, d, d); + ane_conv_eval(m->kern_k[l], m->act_xnorm[l], m->act_k[l], S, d, d); + ane_conv_eval(m->kern_v[l], m->act_xnorm[l], m->act_v[l], S, d, d); + } else { + cpu_matmul(m->wq[l], m->act_xnorm[l], m->act_q[l], S, d, d); + cpu_matmul(m->wk[l], m->act_xnorm[l], m->act_k[l], S, d, d); + cpu_matmul(m->wv[l], m->act_xnorm[l], m->act_v[l], S, d, d); + } + + cpu_rope(m->act_q[l], m->act_k[l], S, nh, hdim); + cpu_attention(m->act_attn_out[l], m->act_q[l], m->act_k[l], m->act_v[l], S, nh, hdim); + + float *o_out = (float*)malloc(S * d * sizeof(float)); + if (use_ane) { + ane_conv_eval(m->kern_o[l], m->act_attn_out[l], o_out, S, d, d); + } else { + cpu_matmul(m->wo[l], m->act_attn_out[l], o_out, S, d, d); + } + for (int i = 0; i < S * d; i++) x[i] += o_out[i]; + free(o_out); + + cpu_rmsnorm(m->act_ffn_in[l], x, m->rms_ffn_w[l], S, d); + + if (use_ane) { + ane_conv_eval(m->kern_w1[l], m->act_ffn_in[l], m->act_h1[l], S, d, hd); + ane_conv_eval(m->kern_w3[l], m->act_ffn_in[l], m->act_h3[l], S, d, hd); + } else { + cpu_matmul(m->w1[l], m->act_ffn_in[l], m->act_h1[l], S, d, hd); + cpu_matmul(m->w3[l], m->act_ffn_in[l], m->act_h3[l], S, d, hd); + } + + for (int t = 0; t < S; t++) + for (int i = 0; i < hd; i++) + m->act_silu[l][t*hd+i] = silu_f(m->act_h1[l][t*hd+i]) * m->act_h3[l][t*hd+i]; + + float *ffn_out = (float*)malloc(S * d * sizeof(float)); + if (use_ane) { + ane_conv_eval(m->kern_w2[l], m->act_silu[l], ffn_out, S, hd, d); + } else { + cpu_matmul(m->w2[l], m->act_silu[l], ffn_out, S, hd, d); + } + for (int i = 0; i < S * d; i++) x[i] += ffn_out[i]; + free(ffn_out); + } + + memcpy(m->act_pre_final, x, S * d * sizeof(float)); + cpu_rmsnorm(m->act_final, x, m->rms_final_w, S, d); + + if (use_ane && m->kern_cls) { + ane_conv_eval(m->kern_cls, m->act_final, m->logits, S, d, vs); + } else { + cpu_matmul(m->wcls, m->act_final, m->logits, S, d, vs); + } + + free(x); + + float loss = 0; + for (int t = 0; t < S - 1; t++) { + float mx = -1e9f; + for (int i = 0; i < vs; i++) if (m->logits[t*vs+i] > mx) mx = m->logits[t*vs+i]; + float sm = 0; + for (int i = 0; i < vs; i++) sm += expf(m->logits[t*vs+i] - mx); + float log_prob = m->logits[t*vs + tokens[t+1]] - mx - logf(sm); + loss -= log_prob; + } + return loss / (S - 1); +} diff --git a/training/model.h b/training/model.h new file mode 100644 index 0000000..6cee52f --- /dev/null +++ b/training/model.h @@ -0,0 +1,256 @@ +// model.h — Stories110M model struct + weight loading + ANE kernel compilation +// Training version: baked-weight conv kernels, recompile when weights update +#pragma once +#include +#include +#include +#include +#include "ane_runtime.h" +#include "ane_mil_gen.h" + +#define N_LAYERS 12 +#define DIM 768 +#define HIDDEN_DIM 2048 +#define N_HEADS 12 +#define HEAD_DIM 64 +#define VOCAB_SIZE 32000 +#define MAX_SEQ 1024 + +typedef struct { + int dim, hidden_dim, n_layers, n_heads, n_kv_heads, vocab_size, seq_len; +} Config; + +typedef struct { + Config cfg; + int seq_len; // training sequence length + + // Raw weights (f32) + float *token_embedding; // [vocab_size, dim] + float *rms_att_w[N_LAYERS]; // [dim] + float *wq[N_LAYERS]; // [dim, dim] + float *wk[N_LAYERS]; // [dim, dim] + float *wv[N_LAYERS]; // [dim, dim] + float *wo[N_LAYERS]; // [dim, dim] + float *rms_ffn_w[N_LAYERS]; // [dim] + float *w1[N_LAYERS]; // [hidden_dim, dim] + float *w2[N_LAYERS]; // [dim, hidden_dim] + float *w3[N_LAYERS]; // [hidden_dim, dim] + float *rms_final_w; // [dim] + float *wcls; // [vocab_size, dim] + + // Per-layer ANE conv kernels (baked weights, recompiled on update) + ANEKernel *kern_q[N_LAYERS]; // Q projection: dim→dim + ANEKernel *kern_k[N_LAYERS]; // K projection: dim→dim + ANEKernel *kern_v[N_LAYERS]; // V projection: dim→dim + ANEKernel *kern_o[N_LAYERS]; // O projection: dim→dim + ANEKernel *kern_w1[N_LAYERS]; // FFN w1: dim→hidden + ANEKernel *kern_w2[N_LAYERS]; // FFN w2: hidden→dim + ANEKernel *kern_w3[N_LAYERS]; // FFN w3: dim→hidden + ANEKernel *kern_cls; // Classifier: dim→vocab + + // Gradient accumulators (f32) + float *grad_wq[N_LAYERS], *grad_wk[N_LAYERS], *grad_wv[N_LAYERS], *grad_wo[N_LAYERS]; + float *grad_w1[N_LAYERS], *grad_w2[N_LAYERS], *grad_w3[N_LAYERS]; + float *grad_wcls; + float *grad_emb; + + // Adam optimizer state + float *adam_m, *adam_v; + int adam_step; + size_t total_params; + + // Activation cache for backward + float *act_x[N_LAYERS]; + float *act_xnorm[N_LAYERS]; + float *act_q[N_LAYERS]; + float *act_k[N_LAYERS]; + float *act_v[N_LAYERS]; + float *act_attn_out[N_LAYERS]; + float *act_ffn_in[N_LAYERS]; + float *act_h1[N_LAYERS]; + float *act_h3[N_LAYERS]; + float *act_silu[N_LAYERS]; + float *act_final; + float *act_pre_final; + float *logits; +} Model; + +static int model_load_weights(Model *m, const char *path) { + FILE *f = fopen(path, "rb"); + if (!f) { fprintf(stderr, "Cannot open %s\n", path); return -1; } + fread(&m->cfg, sizeof(Config), 1, f); + bool shared = m->cfg.vocab_size > 0; + if (m->cfg.vocab_size < 0) m->cfg.vocab_size = -m->cfg.vocab_size; + + printf("Model: dim=%d hidden=%d layers=%d heads=%d vocab=%d seq=%d\n", + m->cfg.dim, m->cfg.hidden_dim, m->cfg.n_layers, m->cfg.n_heads, + m->cfg.vocab_size, m->cfg.seq_len); + + int d = m->cfg.dim, hd = m->cfg.hidden_dim, nl = m->cfg.n_layers, vs = m->cfg.vocab_size; + + m->token_embedding = (float*)malloc(vs * d * sizeof(float)); + fread(m->token_embedding, sizeof(float), vs * d, f); + + float *rms_att_all = (float*)malloc(nl * d * sizeof(float)); + float *wq_all = (float*)malloc(nl * d * d * sizeof(float)); + float *wk_all = (float*)malloc(nl * d * d * sizeof(float)); + float *wv_all = (float*)malloc(nl * d * d * sizeof(float)); + float *wo_all = (float*)malloc(nl * d * d * sizeof(float)); + float *rms_ffn_all = (float*)malloc(nl * d * sizeof(float)); + float *w1_all = (float*)malloc(nl * hd * d * sizeof(float)); + float *w2_all = (float*)malloc(nl * d * hd * sizeof(float)); + float *w3_all = (float*)malloc(nl * hd * d * sizeof(float)); + + fread(rms_att_all, sizeof(float), nl * d, f); + fread(wq_all, sizeof(float), nl * d * d, f); + fread(wk_all, sizeof(float), nl * d * d, f); + fread(wv_all, sizeof(float), nl * d * d, f); + fread(wo_all, sizeof(float), nl * d * d, f); + fread(rms_ffn_all, sizeof(float), nl * d, f); + fread(w1_all, sizeof(float), nl * hd * d, f); + fread(w2_all, sizeof(float), nl * d * hd, f); + fread(w3_all, sizeof(float), nl * hd * d, f); + + for (int l = 0; l < nl; l++) { + m->rms_att_w[l] = (float*)malloc(d * sizeof(float)); + memcpy(m->rms_att_w[l], rms_att_all + l*d, d * sizeof(float)); + m->wq[l] = (float*)malloc(d*d*sizeof(float)); + memcpy(m->wq[l], wq_all + l*d*d, d*d*sizeof(float)); + m->wk[l] = (float*)malloc(d*d*sizeof(float)); + memcpy(m->wk[l], wk_all + l*d*d, d*d*sizeof(float)); + m->wv[l] = (float*)malloc(d*d*sizeof(float)); + memcpy(m->wv[l], wv_all + l*d*d, d*d*sizeof(float)); + m->wo[l] = (float*)malloc(d*d*sizeof(float)); + memcpy(m->wo[l], wo_all + l*d*d, d*d*sizeof(float)); + m->rms_ffn_w[l] = (float*)malloc(d * sizeof(float)); + memcpy(m->rms_ffn_w[l], rms_ffn_all + l*d, d * sizeof(float)); + m->w1[l] = (float*)malloc(hd*d*sizeof(float)); + memcpy(m->w1[l], w1_all + l*hd*d, hd*d*sizeof(float)); + m->w2[l] = (float*)malloc(d*hd*sizeof(float)); + memcpy(m->w2[l], w2_all + l*d*hd, d*hd*sizeof(float)); + m->w3[l] = (float*)malloc(hd*d*sizeof(float)); + memcpy(m->w3[l], w3_all + l*hd*d, hd*d*sizeof(float)); + } + free(rms_att_all); free(wq_all); free(wk_all); free(wv_all); free(wo_all); + free(rms_ffn_all); free(w1_all); free(w2_all); free(w3_all); + + m->rms_final_w = (float*)malloc(d * sizeof(float)); + fread(m->rms_final_w, sizeof(float), d, f); + + if (shared) { + m->wcls = m->token_embedding; + } else { + m->wcls = (float*)malloc(vs * d * sizeof(float)); + fread(m->wcls, sizeof(float), vs * d, f); + } + fclose(f); + return 0; +} + +// Compile a single baked-weight conv kernel +static ANEKernel *compile_conv_kernel(const float *weights, int in_ch, int out_ch, int spatial) { + NSData *wb = mil_build_weight_blob(weights, out_ch, in_ch); + NSString *mil = mil_gen_conv(in_ch, out_ch, spatial); + size_t inBytes = (size_t)in_ch * spatial * 4; + size_t outBytes = (size_t)out_ch * spatial * 4; + return ane_compile([mil dataUsingEncoding:NSUTF8StringEncoding], wb, 1, &inBytes, 1, &outBytes); +} + +// Compile all per-layer ANE kernels with current weights +static int model_compile_kernels(Model *m, int seq_len) { + m->seq_len = seq_len; + int d = m->cfg.dim, hd = m->cfg.hidden_dim, vs = m->cfg.vocab_size; + int S = seq_len; + printf("Compiling %d ANE conv kernels (S=%d)...\n", N_LAYERS * 7 + 1, S); + + for (int l = 0; l < N_LAYERS; l++) { + m->kern_q[l] = compile_conv_kernel(m->wq[l], d, d, S); + m->kern_k[l] = compile_conv_kernel(m->wk[l], d, d, S); + m->kern_v[l] = compile_conv_kernel(m->wv[l], d, d, S); + m->kern_o[l] = compile_conv_kernel(m->wo[l], d, d, S); + m->kern_w1[l] = compile_conv_kernel(m->w1[l], d, hd, S); + m->kern_w2[l] = compile_conv_kernel(m->w2[l], hd, d, S); + m->kern_w3[l] = compile_conv_kernel(m->w3[l], d, hd, S); + if (!m->kern_q[l]) { fprintf(stderr, "L%d kern_q fail\n",l); return -1; } + if (!m->kern_k[l]) { fprintf(stderr, "L%d kern_k fail\n",l); return -1; } + if (!m->kern_v[l]) { fprintf(stderr, "L%d kern_v fail\n",l); return -1; } + if (!m->kern_o[l]) { fprintf(stderr, "L%d kern_o fail\n",l); return -1; } + if (!m->kern_w1[l]) { fprintf(stderr, "L%d kern_w1 fail\n",l); return -1; } + if (!m->kern_w2[l]) { fprintf(stderr, "L%d kern_w2 fail\n",l); return -1; } + if (!m->kern_w3[l]) { fprintf(stderr, "L%d kern_w3 fail\n",l); return -1; } + printf(" Layer %d OK\n", l); + } + m->kern_cls = compile_conv_kernel(m->wcls, d, vs, S); + if (!m->kern_cls) { + fprintf(stderr, "Classifier kernel compile failed (dim=%d→vocab=%d too large?), using CPU for cls\n", d, vs); + } + printf(" All kernels compiled (%d conv + %s)\n", N_LAYERS * 7, m->kern_cls ? "cls" : "cls=CPU"); + return 0; +} + +// Recompile all kernels after weight update — unload all first to avoid ANE model limit +static int model_recompile_kernels(Model *m) { + int d = m->cfg.dim, hd = m->cfg.hidden_dim, vs = m->cfg.vocab_size; + int S = m->seq_len; + // Phase 1: unload+free all + for (int l = 0; l < N_LAYERS; l++) { + ane_free(m->kern_q[l]); ane_free(m->kern_k[l]); ane_free(m->kern_v[l]); ane_free(m->kern_o[l]); + ane_free(m->kern_w1[l]); ane_free(m->kern_w2[l]); ane_free(m->kern_w3[l]); + m->kern_q[l]=m->kern_k[l]=m->kern_v[l]=m->kern_o[l]=NULL; + m->kern_w1[l]=m->kern_w2[l]=m->kern_w3[l]=NULL; + } + if (m->kern_cls) { ane_free(m->kern_cls); m->kern_cls=NULL; } + // Phase 2: recompile all + for (int l = 0; l < N_LAYERS; l++) { + m->kern_q[l] = compile_conv_kernel(m->wq[l], d, d, S); + m->kern_k[l] = compile_conv_kernel(m->wk[l], d, d, S); + m->kern_v[l] = compile_conv_kernel(m->wv[l], d, d, S); + m->kern_o[l] = compile_conv_kernel(m->wo[l], d, d, S); + m->kern_w1[l] = compile_conv_kernel(m->w1[l], d, hd, S); + m->kern_w2[l] = compile_conv_kernel(m->w2[l], hd, d, S); + m->kern_w3[l] = compile_conv_kernel(m->w3[l], d, hd, S); + if (!m->kern_q[l] || !m->kern_k[l] || !m->kern_v[l] || !m->kern_o[l] || + !m->kern_w1[l] || !m->kern_w2[l] || !m->kern_w3[l]) return -1; + } + m->kern_cls = compile_conv_kernel(m->wcls, d, vs, S); + // cls may fail for large vocab — that's OK, forward uses CPU fallback + return 0; +} + +static void model_alloc_training(Model *m) { + int d = m->cfg.dim, hd = m->cfg.hidden_dim, vs = m->cfg.vocab_size, S = m->seq_len; + for (int l = 0; l < N_LAYERS; l++) { + m->act_x[l] = (float*)calloc(S * d, sizeof(float)); + m->act_xnorm[l] = (float*)calloc(S * d, sizeof(float)); + m->act_q[l] = (float*)calloc(S * d, sizeof(float)); + m->act_k[l] = (float*)calloc(S * d, sizeof(float)); + m->act_v[l] = (float*)calloc(S * d, sizeof(float)); + m->act_attn_out[l] = (float*)calloc(S * d, sizeof(float)); + m->act_ffn_in[l] = (float*)calloc(S * d, sizeof(float)); + m->act_h1[l] = (float*)calloc(S * hd, sizeof(float)); + m->act_h3[l] = (float*)calloc(S * hd, sizeof(float)); + m->act_silu[l] = (float*)calloc(S * hd, sizeof(float)); + + m->grad_wq[l] = (float*)calloc(d * d, sizeof(float)); + m->grad_wk[l] = (float*)calloc(d * d, sizeof(float)); + m->grad_wv[l] = (float*)calloc(d * d, sizeof(float)); + m->grad_wo[l] = (float*)calloc(d * d, sizeof(float)); + m->grad_w1[l] = (float*)calloc(hd * d, sizeof(float)); + m->grad_w2[l] = (float*)calloc(d * hd, sizeof(float)); + m->grad_w3[l] = (float*)calloc(hd * d, sizeof(float)); + } + m->act_final = (float*)calloc(S * d, sizeof(float)); + m->act_pre_final = (float*)calloc(S * d, sizeof(float)); + m->logits = (float*)calloc(S * vs, sizeof(float)); + m->grad_wcls = (float*)calloc(vs * d, sizeof(float)); + m->grad_emb = (float*)calloc(vs * d, sizeof(float)); + + m->total_params = 0; + for (int l = 0; l < N_LAYERS; l++) + m->total_params += 4*(size_t)d*d + 2*(size_t)hd*d + (size_t)d*hd; + m->total_params += (size_t)vs * d * 2; + m->adam_m = (float*)calloc(m->total_params, sizeof(float)); + m->adam_v = (float*)calloc(m->total_params, sizeof(float)); + m->adam_step = 0; + printf("Total trainable params: %zu (%.1f M)\n", m->total_params, m->total_params/1e6); +} diff --git a/training/test_ane_causal_attn.m b/training/test_ane_causal_attn.m new file mode 100644 index 0000000..cb9b761 --- /dev/null +++ b/training/test_ane_causal_attn.m @@ -0,0 +1,295 @@ +// Decomposed causal attention: Q@K^T on ANE, mask+softmax on CPU, scores@V on ANE +// This gives us causal masking with ANE acceleration for the matmuls +#import +#import +#import +#import +#import +#include + +#define HEADS 12 +#define HD 64 +#define SEQ 64 + +static Class g_D, g_I, g_AR, g_AIO; +static mach_timebase_info_data_t g_tb; +static void ane_init(void) { + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + g_I = NSClassFromString(@"_ANEInMemoryModel"); + g_AR = NSClassFromString(@"_ANERequest"); + g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); +} +static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } +static IOSurfaceRef make_surface(size_t bytes) { + return IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); +} + +typedef struct { id model; NSString *td; } Kern; + +static Kern compile_mil(NSString *mil) { + Kern k = {nil, nil}; + NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, @{}, nil); + if (!desc) { printf("desc=NULL\n"); return k; } + id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); + id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); + NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; + [[NSFileManager defaultManager] createDirectoryAtPath:td withIntermediateDirectories:YES attributes:nil error:nil]; + [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + NSError *e = nil; + if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) { + printf("compile FAIL: %s\n", e?[[e localizedDescription] UTF8String]:""); + [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; return k; + } + ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); + k.model = mdl; k.td = td; + return k; +} + +static BOOL ane_eval(Kern *k, IOSurfaceRef *ins, int nin, IOSurfaceRef out) { + NSMutableArray *inArr = [NSMutableArray array], *inIdx = [NSMutableArray array]; + for (int i = 0; i < nin; i++) { + [inArr addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ins[i])]; + [inIdx addObject:@(i)]; + } + id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), out); + id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + inArr, inIdx, @[wO], @[@0], nil, nil, @0); + NSError *e = nil; + return ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + k->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); +} + +static void cleanup_kern(Kern *k) { + if (!k->model) return; + NSError *e = nil; + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(k->model, @selector(unloadWithQoS:error:), 21, &e); + [[NSFileManager defaultManager] removeItemAtPath:k->td error:nil]; +} + +int main() { + @autoreleasepool { + setbuf(stdout, NULL); + ane_init(); + mach_timebase_info(&g_tb); + + // === Approach 1: Non-causal SDPA (baseline) === + printf("=== Non-causal SDPA (baseline) ===\n"); + NSString *sdpa_mil = [NSString stringWithFormat: + @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " + "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"9.0\"}})]\n{\n" + " func main(tensor q, " + "tensor k, tensor v) {\n" + " tensor att = scaled_dot_product_attention(" + "query = q, key = k, value = v)[name = string(\"sdpa\")];\n" + " } -> (att);\n}\n", + HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD]; + Kern kSDPA = compile_mil(sdpa_mil); + printf("SDPA compile: %s\n", kSDPA.model ? "OK" : "FAIL"); + + // === Approach 2: Decomposed causal via matmul ops === + // Step 1: Q @ K^T → scores [1, HEADS, SEQ, SEQ] + // MIL matmul: matmul(x=Q, y=K, transpose_y=true) + // Q shape: [1, HEADS, SEQ, HD], K shape: [1, HEADS, SEQ, HD] + // scores = Q @ K^T → [1, HEADS, SEQ, SEQ] + printf("\n=== Decomposed causal attention ===\n"); + NSString *qkt_mil = [NSString stringWithFormat: + @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " + "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"9.0\"}})]\n{\n" + " func main(tensor q, " + "tensor k) {\n" + " tensor scores = matmul(" + "x = q, y = k, transpose_y = true)[name = string(\"qkt\")];\n" + " } -> (scores);\n}\n", + HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, SEQ]; + Kern kQKT = compile_mil(qkt_mil); + printf("Q@K^T compile: %s\n", kQKT.model ? "OK" : "FAIL"); + + // Step 3: scores_softmax @ V → output [1, HEADS, SEQ, HD] + NSString *sv_mil = [NSString stringWithFormat: + @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " + "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"9.0\"}})]\n{\n" + " func main(tensor s, " + "tensor v) {\n" + " tensor out = matmul(" + "x = s, y = v)[name = string(\"sv\")];\n" + " } -> (out);\n}\n", + HEADS, SEQ, SEQ, HEADS, SEQ, HD, HEADS, SEQ, HD]; + Kern kSV = compile_mil(sv_mil); + printf("scores@V compile: %s\n", kSV.model ? "OK" : "FAIL"); + + if (!kSDPA.model || !kQKT.model || !kSV.model) { + printf("Some kernels failed to compile, aborting\n"); + goto done; + } + + // Generate test data + srand48(42); + int total_qkv = HEADS * SEQ * HD; + _Float16 *Q = (_Float16*)malloc(total_qkv * 2); + _Float16 *K = (_Float16*)malloc(total_qkv * 2); + _Float16 *V = (_Float16*)malloc(total_qkv * 2); + for (int i = 0; i < total_qkv; i++) { + Q[i] = (_Float16)(0.5f * (2*drand48()-1)); + K[i] = (_Float16)(0.5f * (2*drand48()-1)); + V[i] = (_Float16)(0.5f * (2*drand48()-1)); + } + + // IOSurfaces for Q, K, V + size_t qkv_bytes = total_qkv * 2; + IOSurfaceRef ioQ = make_surface(qkv_bytes), ioK = make_surface(qkv_bytes), ioV = make_surface(qkv_bytes); + IOSurfaceLock(ioQ, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioQ), Q, qkv_bytes); IOSurfaceUnlock(ioQ, 0, NULL); + IOSurfaceLock(ioK, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioK), K, qkv_bytes); IOSurfaceUnlock(ioK, 0, NULL); + IOSurfaceLock(ioV, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioV), V, qkv_bytes); IOSurfaceUnlock(ioV, 0, NULL); + + // Scores IOSurface: [1, HEADS, SEQ, SEQ] + int total_scores = HEADS * SEQ * SEQ; + size_t scores_bytes = total_scores * 2; + IOSurfaceRef ioScores = make_surface(scores_bytes); + IOSurfaceRef ioOut_sdpa = make_surface(qkv_bytes); + IOSurfaceRef ioOut_decomp = make_surface(qkv_bytes); + + // === Run non-causal SDPA === + { + IOSurfaceRef ins[] = {ioQ, ioK, ioV}; + if (!ane_eval(&kSDPA, ins, 3, ioOut_sdpa)) { printf("SDPA eval FAIL\n"); goto done; } + } + + // === Run decomposed causal === + // Step 1: Q@K^T on ANE + { + IOSurfaceRef ins[] = {ioQ, ioK}; + if (!ane_eval(&kQKT, ins, 2, ioScores)) { printf("Q@K^T eval FAIL\n"); goto done; } + } + + // Step 2: Scale + causal mask + softmax on CPU + { + IOSurfaceLock(ioScores, 0, NULL); + _Float16 *scores = (_Float16*)IOSurfaceGetBaseAddress(ioScores); + float scale = 1.0f / sqrtf((float)HD); + for (int h = 0; h < HEADS; h++) { + for (int t = 0; t < SEQ; t++) { + // Apply scale, causal mask, and softmax + float row[SEQ], maxs = -1e30f; + for (int t2 = 0; t2 < SEQ; t2++) { + float s = (float)scores[h*SEQ*SEQ + t*SEQ + t2] * scale; + if (t2 > t) s = -1e30f; // causal mask + row[t2] = s; + if (s > maxs) maxs = s; + } + float sum = 0; + for (int t2 = 0; t2 < SEQ; t2++) { row[t2] = expf(row[t2] - maxs); sum += row[t2]; } + for (int t2 = 0; t2 < SEQ; t2++) + scores[h*SEQ*SEQ + t*SEQ + t2] = (_Float16)(row[t2] / sum); + } + } + IOSurfaceUnlock(ioScores, 0, NULL); + } + + // Step 3: softmax_scores @ V on ANE + { + IOSurfaceRef ins[] = {ioScores, ioV}; + if (!ane_eval(&kSV, ins, 2, ioOut_decomp)) { printf("scores@V eval FAIL\n"); goto done; } + } + + // === Verify decomposed causal === + { + float scale = 1.0f / sqrtf((float)HD); + IOSurfaceLock(ioOut_decomp, kIOSurfaceLockReadOnly, NULL); + _Float16 *out = (_Float16*)IOSurfaceGetBaseAddress(ioOut_decomp); + float maxdiff = 0; + for (int h = 0; h < HEADS; h++) + for (int t = 0; t < SEQ; t++) { + float scores[SEQ], maxs = -1e30f; + for (int t2 = 0; t2 <= t; t2++) { + float s = 0; + for (int d = 0; d < HD; d++) s += (float)Q[h*SEQ*HD+t*HD+d]*(float)K[h*SEQ*HD+t2*HD+d]; + s *= scale; scores[t2] = s; if(s>maxs) maxs=s; + } + float sum = 0; + for (int t2 = 0; t2 <= t; t2++) { scores[t2]=expf(scores[t2]-maxs); sum+=scores[t2]; } + for (int t2 = 0; t2 <= t; t2++) scores[t2]/=sum; + for (int d = 0; d < HD; d++) { + float ref = 0; + for (int t2 = 0; t2 <= t; t2++) ref += scores[t2]*(float)V[h*SEQ*HD+t2*HD+d]; + float diff = fabsf((float)out[h*SEQ*HD+t*HD+d] - ref); + if(diff>maxdiff) maxdiff=diff; + } + } + IOSurfaceUnlock(ioOut_decomp, kIOSurfaceLockReadOnly, NULL); + printf("\nDecomposed causal max diff vs CPU ref: %.6f\n", maxdiff); + } + + // === Benchmark: SDPA vs decomposed === + printf("\n=== Benchmarks ===\n"); + int N = 500; + { + IOSurfaceRef ins[] = {ioQ, ioK, ioV}; + // Warmup + for (int i = 0; i < 10; i++) ane_eval(&kSDPA, ins, 3, ioOut_sdpa); + uint64_t t0 = mach_absolute_time(); + for (int i = 0; i < N; i++) ane_eval(&kSDPA, ins, 3, ioOut_sdpa); + double ms = tb_ms(mach_absolute_time() - t0); + double flops = 4.0 * HEADS * SEQ * SEQ * HD; + printf("SDPA (non-causal): %.3f ms/eval, %.1f GFLOPS\n", ms/N, N*flops/ms/1e6); + } + { + // Decomposed: QKT + CPU softmax + SV + // Warmup + for (int i = 0; i < 10; i++) { + IOSurfaceRef ins1[] = {ioQ, ioK}; + ane_eval(&kQKT, ins1, 2, ioScores); + // Skip CPU softmax in benchmark for ANE-only timing + IOSurfaceRef ins2[] = {ioScores, ioV}; + ane_eval(&kSV, ins2, 2, ioOut_decomp); + } + uint64_t t0 = mach_absolute_time(); + for (int i = 0; i < N; i++) { + IOSurfaceRef ins1[] = {ioQ, ioK}; + ane_eval(&kQKT, ins1, 2, ioScores); + // CPU softmax + causal mask + IOSurfaceLock(ioScores, 0, NULL); + _Float16 *sc = (_Float16*)IOSurfaceGetBaseAddress(ioScores); + float scale = 1.0f / sqrtf((float)HD); + for (int h = 0; h < HEADS; h++) + for (int t = 0; t < SEQ; t++) { + float row[SEQ], maxs = -1e30f; + for (int t2 = 0; t2 < SEQ; t2++) { + float s = (float)sc[h*SEQ*SEQ+t*SEQ+t2] * scale; + if (t2 > t) s = -1e30f; + row[t2] = s; if(s>maxs) maxs=s; + } + float sum = 0; + for (int t2 = 0; t2 < SEQ; t2++) { row[t2]=expf(row[t2]-maxs); sum+=row[t2]; } + for (int t2 = 0; t2 < SEQ; t2++) + sc[h*SEQ*SEQ+t*SEQ+t2] = (_Float16)(row[t2]/sum); + } + IOSurfaceUnlock(ioScores, 0, NULL); + IOSurfaceRef ins2[] = {ioScores, ioV}; + ane_eval(&kSV, ins2, 2, ioOut_decomp); + } + double ms = tb_ms(mach_absolute_time() - t0); + double flops = 4.0 * HEADS * SEQ * SEQ * HD; + printf("Decomposed causal: %.3f ms/eval, %.1f GFLOPS\n", ms/N, N*flops/ms/1e6); + } + + CFRelease(ioQ); CFRelease(ioK); CFRelease(ioV); + CFRelease(ioScores); CFRelease(ioOut_sdpa); CFRelease(ioOut_decomp); + free(Q); free(K); free(V); + + done: + cleanup_kern(&kSDPA); + cleanup_kern(&kQKT); + cleanup_kern(&kSV); + printf("\nDONE\n"); + } + return 0; +} diff --git a/training/test_ane_sdpa5.m b/training/test_ane_sdpa5.m new file mode 100644 index 0000000..0ddce84 --- /dev/null +++ b/training/test_ane_sdpa5.m @@ -0,0 +1,297 @@ +// Debug: why causal mask doesn't apply. Try different approaches. +#import +#import +#import +#import +#include + +#define HEADS 12 +#define HD 64 +#define SEQ 8 // small for readable output + +static Class g_D, g_I, g_AR, g_AIO; +static void ane_init(void) { + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + g_I = NSClassFromString(@"_ANEInMemoryModel"); + g_AR = NSClassFromString(@"_ANERequest"); + g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); +} +static IOSurfaceRef make_surface(size_t bytes) { + return IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); +} + +// Build inline mask string for MIL: tensor([v00, v01, ...]) +static NSString *build_inline_causal_mask(int s) { + NSMutableString *vals = [NSMutableString string]; + for (int t = 0; t < s; t++) { + for (int t2 = 0; t2 < s; t2++) { + if (t > 0 || t2 > 0) [vals appendString:@", "]; + [vals appendString:(t2 <= t) ? @"0" : @"-65504"]; // fp16 -inf + } + } + return [NSString stringWithFormat: + @"tensor([%@])", s, s, vals]; +} + +static NSData *build_mask_blob(int seq) { + int wsize = seq * seq * 2; + int total = 128 + wsize; + uint8_t *buf = (uint8_t*)calloc(total, 1); + buf[0]=1; buf[4]=2; buf[64]=0xEF; buf[65]=0xBE; buf[66]=0xAD; buf[67]=0xDE; buf[68]=1; + *(uint32_t*)(buf+72)=wsize; *(uint32_t*)(buf+80)=128; + _Float16 *fp16 = (_Float16*)(buf+128); + for (int t = 0; t < seq; t++) + for (int t2 = 0; t2 < seq; t2++) + fp16[t*seq + t2] = (t2 <= t) ? (_Float16)0.0f : (_Float16)(-65504.0f); + return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; +} + +typedef struct { id model; NSString *td; } Model; + +static Model compile_model(NSString *mil, NSDictionary *wd) { + Model m = {nil, nil}; + NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, wd ?: @{}, nil); + if (!desc) { printf(" desc=NULL\n"); return m; } + id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); + id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); + NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; + [[NSFileManager defaultManager] createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil]; + [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + for (NSString *path in wd) { + [wd[path][@"data"] writeToFile:[td stringByAppendingPathComponent:[path stringByReplacingOccurrencesOfString:@"@model_path/" withString:@""]] atomically:YES]; + } + NSError *e = nil; + if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) { + printf(" compile FAIL: %s\n", e?[[[e localizedDescription] substringToIndex:MIN(300,(int)[[e localizedDescription] length])] UTF8String]:""); + [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; return m; + } + if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) { + printf(" load FAIL\n"); [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; return m; + } + m.model = mdl; m.td = td; + return m; +} + +static void cleanup_model(Model *m) { + if (!m->model) return; + NSError *e = nil; + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(m->model, @selector(unloadWithQoS:error:), 21, &e); + [[NSFileManager defaultManager] removeItemAtPath:m->td error:nil]; +} + +int main() { + @autoreleasepool { + setbuf(stdout, NULL); + ane_init(); + + srand48(42); + int total = HEADS * SEQ * HD; + _Float16 *Q = (_Float16*)malloc(total * 2); + _Float16 *K = (_Float16*)malloc(total * 2); + _Float16 *V = (_Float16*)malloc(total * 2); + for (int i = 0; i < total; i++) { + Q[i] = (_Float16)(0.5f * (2*drand48()-1)); + K[i] = (_Float16)(0.5f * (2*drand48()-1)); + V[i] = (_Float16)(0.5f * (2*drand48()-1)); + } + + size_t bytes = total * 2; + IOSurfaceRef ioQ = make_surface(bytes), ioK = make_surface(bytes); + IOSurfaceRef ioV = make_surface(bytes); + IOSurfaceLock(ioQ, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioQ), Q, bytes); IOSurfaceUnlock(ioQ, 0, NULL); + IOSurfaceLock(ioK, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioK), K, bytes); IOSurfaceUnlock(ioK, 0, NULL); + IOSurfaceLock(ioV, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioV), V, bytes); IOSurfaceUnlock(ioV, 0, NULL); + id wQ = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioQ); + id wK = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioK); + id wV = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioV); + + // CPU references + float scale = 1.0f / sqrtf((float)HD); + float *cpu_causal = (float*)calloc(total, sizeof(float)); + float *cpu_nocausal = (float*)calloc(total, sizeof(float)); + for (int h = 0; h < HEADS; h++) + for (int t = 0; t < SEQ; t++) { + // Causal + float scores[SEQ], maxs = -1e30f; + for (int t2 = 0; t2 <= t; t2++) { + float s = 0; + for (int d = 0; d < HD; d++) s += (float)Q[h*SEQ*HD+t*HD+d]*(float)K[h*SEQ*HD+t2*HD+d]; + s *= scale; scores[t2] = s; if(s>maxs) maxs=s; + } + float sum = 0; + for (int t2 = 0; t2 <= t; t2++) { scores[t2]=expf(scores[t2]-maxs); sum+=scores[t2]; } + for (int t2 = 0; t2 <= t; t2++) scores[t2]/=sum; + for (int d = 0; d < HD; d++) { + float r = 0; + for (int t2 = 0; t2 <= t; t2++) r += scores[t2]*(float)V[h*SEQ*HD+t2*HD+d]; + cpu_causal[h*SEQ*HD+t*HD+d] = r; + } + // Non-causal + maxs = -1e30f; + for (int t2 = 0; t2 < SEQ; t2++) { + float s = 0; + for (int d = 0; d < HD; d++) s += (float)Q[h*SEQ*HD+t*HD+d]*(float)K[h*SEQ*HD+t2*HD+d]; + s *= scale; scores[t2] = s; if(s>maxs) maxs=s; + } + sum = 0; + for (int t2 = 0; t2 < SEQ; t2++) { scores[t2]=expf(scores[t2]-maxs); sum+=scores[t2]; } + for (int t2 = 0; t2 < SEQ; t2++) scores[t2]/=sum; + for (int d = 0; d < HD; d++) { + float r = 0; + for (int t2 = 0; t2 < SEQ; t2++) r += scores[t2]*(float)V[h*SEQ*HD+t2*HD+d]; + cpu_nocausal[h*SEQ*HD+t*HD+d] = r; + } + } + + // Helper: eval and compare + void (^eval_and_compare)(const char*, Model*, int nInputs, IOSurfaceRef*) = + ^(const char *label, Model *m, int nInputs, IOSurfaceRef *inputs) { + IOSurfaceRef ioO = make_surface(bytes); + id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO); + NSMutableArray *inArr = [NSMutableArray array]; + NSMutableArray *inIdx = [NSMutableArray array]; + for (int i = 0; i < nInputs; i++) { + [inArr addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), inputs[i])]; + [inIdx addObject:@(i)]; + } + id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + inArr, inIdx, @[wO], @[@0], nil, nil, @0); + NSError *e = nil; + BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + m->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); + if (!ok) { + printf(" %s: eval FAIL: %s\n", label, e?[[[e localizedDescription] substringToIndex:MIN(200,(int)[[e localizedDescription] length])] UTF8String]:""); + CFRelease(ioO); return; + } + IOSurfaceLock(ioO, kIOSurfaceLockReadOnly, NULL); + _Float16 *out = (_Float16*)IOSurfaceGetBaseAddress(ioO); + float dc=0, dnc=0; + for (int i = 0; i < total; i++) { + float v = (float)out[i]; + float d1 = fabsf(v - cpu_causal[i]); if(d1>dc) dc=d1; + float d2 = fabsf(v - cpu_nocausal[i]); if(d2>dnc) dnc=d2; + } + IOSurfaceUnlock(ioO, kIOSurfaceLockReadOnly, NULL); + printf(" %s: diff_causal=%.6f diff_nocausal=%.6f → %s\n", label, dc, dnc, + dc < dnc ? "CAUSAL" : (dc > dnc ? "NON-CAUSAL" : "SAME")); + CFRelease(ioO); + }; + + // === Test 1: No mask (should be non-causal) === + printf("Test 1: no mask\n"); + { + NSString *mil = [NSString stringWithFormat: + @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " + "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"9.0\"}})]\n{\n" + " func main(tensor q, " + "tensor k, tensor v) {\n" + " tensor att = scaled_dot_product_attention(" + "query = q, key = k, value = v)[name = string(\"sdpa\")];\n" + " } -> (att);\n}\n", + HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD]; + Model m = compile_model(mil, nil); + if (m.model) { + IOSurfaceRef ins[] = {ioQ, ioK, ioV}; + eval_and_compare("no-mask", &m, 3, ins); + cleanup_model(&m); + } + } + + // === Test 2: Inline causal mask === + printf("\nTest 2: inline causal mask\n"); + { + NSString *maskStr = build_inline_causal_mask(SEQ); + NSString *mil = [NSString stringWithFormat: + @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " + "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"9.0\"}})]\n{\n" + " func main(tensor q, " + "tensor k, tensor v) {\n" + " %@ mask = const()[name = string(\"mask\"), val = %@];\n" + " tensor att = scaled_dot_product_attention(" + "query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n" + " } -> (att);\n}\n", + HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, + [NSString stringWithFormat:@"tensor", SEQ, SEQ], maskStr, + HEADS, SEQ, HD]; + Model m = compile_model(mil, nil); + if (m.model) { + IOSurfaceRef ins[] = {ioQ, ioK, ioV}; + eval_and_compare("inline-mask", &m, 3, ins); + cleanup_model(&m); + } + } + + // === Test 3: BLOBFILE mask === + printf("\nTest 3: BLOBFILE causal mask\n"); + { + NSString *mil = [NSString stringWithFormat: + @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " + "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"9.0\"}})]\n{\n" + " func main(tensor q, " + "tensor k, tensor v) {\n" + " tensor mask = const()[name = string(\"mask\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/mask.bin\"), offset = uint64(64)))];\n" + " tensor att = scaled_dot_product_attention(" + "query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n" + " } -> (att);\n}\n", + HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, + SEQ, SEQ, SEQ, SEQ, HEADS, SEQ, HD]; + NSDictionary *wd = @{@"@model_path/weights/mask.bin": @{@"offset":@0, @"data":build_mask_blob(SEQ)}}; + Model m = compile_model(mil, wd); + if (m.model) { + IOSurfaceRef ins[] = {ioQ, ioK, ioV}; + eval_and_compare("blob-mask", &m, 3, ins); + cleanup_model(&m); + } + } + + // === Test 4: mask as runtime input === + printf("\nTest 4: mask as runtime input\n"); + { + NSString *mil = [NSString stringWithFormat: + @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " + "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"9.0\"}})]\n{\n" + " func main(tensor q, " + "tensor k, tensor v, " + "tensor mask) {\n" + " tensor att = scaled_dot_product_attention(" + "query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n" + " } -> (att);\n}\n", + HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, + SEQ, SEQ, HEADS, SEQ, HD]; + Model m = compile_model(mil, nil); + if (m.model) { + // Create mask IOSurface + size_t mbytes = SEQ * SEQ * 2; + IOSurfaceRef ioM = make_surface(mbytes); + IOSurfaceLock(ioM, 0, NULL); + _Float16 *mp = (_Float16*)IOSurfaceGetBaseAddress(ioM); + for (int t = 0; t < SEQ; t++) + for (int t2 = 0; t2 < SEQ; t2++) + mp[t*SEQ+t2] = (t2 <= t) ? (_Float16)0.0f : (_Float16)(-65504.0f); + IOSurfaceUnlock(ioM, 0, NULL); + + IOSurfaceRef ins[] = {ioQ, ioK, ioV, ioM}; + eval_and_compare("runtime-mask", &m, 4, ins); + CFRelease(ioM); + cleanup_model(&m); + } + } + + CFRelease(ioQ); CFRelease(ioK); CFRelease(ioV); + free(Q); free(K); free(V); + free(cpu_causal); free(cpu_nocausal); + printf("\nDONE\n"); + } + return 0; +} diff --git a/training/test_conv_attn3.m b/training/test_conv_attn3.m new file mode 100644 index 0000000..a396b4d --- /dev/null +++ b/training/test_conv_attn3.m @@ -0,0 +1,276 @@ +// Grouped conv causal attention with CORRECT layout A: blob[oc*ICg + ic] +#import +#import +#import +#import +#import +#include + +#define HEADS 12 +#define HD 64 +#define DIM (HEADS*HD) +#define SEQ 64 + +static Class g_D, g_I, g_AR, g_AIO; +static mach_timebase_info_data_t g_tb; +static void ane_init(void) { + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + g_I = NSClassFromString(@"_ANEInMemoryModel"); + g_AR = NSClassFromString(@"_ANERequest"); + g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); +} +static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } +static IOSurfaceRef make_surface(size_t bytes) { + return IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); +} +static NSData *build_blob_raw(_Float16 *data, int count) { + int wsize = count * 2, total = 128 + wsize; + uint8_t *buf = (uint8_t*)calloc(total, 1); + buf[0]=1; buf[4]=2; buf[64]=0xEF; buf[65]=0xBE; buf[66]=0xAD; buf[67]=0xDE; buf[68]=1; + *(uint32_t*)(buf+72)=wsize; *(uint32_t*)(buf+80)=128; + memcpy(buf+128, data, wsize); + return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; +} +typedef struct { id model; NSString *td; } Kern; +static Kern compile_mil(NSString *mil, NSDictionary *wd) { + Kern k = {nil, nil}; + NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, wd ?: @{}, nil); + if (!desc) { printf("desc=NULL\n"); return k; } + id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); + id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); + NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; + [[NSFileManager defaultManager] createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] + withIntermediateDirectories:YES attributes:nil error:nil]; + [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + for (NSString *path in wd) { + [wd[path][@"data"] writeToFile:[td stringByAppendingPathComponent: + [path stringByReplacingOccurrencesOfString:@"@model_path/" withString:@""]] atomically:YES]; + } + NSError *e = nil; + if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) { + printf("compile FAIL: %s\n", e?[[e localizedDescription] UTF8String]:""); return k; + } + ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); + k.model = mdl; k.td = td; + return k; +} +static BOOL ane_eval(Kern *k, IOSurfaceRef *ins, int nin, IOSurfaceRef out) { + NSMutableArray *inArr = [NSMutableArray array], *inIdx = [NSMutableArray array]; + for (int i = 0; i < nin; i++) { + [inArr addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ins[i])]; + [inIdx addObject:@(i)]; + } + id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), out); + id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + inArr, inIdx, @[wO], @[@0], nil, nil, @0); + NSError *e = nil; + return ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + k->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); +} +static void cleanup_kern(Kern *k) { + if (!k->model) return; + NSError *e = nil; + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(k->model, @selector(unloadWithQoS:error:), 21, &e); + [[NSFileManager defaultManager] removeItemAtPath:k->td error:nil]; +} + +static NSString *gen_conv_mil(int ic, int oc, int icg, int groups, int sp) { + return [NSString stringWithFormat: + @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " + "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"9.0\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor W = const()[name = string(\"W\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w.bin\"), offset = uint64(64)))];\n" + " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" + " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" + " int32 gr = const()[name = string(\"gr\"), val = int32(%d)];\n" + " tensor y = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = W, x = x)[name = string(\"cv\")];\n" + " } -> (y);\n}\n", ic, sp, oc, icg, oc, icg, groups, oc, sp]; +} + +int main() { + @autoreleasepool { + setbuf(stdout, NULL); + ane_init(); + mach_timebase_info(&g_tb); + + printf("=== Grouped Conv Causal Attention (layout A) ===\n"); + printf("HEADS=%d HD=%d SEQ=%d\n\n", HEADS, HD, SEQ); + + srand48(42); + float *Q = (float*)malloc(SEQ*DIM*sizeof(float)); + float *K = (float*)malloc(SEQ*DIM*sizeof(float)); + float *V = (float*)malloc(SEQ*DIM*sizeof(float)); + for (int i = 0; i < SEQ*DIM; i++) { + Q[i] = 0.5f*(2*drand48()-1); + K[i] = 0.5f*(2*drand48()-1); + V[i] = 0.5f*(2*drand48()-1); + } + + // Q@K^T grouped conv weight: [HEADS*SEQ, HD, 1, 1] with groups=HEADS + // Layout A: blob[oc * ICg + ic] where ICg = HD + // For head h: oc = h*SEQ+t2, ic = d (within group) + // We want: output[h*SEQ+t2, t] = sum_d Q[h*HD+d, t] * K_weight[h*SEQ+t2, d] + // So K_weight[oc, ic] = K[t2, h*HD+d] where oc=h*SEQ+t2, ic=d + int kw_count = HEADS * SEQ * HD; + _Float16 *kw = (_Float16*)malloc(kw_count * sizeof(_Float16)); + for (int h = 0; h < HEADS; h++) + for (int t2 = 0; t2 < SEQ; t2++) + for (int d = 0; d < HD; d++) { + int oc = h*SEQ + t2; + kw[oc*HD + d] = (_Float16)K[t2*DIM + h*HD + d]; + } + NSDictionary *qkt_wd = @{@"@model_path/weights/w.bin": @{@"offset":@0, @"data":build_blob_raw(kw, kw_count)}}; + free(kw); + + // scores@V grouped conv weight: [HEADS*HD, SEQ, 1, 1] with groups=HEADS + // oc = h*HD+d, ic = t2 (within group, ICg=SEQ) + // V_weight[oc, ic] = V[t2, h*HD+d] + int vw_count = HEADS * HD * SEQ; + _Float16 *vw = (_Float16*)malloc(vw_count * sizeof(_Float16)); + for (int h = 0; h < HEADS; h++) + for (int d = 0; d < HD; d++) + for (int t2 = 0; t2 < SEQ; t2++) { + int oc = h*HD + d; + vw[oc*SEQ + t2] = (_Float16)V[t2*DIM + h*HD + d]; + } + NSDictionary *sv_wd = @{@"@model_path/weights/w.bin": @{@"offset":@0, @"data":build_blob_raw(vw, vw_count)}}; + free(vw); + + // Compile + printf("Compiling Q@K^T (grouped conv, groups=%d)...\n", HEADS); + NSString *qkt_mil = gen_conv_mil(HEADS*HD, HEADS*SEQ, HD, HEADS, SEQ); + Kern kQKT = compile_mil(qkt_mil, qkt_wd); + printf(" %s\n", kQKT.model ? "OK" : "FAIL"); + + printf("Compiling scores@V (grouped conv, groups=%d)...\n", HEADS); + NSString *sv_mil = gen_conv_mil(HEADS*SEQ, HEADS*HD, SEQ, HEADS, SEQ); + Kern kSV = compile_mil(sv_mil, sv_wd); + printf(" %s\n", kSV.model ? "OK" : "FAIL"); + + if (!kQKT.model || !kSV.model) { printf("FAIL\n"); goto done; } + + // Prepare Q IOSurface [1, DIM, 1, SEQ] fp16 + size_t q_bytes = DIM * SEQ * 2; + IOSurfaceRef ioQ = make_surface(q_bytes); + IOSurfaceLock(ioQ, 0, NULL); + _Float16 *qp = (_Float16*)IOSurfaceGetBaseAddress(ioQ); + for (int t = 0; t < SEQ; t++) + for (int c = 0; c < DIM; c++) + qp[c*SEQ + t] = (_Float16)Q[t*DIM + c]; + IOSurfaceUnlock(ioQ, 0, NULL); + + size_t sc_bytes = HEADS * SEQ * SEQ * 2; + IOSurfaceRef ioScores = make_surface(sc_bytes); + IOSurfaceRef ioOut = make_surface(q_bytes); + + // Step 1: Q@K^T + IOSurfaceRef ins1[] = {ioQ}; + if (!ane_eval(&kQKT, ins1, 1, ioScores)) { printf("Q@K^T eval FAIL\n"); goto done; } + + // Step 2: Scale + causal mask + softmax (CPU) + float scale = 1.0f / sqrtf((float)HD); + IOSurfaceLock(ioScores, 0, NULL); + _Float16 *sc = (_Float16*)IOSurfaceGetBaseAddress(ioScores); + for (int h = 0; h < HEADS; h++) + for (int t = 0; t < SEQ; t++) { + float row[SEQ], maxs = -1e30f; + for (int t2 = 0; t2 < SEQ; t2++) { + // scores channel = h*SEQ+t2, spatial = t + float s = (float)sc[(h*SEQ+t2)*SEQ + t] * scale; + if (t2 > t) s = -1e30f; + row[t2] = s; + if (s > maxs) maxs = s; + } + float sum = 0; + for (int t2 = 0; t2 < SEQ; t2++) { row[t2] = expf(row[t2]-maxs); sum += row[t2]; } + for (int t2 = 0; t2 < SEQ; t2++) + sc[(h*SEQ+t2)*SEQ + t] = (_Float16)(row[t2] / sum); + } + IOSurfaceUnlock(ioScores, 0, NULL); + + // Step 3: scores@V + IOSurfaceRef ins2[] = {ioScores}; + if (!ane_eval(&kSV, ins2, 1, ioOut)) { printf("scores@V eval FAIL\n"); goto done; } + + // Verify + IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL); + _Float16 *out = (_Float16*)IOSurfaceGetBaseAddress(ioOut); + float maxdiff = 0; + for (int h = 0; h < HEADS; h++) + for (int t = 0; t < SEQ; t++) { + float sc2[SEQ], maxs = -1e30f; + for (int t2 = 0; t2 <= t; t2++) { + float s = 0; + for (int d = 0; d < HD; d++) s += Q[t*DIM+h*HD+d]*K[t2*DIM+h*HD+d]; + s *= scale; sc2[t2] = s; if(s>maxs) maxs=s; + } + float sum = 0; + for (int t2 = 0; t2 <= t; t2++) { sc2[t2]=expf(sc2[t2]-maxs); sum+=sc2[t2]; } + for (int t2 = 0; t2 <= t; t2++) sc2[t2]/=sum; + for (int d = 0; d < HD; d++) { + float ref = 0; + for (int t2 = 0; t2 <= t; t2++) ref += sc2[t2]*V[t2*DIM+h*HD+d]; + float diff = fabsf((float)out[(h*HD+d)*SEQ+t] - ref); + if (diff > maxdiff) maxdiff = diff; + } + } + IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL); + printf("\nMax diff vs CPU causal ref: %.6f → %s\n", maxdiff, maxdiff < 0.05f ? "PASS" : "FAIL"); + + // Benchmark + printf("\n=== Benchmark ===\n"); + int N = 500; + for (int i = 0; i < 20; i++) { ane_eval(&kQKT, ins1, 1, ioScores); ane_eval(&kSV, ins2, 1, ioOut); } + + uint64_t t0 = mach_absolute_time(); + for (int i = 0; i < N; i++) { + ane_eval(&kQKT, ins1, 1, ioScores); + ane_eval(&kSV, ins2, 1, ioOut); + } + double ms_ane = tb_ms(mach_absolute_time() - t0); + + t0 = mach_absolute_time(); + for (int i = 0; i < N; i++) { + ane_eval(&kQKT, ins1, 1, ioScores); + IOSurfaceLock(ioScores, 0, NULL); + _Float16 *s = (_Float16*)IOSurfaceGetBaseAddress(ioScores); + for (int h = 0; h < HEADS; h++) + for (int t = 0; t < SEQ; t++) { + float row[SEQ], maxs = -1e30f; + for (int t2 = 0; t2 < SEQ; t2++) { + float v = (float)s[(h*SEQ+t2)*SEQ+t]*scale; + if(t2>t) v=-1e30f; row[t2]=v; if(v>maxs) maxs=v; + } + float sum=0; + for (int t2=0;t2 +#import +#import +#import +#import +#include + +#define DIM 768 +#define HEADS 12 +#define HD (DIM/HEADS) +#define HIDDEN 2048 +#define SEQ 64 + +static Class g_D, g_I, g_AR, g_AIO; +static mach_timebase_info_data_t g_tb; +static void ane_init(void) { + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + g_I = NSClassFromString(@"_ANEInMemoryModel"); + g_AR = NSClassFromString(@"_ANERequest"); + g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); +} +static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } +static IOSurfaceRef make_surface(size_t bytes) { + return IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); +} +static NSData *build_blob(const float *w, int oc, int ic) { + int wsize = oc*ic*2, total = 128+wsize; + uint8_t *buf = (uint8_t*)calloc(total,1); + buf[0]=1; buf[4]=2; buf[64]=0xEF; buf[65]=0xBE; buf[66]=0xAD; buf[67]=0xDE; buf[68]=1; + *(uint32_t*)(buf+72)=wsize; *(uint32_t*)(buf+80)=128; + _Float16 *fp16 = (_Float16*)(buf+128); + for (int i = 0; i < oc*ic; i++) fp16[i] = (_Float16)w[i]; + return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; +} +static NSData *build_blob_fp16(_Float16 *data, int count) { + int wsize = count*2, total = 128+wsize; + uint8_t *buf = (uint8_t*)calloc(total,1); + buf[0]=1; buf[4]=2; buf[64]=0xEF; buf[65]=0xBE; buf[66]=0xAD; buf[67]=0xDE; buf[68]=1; + *(uint32_t*)(buf+72)=wsize; *(uint32_t*)(buf+80)=128; + memcpy(buf+128, data, wsize); + return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; +} + +typedef struct { id model; NSString *td; } Kern; +static Kern compile_mil(NSString *mil, NSDictionary *wd) { + Kern k = {nil, nil}; + NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, wd ?: @{}, nil); + if (!desc) { printf(" desc=NULL\n"); return k; } + id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); + id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); + NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; + [[NSFileManager defaultManager] createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] + withIntermediateDirectories:YES attributes:nil error:nil]; + [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + for (NSString *path in wd) { + [wd[path][@"data"] writeToFile:[td stringByAppendingPathComponent: + [path stringByReplacingOccurrencesOfString:@"@model_path/" withString:@""]] atomically:YES]; + } + NSError *e = nil; + if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) { + printf(" compile FAIL: %s\n", e?[[[e localizedDescription] substringToIndex:MIN(300,(int)[[e localizedDescription] length])] UTF8String]:""); + [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; return k; + } + if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) { + printf(" load FAIL\n"); [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; return k; + } + k.model = mdl; k.td = td; + return k; +} +static BOOL ane_eval_io(Kern *k, IOSurfaceRef *ins, int nin, IOSurfaceRef *outs, int nout) { + NSMutableArray *inArr = [NSMutableArray array], *inIdx = [NSMutableArray array]; + NSMutableArray *outArr = [NSMutableArray array], *outIdx = [NSMutableArray array]; + for (int i = 0; i < nin; i++) { + [inArr addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ins[i])]; + [inIdx addObject:@(i)]; + } + for (int i = 0; i < nout; i++) { + [outArr addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), outs[i])]; + [outIdx addObject:@(i)]; + } + id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + inArr, inIdx, outArr, outIdx, nil, nil, @0); + NSError *e = nil; + return ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + k->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); +} +static void cleanup_kern(Kern *k) { + if (!k->model) return; + NSError *e = nil; + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(k->model, @selector(unloadWithQoS:error:), 21, &e); + [[NSFileManager defaultManager] removeItemAtPath:k->td error:nil]; +} + +int main() { + @autoreleasepool { + setbuf(stdout, NULL); + ane_init(); + mach_timebase_info(&g_tb); + + srand48(42); + float sc_d = 1.0f/sqrtf(DIM), sc_h = 1.0f/sqrtf(HIDDEN); + float *Wq = (float*)malloc(DIM*DIM*4); for(int i=0;i({{\"coremlc-component-MIL\", \"3510.2.1\"}, " + "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"9.0\"}})]\n{\n" + " func main(tensor x) {\n" + // Conv boilerplate + " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" + " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" + " int32 gr1 = const()[name = string(\"g1\"), val = int32(1)];\n" + // QKV weights + " tensor Wq = const()[name = string(\"Wq\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/wq.bin\"), offset = uint64(64)))];\n" + " tensor Wk = const()[name = string(\"Wk\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/wk.bin\"), offset = uint64(64)))];\n" + " tensor Wv = const()[name = string(\"Wv\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/wv.bin\"), offset = uint64(64)))];\n" + " tensor Wout = const()[name = string(\"Wo\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/wo.bin\"), offset = uint64(64)))];\n" + // QKV projections + " tensor q_flat = conv(dilations = dl, groups = gr1, pad = pd, " + "pad_type = pt, strides = st, weight = Wq, x = x)[name = string(\"cq\")];\n" + " tensor k_flat = conv(dilations = dl, groups = gr1, pad = pd, " + "pad_type = pt, strides = st, weight = Wk, x = x)[name = string(\"ck\")];\n" + " tensor v_flat = conv(dilations = dl, groups = gr1, pad = pd, " + "pad_type = pt, strides = st, weight = Wv, x = x)[name = string(\"cv\")];\n" + // Reshape: [1, DIM, 1, SEQ] → [1, HEADS, HD, SEQ] → transpose → [1, HEADS, SEQ, HD] + " tensor qsh = const()[name = string(\"qsh\"), val = tensor([1, %d, %d, %d])];\n" + " tensor q_4d = reshape(shape = qsh, x = q_flat)[name = string(\"rq\")];\n" + " tensor perm = const()[name = string(\"pm\"), val = tensor([0, 1, 3, 2])];\n" + " tensor q = transpose(perm = perm, x = q_4d)[name = string(\"tq\")];\n" + " tensor k_4d = reshape(shape = qsh, x = k_flat)[name = string(\"rk\")];\n" + " tensor k = transpose(perm = perm, x = k_4d)[name = string(\"tk\")];\n" + " tensor v_4d = reshape(shape = qsh, x = v_flat)[name = string(\"rv\")];\n" + " tensor v = transpose(perm = perm, x = v_4d)[name = string(\"tv\")];\n" + // Q @ K^T + " bool ty = const()[name = string(\"ty\"), val = bool(true)];\n" + " bool tx = const()[name = string(\"tx\"), val = bool(false)];\n" + " tensor scores = matmul(transpose_x = tx, transpose_y = ty, x = q, y = k)[name = string(\"mm1\")];\n" + // Scale + " fp16 sc = const()[name = string(\"sc\"), val = fp16(%f)];\n" + " tensor scaled = mul(x = scores, y = sc)[name = string(\"scl\")];\n" + // Causal mask + " tensor cmask = const()[name = string(\"cm\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/mask.bin\"), offset = uint64(64)))];\n" + " tensor masked = add(x = scaled, y = cmask)[name = string(\"msk\")];\n" + // Softmax + " int32 sax = const()[name = string(\"sax\"), val = int32(-1)];\n" + " tensor attn_w = softmax(axis = sax, x = masked)[name = string(\"sm\")];\n" + // scores @ V + " tensor attn_4d = matmul(transpose_x = tx, transpose_y = tx, x = attn_w, y = v)[name = string(\"mm2\")];\n" + // Reshape back: [1, HEADS, SEQ, HD] → transpose → [1, HEADS, HD, SEQ] → reshape → [1, DIM, 1, SEQ] + " tensor attn_t = transpose(perm = perm, x = attn_4d)[name = string(\"ta\")];\n" + " tensor osh = const()[name = string(\"osh\"), val = tensor([1, %d, 1, %d])];\n" + " tensor attn_flat = reshape(shape = osh, x = attn_t)[name = string(\"ra\")];\n" + // Wo projection + " tensor out = conv(dilations = dl, groups = gr1, pad = pd, " + "pad_type = pt, strides = st, weight = Wout, x = attn_flat)[name = string(\"co\")];\n" + " } -> (out);\n}\n", + DIM, SEQ, // input + DIM,DIM,DIM,DIM, DIM,DIM,DIM,DIM, // Wq, Wk + DIM,DIM,DIM,DIM, DIM,DIM,DIM,DIM, // Wv, Wo + DIM, SEQ, DIM, SEQ, DIM, SEQ, // q_flat, k_flat, v_flat + HEADS, HD, SEQ, // reshape shape + HEADS, HD, SEQ, // q_4d + HEADS, SEQ, HD, // q (after transpose) + HEADS, HD, SEQ, // k_4d + HEADS, SEQ, HD, // k + HEADS, HD, SEQ, // v_4d + HEADS, SEQ, HD, // v + HEADS, SEQ, SEQ, // scores + scale_val, + HEADS, SEQ, SEQ, // scaled + SEQ, SEQ, SEQ, SEQ, // mask + HEADS, SEQ, SEQ, // masked + HEADS, SEQ, SEQ, // attn_w (softmax) + HEADS, SEQ, HD, // attn_4d + HEADS, HD, SEQ, // attn_t + DIM, SEQ, // reshape back + DIM, SEQ, // attn_flat + DIM, SEQ]; // out + + NSDictionary *wd = @{ + @"@model_path/weights/wq.bin": @{@"offset":@0, @"data":build_blob(Wq,DIM,DIM)}, + @"@model_path/weights/wk.bin": @{@"offset":@0, @"data":build_blob(Wk,DIM,DIM)}, + @"@model_path/weights/wv.bin": @{@"offset":@0, @"data":build_blob(Wv,DIM,DIM)}, + @"@model_path/weights/wo.bin": @{@"offset":@0, @"data":build_blob(Wo,DIM,DIM)}, + @"@model_path/weights/mask.bin": @{@"offset":@0, @"data":build_blob_fp16(mask,SEQ*SEQ)}, + }; + free(mask); + Kern k = compile_mil(mil, wd); + if (k.model) { + printf(" COMPILED! Full fused attention works on ANE!\n"); + + // Verify vs CPU + float *x = (float*)malloc(SEQ*DIM*4); + for (int i = 0; i < SEQ*DIM; i++) x[i] = 0.1f*(2*drand48()-1); + + IOSurfaceRef ioIn = make_surface(DIM*SEQ*2); + IOSurfaceRef ioOut = make_surface(DIM*SEQ*2); + IOSurfaceLock(ioIn, 0, NULL); + _Float16 *p = (_Float16*)IOSurfaceGetBaseAddress(ioIn); + for (int t = 0; t < SEQ; t++) + for (int c = 0; c < DIM; c++) + p[c*SEQ+t] = (_Float16)x[t*DIM+c]; + IOSurfaceUnlock(ioIn, 0, NULL); + + IOSurfaceRef ins[] = {ioIn}, outs[] = {ioOut}; + BOOL ok = ane_eval_io(&k, ins, 1, outs, 1); + printf(" Eval: %s\n", ok?"OK":"FAIL"); + + if (ok) { + // CPU reference + float *q_cpu = (float*)calloc(SEQ*DIM, 4); + float *k_cpu = (float*)calloc(SEQ*DIM, 4); + float *v_cpu = (float*)calloc(SEQ*DIM, 4); + for (int t=0;tmaxs) maxs=s; + } + float sum=0; + for (int t2=0;t2<=t;t2++){sc2[t2]=expf(sc2[t2]-maxs);sum+=sc2[t2];} + for (int t2=0;t2<=t;t2++) sc2[t2]/=sum; + for (int d=0;dmaxdiff) maxdiff=diff; + } + IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL); + printf(" Max diff vs CPU: %.6f → %s\n", maxdiff, maxdiff<0.1f?"PASS":"FAIL"); + + // Benchmark + for (int i=0;i<20;i++) ane_eval_io(&k, ins, 1, outs, 1); + int N=500; + uint64_t t0 = mach_absolute_time(); + for (int i=0;i({{\"coremlc-component-MIL\", \"3510.2.1\"}, " + "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"9.0\"}})]\n{\n" + " func main(tensor x) {\n" + " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" + " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" + " int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n" + " tensor W1 = const()[name = string(\"W1\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w1.bin\"), offset = uint64(64)))];\n" + " tensor W3 = const()[name = string(\"W3\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w3.bin\"), offset = uint64(64)))];\n" + " tensor W2 = const()[name = string(\"W2\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w2.bin\"), offset = uint64(64)))];\n" + " tensor h1 = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = W1, x = x)[name = string(\"c1\")];\n" + " tensor h3 = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = W3, x = x)[name = string(\"c3\")];\n" + " tensor sig = sigmoid(x = h1)[name = string(\"sg\")];\n" + " tensor silu = mul(x = h1, y = sig)[name = string(\"si\")];\n" + " tensor gate = mul(x = silu, y = h3)[name = string(\"gt\")];\n" + " tensor out = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = W2, x = gate)[name = string(\"c2\")];\n" + " } -> (out);\n}\n", + DIM, SEQ, + HIDDEN,DIM,HIDDEN,DIM, HIDDEN,DIM,HIDDEN,DIM, DIM,HIDDEN,DIM,HIDDEN, + HIDDEN,SEQ, HIDDEN,SEQ, HIDDEN,SEQ, HIDDEN,SEQ, HIDDEN,SEQ, DIM,SEQ]; + + NSDictionary *wd = @{ + @"@model_path/weights/w1.bin": @{@"offset":@0, @"data":build_blob(W1,HIDDEN,DIM)}, + @"@model_path/weights/w3.bin": @{@"offset":@0, @"data":build_blob(W3,HIDDEN,DIM)}, + @"@model_path/weights/w2.bin": @{@"offset":@0, @"data":build_blob(W2,DIM,HIDDEN)}, + }; + Kern k = compile_mil(mil, wd); + printf(" FFN: %s\n", k.model?"OK":"FAIL"); + if (k.model) { + IOSurfaceRef ioIn = make_surface(DIM*SEQ*2), ioOut = make_surface(DIM*SEQ*2); + IOSurfaceRef ins[]={ioIn}, outs[]={ioOut}; + for (int i=0;i<20;i++) ane_eval_io(&k,ins,1,outs,1); + int N=500; + uint64_t t0 = mach_absolute_time(); + for (int i=0;i +#import +#import +#import +#include + +#define DIM 768 +#define HIDDEN 2048 +#define SEQ 64 + +static Class g_D, g_I, g_AR, g_AIO; +static void ane_init(void) { + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + g_I = NSClassFromString(@"_ANEInMemoryModel"); + g_AR = NSClassFromString(@"_ANERequest"); + g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); +} +static IOSurfaceRef make_surface(size_t bytes) { + return IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); +} +static NSData *build_blob_t(const float *w, int rows, int cols) { + int wsize = cols * rows * 2, total = 128 + wsize; + uint8_t *buf = (uint8_t*)calloc(total, 1); + buf[0]=1; buf[4]=2; buf[64]=0xEF; buf[65]=0xBE; buf[66]=0xAD; buf[67]=0xDE; buf[68]=1; + *(uint32_t*)(buf+72)=wsize; *(uint32_t*)(buf+80)=128; + _Float16 *fp16 = (_Float16*)(buf+128); + for (int i = 0; i < rows; i++) + for (int j = 0; j < cols; j++) + fp16[j*rows+i] = (_Float16)w[i*cols+j]; + return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; +} + +int main() { + @autoreleasepool { + setbuf(stdout, NULL); + ane_init(); + + srand48(42); + float *W1 = (float*)malloc(HIDDEN*DIM*sizeof(float)); + float *W3 = (float*)malloc(HIDDEN*DIM*sizeof(float)); + float sc = 1.0f/sqrtf(HIDDEN); + for (int i = 0; i < HIDDEN*DIM; i++) { W1[i]=sc*(2*drand48()-1); W3[i]=sc*(2*drand48()-1); } + + // Test: fused W1b+W3b backward + // Input: concat(dh1, dh3) [1, HIDDEN*2, 1, SEQ] + // Output: W1^T@dh1 + W3^T@dh3 [1, DIM, 1, SEQ] + // MIL: slice input → 2 convs → add + printf("=== Fused W1b+W3b backward (slice+conv+add) ===\n"); + + NSString *mil = [NSString stringWithFormat: + @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " + "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"9.0\"}})]\n{\n" + " func main(tensor x) {\n" // [1, HIDDEN*2, 1, SEQ] + " string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n" + " tensor x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n" + // Slice: dh1 = x16[:, 0:HIDDEN, :, :], dh3 = x16[:, HIDDEN:2*HIDDEN, :, :] + " tensor b1 = const()[name = string(\"b1\"), val = tensor([0, 0, 0, 0])];\n" + " tensor s1 = const()[name = string(\"s1\"), val = tensor([1, %d, 1, %d])];\n" + " tensor dh1 = slice_by_size(x = x16, begin = b1, size = s1)[name = string(\"sl1\")];\n" + " tensor b3 = const()[name = string(\"b3\"), val = tensor([0, %d, 0, 0])];\n" + " tensor s3 = const()[name = string(\"s3\"), val = tensor([1, %d, 1, %d])];\n" + " tensor dh3 = slice_by_size(x = x16, begin = b3, size = s3)[name = string(\"sl3\")];\n" + // Conv: W1^T @ dh1, W3^T @ dh3 + " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" + " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" + " int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n" + // W1^T: [DIM, HIDDEN, 1, 1] (transposed from [HIDDEN, DIM]) + " tensor W1t = const()[name = string(\"W1t\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w1t.bin\"), offset = uint64(64)))];\n" + " tensor W3t = const()[name = string(\"W3t\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w3t.bin\"), offset = uint64(64)))];\n" + " tensor dx1 = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = W1t, x = dh1)[name = string(\"cv1\")];\n" + " tensor dx3 = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = W3t, x = dh3)[name = string(\"cv3\")];\n" + // Add + " tensor sum = add(x = dx1, y = dx3)[name = string(\"ad\")];\n" + " string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n" + " tensor y = cast(dtype = d2, x = sum)[name = string(\"co\")];\n" + " } -> (y);\n}\n", + HIDDEN*2, SEQ, HIDDEN*2, SEQ, + HIDDEN, SEQ, HIDDEN, SEQ, // slice1 + HIDDEN, HIDDEN, SEQ, HIDDEN, SEQ, // slice3 + DIM, HIDDEN, DIM, HIDDEN, // W1t + DIM, HIDDEN, DIM, HIDDEN, // W3t + DIM, SEQ, DIM, SEQ, // dx1, dx3 + DIM, SEQ, DIM, SEQ]; // sum, y + + NSDictionary *wd = @{ + @"@model_path/weights/w1t.bin": @{@"offset":@0, @"data":build_blob_t(W1, HIDDEN, DIM)}, + @"@model_path/weights/w3t.bin": @{@"offset":@0, @"data":build_blob_t(W3, HIDDEN, DIM)} + }; + + NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, wd, nil); + if (!desc) { printf("desc=NULL\n"); return 1; } + id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); + id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); + NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; + [[NSFileManager defaultManager] createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil]; + [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + for (NSString *path in wd) { + [wd[path][@"data"] writeToFile:[td stringByAppendingPathComponent:[path stringByReplacingOccurrencesOfString:@"@model_path/" withString:@""]] atomically:YES]; + } + + NSError *e = nil; + BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); + printf("Compile: %s\n", ok?"OK":"FAIL"); + if (!ok) { printf(" %s\n", e?[[e description] UTF8String]:""); return 1; } + ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); + printf("Load: %s\n", ok?"OK":"FAIL"); + if (!ok) return 1; + + // Prepare input: concat(dh1, dh3) in channel-first layout + float *dh1 = (float*)malloc(SEQ*HIDDEN*sizeof(float)); + float *dh3 = (float*)malloc(SEQ*HIDDEN*sizeof(float)); + for (int i = 0; i < SEQ*HIDDEN; i++) { dh1[i]=0.01f*sinf(i*0.007f); dh3[i]=0.01f*cosf(i*0.011f); } + + IOSurfaceRef ioI = make_surface(HIDDEN*2*SEQ*4), ioO = make_surface(DIM*SEQ*4); + IOSurfaceLock(ioI, 0, NULL); + float *dst = (float*)IOSurfaceGetBaseAddress(ioI); + // Channel-first: channels 0..HIDDEN-1 = dh1, channels HIDDEN..2*HIDDEN-1 = dh3 + for (int t = 0; t < SEQ; t++) { + for (int c = 0; c < HIDDEN; c++) dst[c*SEQ+t] = dh1[t*HIDDEN+c]; + for (int c = 0; c < HIDDEN; c++) dst[(HIDDEN+c)*SEQ+t] = dh3[t*HIDDEN+c]; + } + IOSurfaceUnlock(ioI, 0, NULL); + + id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI); + id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO); + id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wI], @[@0], @[wO], @[@0], nil, nil, @0); + + ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); + printf("Eval: %s\n", ok?"OK":"FAIL"); + if (!ok) { printf(" %s\n", e?[[e description] UTF8String]:""); return 1; } + + // CPU reference: dx = W1^T @ dh1 + W3^T @ dh3 + float *ref = (float*)calloc(SEQ*DIM, sizeof(float)); + for (int t = 0; t < SEQ; t++) + for (int i = 0; i < DIM; i++) { + float s = 0; + for (int j = 0; j < HIDDEN; j++) { + s += W1[j*DIM+i] * dh1[t*HIDDEN+j]; // W1^T[i,j] = W1[j,i] + s += W3[j*DIM+i] * dh3[t*HIDDEN+j]; + } + ref[t*DIM+i] = s; + } + + IOSurfaceLock(ioO, kIOSurfaceLockReadOnly, NULL); + float *src = (float*)IOSurfaceGetBaseAddress(ioO); + float maxd = 0; + for (int t = 0; t < SEQ; t++) + for (int c = 0; c < DIM; c++) { + float d = fabsf(src[c*SEQ+t] - ref[t*DIM+c]); + if (d > maxd) maxd = d; + } + IOSurfaceUnlock(ioO, kIOSurfaceLockReadOnly, NULL); + printf("dx max diff: %.6f\n", maxd); + + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); + [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; + CFRelease(ioI); CFRelease(ioO); + free(W1); free(W3); free(dh1); free(dh3); free(ref); + printf("\nDONE\n"); + } + return 0; +} diff --git a/training/test_fused_qkv.m b/training/test_fused_qkv.m new file mode 100644 index 0000000..69f41d6 --- /dev/null +++ b/training/test_fused_qkv.m @@ -0,0 +1,265 @@ +// Test: Fused QKV projections in single MIL graph (3 convs → concat output) +// Input: x [1, DIM, 1, SEQ] +// Output: concat(Q, K, V) [1, DIM*3, 1, SEQ] +// 3 convs with separate weights, 1 ANE dispatch +#import +#import +#import +#import +#import +#include + +#define DIM 768 +#define SEQ 64 + +static Class g_D, g_I, g_AR, g_AIO; +static mach_timebase_info_data_t g_tb; +static void ane_init(void) { + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + g_I = NSClassFromString(@"_ANEInMemoryModel"); + g_AR = NSClassFromString(@"_ANERequest"); + g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); +} +static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } +static IOSurfaceRef make_surface(size_t bytes) { + return IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); +} +static NSData *build_blob(const float *w, int oc, int ic) { + int wsize = oc * ic * 2, total = 128 + wsize; + uint8_t *buf = (uint8_t*)calloc(total, 1); + buf[0]=1; buf[4]=2; buf[64]=0xEF; buf[65]=0xBE; buf[66]=0xAD; buf[67]=0xDE; buf[68]=1; + *(uint32_t*)(buf+72)=wsize; *(uint32_t*)(buf+80)=128; + _Float16 *fp16 = (_Float16*)(buf+128); + for (int i = 0; i < oc*ic; i++) fp16[i] = (_Float16)w[i]; // layout A: row-major [oc, ic] + return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; +} + +typedef struct { id model; NSString *td; } Kern; +static Kern compile_mil(NSString *mil, NSDictionary *wd) { + Kern k = {nil, nil}; + NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, wd ?: @{}, nil); + if (!desc) { printf("desc=NULL\n"); return k; } + id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); + id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); + NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; + [[NSFileManager defaultManager] createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] + withIntermediateDirectories:YES attributes:nil error:nil]; + [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + for (NSString *path in wd) { + [wd[path][@"data"] writeToFile:[td stringByAppendingPathComponent: + [path stringByReplacingOccurrencesOfString:@"@model_path/" withString:@""]] atomically:YES]; + } + NSError *e = nil; + if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) { + printf("compile FAIL: %s\n", e?[[e localizedDescription] UTF8String]:""); return k; + } + ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); + k.model = mdl; k.td = td; + return k; +} +static BOOL ane_eval(Kern *k, IOSurfaceRef *ins, int nin, IOSurfaceRef out) { + NSMutableArray *inArr = [NSMutableArray array], *inIdx = [NSMutableArray array]; + for (int i = 0; i < nin; i++) { + [inArr addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ins[i])]; + [inIdx addObject:@(i)]; + } + id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), out); + id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + inArr, inIdx, @[wO], @[@0], nil, nil, @0); + NSError *e = nil; + return ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + k->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); +} +static void cleanup_kern(Kern *k) { + if (!k->model) return; + NSError *e = nil; + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(k->model, @selector(unloadWithQoS:error:), 21, &e); + [[NSFileManager defaultManager] removeItemAtPath:k->td error:nil]; +} + +// Fused QKV: 3 convs + concat in one MIL +static NSString *gen_fused_qkv_mil(void) { + return [NSString stringWithFormat: + @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " + "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"9.0\"}})]\n{\n" + " func main(tensor x) {\n" + " string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n" + " tensor x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n" + " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" + " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" + " int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n" + " tensor Wq = const()[name = string(\"Wq\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/wq.bin\"), offset = uint64(64)))];\n" + " tensor Wk = const()[name = string(\"Wk\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/wk.bin\"), offset = uint64(64)))];\n" + " tensor Wv = const()[name = string(\"Wv\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/wv.bin\"), offset = uint64(64)))];\n" + " tensor q = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = Wq, x = x16)[name = string(\"cq\")];\n" + " tensor k = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = Wk, x = x16)[name = string(\"ck\")];\n" + " tensor v = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = Wv, x = x16)[name = string(\"cv\")];\n" + " int32 ax = const()[name = string(\"ax\"), val = int32(1)];\n" + " bool inter = const()[name = string(\"il\"), val = bool(false)];\n" + " tensor qkv = concat(axis = ax, interleave = inter, values = (q, k, v))[name = string(\"cat\")];\n" + " string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n" + " tensor y = cast(dtype = d2, x = qkv)[name = string(\"co\")];\n" + " } -> (y);\n}\n", + DIM, SEQ, DIM, SEQ, + DIM, DIM, DIM, DIM, // Wq + DIM, DIM, DIM, DIM, // Wk + DIM, DIM, DIM, DIM, // Wv + DIM, SEQ, // q + DIM, SEQ, // k + DIM, SEQ, // v + DIM*3, SEQ, // concat + DIM*3, SEQ]; // output +} + +// Single conv MIL for comparison +static NSString *gen_single_mil(void) { + return [NSString stringWithFormat: + @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " + "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"9.0\"}})]\n{\n" + " func main(tensor x) {\n" + " string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n" + " tensor x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n" + " tensor W = const()[name = string(\"W\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w.bin\"), offset = uint64(64)))];\n" + " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" + " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" + " int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n" + " tensor y16 = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n" + " string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n" + " tensor y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n" + " } -> (y);\n}\n", + DIM, SEQ, DIM, SEQ, DIM, DIM, DIM, DIM, DIM, SEQ, DIM, SEQ]; +} + +int main() { + @autoreleasepool { + setbuf(stdout, NULL); + ane_init(); + mach_timebase_info(&g_tb); + + printf("=== Fused QKV vs 3x Separate Convs ===\n"); + printf("DIM=%d SEQ=%d\n\n", DIM, SEQ); + + srand48(42); + float *Wq = (float*)malloc(DIM*DIM*sizeof(float)); + float *Wk = (float*)malloc(DIM*DIM*sizeof(float)); + float *Wv = (float*)malloc(DIM*DIM*sizeof(float)); + float sc = 1.0f/sqrtf(DIM); + for (int i = 0; i < DIM*DIM; i++) { Wq[i]=sc*(2*drand48()-1); Wk[i]=sc*(2*drand48()-1); Wv[i]=sc*(2*drand48()-1); } + + float *x = (float*)malloc(SEQ*DIM*sizeof(float)); + for (int i = 0; i < SEQ*DIM; i++) x[i] = 0.1f*(2*drand48()-1); + + // === Compile fused QKV === + NSDictionary *fused_wd = @{ + @"@model_path/weights/wq.bin": @{@"offset":@0, @"data":build_blob(Wq, DIM, DIM)}, + @"@model_path/weights/wk.bin": @{@"offset":@0, @"data":build_blob(Wk, DIM, DIM)}, + @"@model_path/weights/wv.bin": @{@"offset":@0, @"data":build_blob(Wv, DIM, DIM)}, + }; + Kern kFused = compile_mil(gen_fused_qkv_mil(), fused_wd); + printf("Fused QKV: %s\n", kFused.model ? "OK" : "FAIL"); + + // === Compile 3 separate === + Kern kQ = compile_mil(gen_single_mil(), @{@"@model_path/weights/w.bin": @{@"offset":@0, @"data":build_blob(Wq, DIM, DIM)}}); + Kern kK = compile_mil(gen_single_mil(), @{@"@model_path/weights/w.bin": @{@"offset":@0, @"data":build_blob(Wk, DIM, DIM)}}); + Kern kV = compile_mil(gen_single_mil(), @{@"@model_path/weights/w.bin": @{@"offset":@0, @"data":build_blob(Wv, DIM, DIM)}}); + printf("Separate Q,K,V: %s %s %s\n", kQ.model?"OK":"FAIL", kK.model?"OK":"FAIL", kV.model?"OK":"FAIL"); + + if (!kFused.model || !kQ.model) goto done; + + // IOSurfaces + size_t in_bytes = DIM*SEQ*4, out1_bytes = DIM*SEQ*4, out3_bytes = DIM*3*SEQ*4; + IOSurfaceRef ioIn = make_surface(in_bytes); + IOSurfaceRef ioFused = make_surface(out3_bytes); + IOSurfaceRef ioQ = make_surface(out1_bytes), ioK = make_surface(out1_bytes), ioV = make_surface(out1_bytes); + + IOSurfaceLock(ioIn, 0, NULL); + float *dst = (float*)IOSurfaceGetBaseAddress(ioIn); + for (int t = 0; t < SEQ; t++) + for (int c = 0; c < DIM; c++) + dst[c*SEQ+t] = x[t*DIM+c]; + IOSurfaceUnlock(ioIn, 0, NULL); + + // Eval fused + IOSurfaceRef ins[] = {ioIn}; + ane_eval(&kFused, ins, 1, ioFused); + // Eval separate + ane_eval(&kQ, ins, 1, ioQ); + ane_eval(&kK, ins, 1, ioK); + ane_eval(&kV, ins, 1, ioV); + + // Compare fused output (concat Q,K,V) vs separate + IOSurfaceLock(ioFused, kIOSurfaceLockReadOnly, NULL); + IOSurfaceLock(ioQ, kIOSurfaceLockReadOnly, NULL); + IOSurfaceLock(ioK, kIOSurfaceLockReadOnly, NULL); + IOSurfaceLock(ioV, kIOSurfaceLockReadOnly, NULL); + float *fo = (float*)IOSurfaceGetBaseAddress(ioFused); + float *qo = (float*)IOSurfaceGetBaseAddress(ioQ); + float *ko = (float*)IOSurfaceGetBaseAddress(ioK); + float *vo = (float*)IOSurfaceGetBaseAddress(ioV); + float dq=0, dk=0, dv=0; + for (int c = 0; c < DIM; c++) + for (int t = 0; t < SEQ; t++) { + float d1 = fabsf(fo[c*SEQ+t] - qo[c*SEQ+t]); if(d1>dq) dq=d1; + float d2 = fabsf(fo[(DIM+c)*SEQ+t] - ko[c*SEQ+t]); if(d2>dk) dk=d2; + float d3 = fabsf(fo[(DIM*2+c)*SEQ+t] - vo[c*SEQ+t]); if(d3>dv) dv=d3; + } + IOSurfaceUnlock(ioFused, kIOSurfaceLockReadOnly, NULL); + IOSurfaceUnlock(ioQ, kIOSurfaceLockReadOnly, NULL); + IOSurfaceUnlock(ioK, kIOSurfaceLockReadOnly, NULL); + IOSurfaceUnlock(ioV, kIOSurfaceLockReadOnly, NULL); + printf("\nFused vs Separate: dQ=%.6f dK=%.6f dV=%.6f → %s\n", + dq, dk, dv, (dq<0.001f && dk<0.001f && dv<0.001f) ? "PASS" : "FAIL"); + + // === Benchmark === + printf("\n=== Benchmark ===\n"); + int N = 500; + // Warmup + for (int i = 0; i < 20; i++) { ane_eval(&kFused, ins, 1, ioFused); ane_eval(&kQ, ins, 1, ioQ); } + + uint64_t t0 = mach_absolute_time(); + for (int i = 0; i < N; i++) ane_eval(&kFused, ins, 1, ioFused); + double ms_fused = tb_ms(mach_absolute_time() - t0); + + t0 = mach_absolute_time(); + for (int i = 0; i < N; i++) { + ane_eval(&kQ, ins, 1, ioQ); + ane_eval(&kK, ins, 1, ioK); + ane_eval(&kV, ins, 1, ioV); + } + double ms_sep = tb_ms(mach_absolute_time() - t0); + + double flops_one = 2.0 * DIM * DIM * SEQ; + printf("Fused QKV (1 dispatch, 3 convs): %.3f ms/iter %.1f GFLOPS\n", + ms_fused/N, N*3*flops_one/ms_fused/1e6); + printf("Separate Q+K+V (3 dispatches): %.3f ms/iter %.1f GFLOPS\n", + ms_sep/N, N*3*flops_one/ms_sep/1e6); + printf("Speedup: %.2fx\n", ms_sep/ms_fused); + + CFRelease(ioIn); CFRelease(ioFused); CFRelease(ioQ); CFRelease(ioK); CFRelease(ioV); + free(Wq); free(Wk); free(Wv); free(x); + done: + cleanup_kern(&kFused); cleanup_kern(&kQ); cleanup_kern(&kK); cleanup_kern(&kV); + printf("\nDONE\n"); + } + return 0; +} diff --git a/training/tiny_train.m b/training/tiny_train.m new file mode 100644 index 0000000..e1e9d7d --- /dev/null +++ b/training/tiny_train.m @@ -0,0 +1,593 @@ +// tiny_train.m — Train a 2-layer linear model on ANE (forward AND backward) +// y = W2 @ relu(W1 @ x), MSE loss, SGD update +// Pipeline: compile next kernels on background thread while ANE runs current batch +// Bypasses ANE 119-compile limit via exec() self-restart +#import +#import +#import +#import +#import +#import +#include +#include +#include + +static Class g_D, g_I, g_AR, g_AIO; + +static void ane_init(void) { + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + g_I = NSClassFromString(@"_ANEInMemoryModel"); + g_AR = NSClassFromString(@"_ANERequest"); + g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); +} + +static IOSurfaceRef make_surface(size_t bytes) { + return IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); +} + +static NSData *build_blob(const float *w, int rows, int cols) { + int wsize = rows * cols * 2; + int total = 128 + wsize; + uint8_t *buf = (uint8_t*)calloc(total, 1); + buf[0] = 0x01; buf[4] = 0x02; + buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE; + buf[68] = 0x01; + *(uint32_t*)(buf+72) = wsize; + *(uint32_t*)(buf+80) = 128; + _Float16 *fp16 = (_Float16*)(buf + 128); + for (int i = 0; i < rows * cols; i++) fp16[i] = (_Float16)w[i]; + return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; +} + +static NSData *build_blob_transposed(const float *w, int rows, int cols) { + int wsize = cols * rows * 2; + int total = 128 + wsize; + uint8_t *buf = (uint8_t*)calloc(total, 1); + buf[0] = 0x01; buf[4] = 0x02; + buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE; + buf[68] = 0x01; + *(uint32_t*)(buf+72) = wsize; + *(uint32_t*)(buf+80) = 128; + _Float16 *fp16 = (_Float16*)(buf + 128); + for (int i = 0; i < rows; i++) + for (int j = 0; j < cols; j++) + fp16[j * rows + i] = (_Float16)w[i * cols + j]; + return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; +} + +static NSString *gen_conv_mil(int in_ch, int out_ch, int sp) { + return [NSString stringWithFormat: + @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " + "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"9.0\"}})]\n{\n" + " func main(tensor x) {\n" + " string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n" + " tensor x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n" + " tensor W = const()[name = string(\"W\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n" + " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" + " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" + " int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n" + " tensor y16 = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n" + " string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n" + " tensor y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n" + " } -> (y);\n}\n", + in_ch, sp, in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp, out_ch, sp]; +} + +typedef struct { + void *model; // CFBridgingRetain'd _ANEInMemoryModel + IOSurfaceRef ioIn, ioOut; + void *request; // CFBridgingRetain'd _ANERequest + void *tmpDir; // CFBridgingRetain'd NSString +} Kern; + +static int g_compile_count = 0; + +static Kern *compile_kern_with_blob(NSData *blob, int in_ch, int out_ch, int sp) { + @autoreleasepool { + NSString *mil = gen_conv_mil(in_ch, out_ch, sp); + NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding]; + NSDictionary *wd = @{@"@model_path/weights/weight.bin":@{@"offset":@0,@"data":blob}}; + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), milData, wd, nil); + if (!desc) return NULL; + id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); + id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); + NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; + NSFileManager *fm = [NSFileManager defaultManager]; + [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil]; + [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + [blob writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; + NSError *e = nil; + if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) return NULL; + if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) return NULL; + __sync_fetch_and_add(&g_compile_count, 1); + size_t inB = in_ch * sp * 4, outB = out_ch * sp * 4; + IOSurfaceRef ioI = make_surface(inB), ioO = make_surface(outB); + id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI); + id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO); + id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wI], @[@0], @[wO], @[@0], nil, nil, @0); + Kern *k = calloc(1, sizeof(Kern)); + k->model = CFBridgingRetain(mdl); + k->ioIn = ioI; k->ioOut = ioO; + k->request = CFBridgingRetain(req); + k->tmpDir = CFBridgingRetain(td); + return k; + } +} + +static void free_kern(Kern *k) { + if (!k) return; + id mdl = (__bridge id)k->model; + NSError *e = nil; + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); + CFRelease(k->ioIn); CFRelease(k->ioOut); + NSString *td = (__bridge id)k->tmpDir; + [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; + CFRelease(k->model); + CFRelease(k->request); + CFRelease(k->tmpDir); + free(k); +} + +static void ane_eval_k(Kern *k, const float *in, float *out, int in_ch, int out_ch, int sp) { + float *tmp = (float*)malloc(in_ch * sp * sizeof(float)); + for (int t = 0; t < sp; t++) + for (int c = 0; c < in_ch; c++) + tmp[c*sp + t] = in[t*in_ch + c]; + IOSurfaceLock(k->ioIn, 0, NULL); + memcpy(IOSurfaceGetBaseAddress(k->ioIn), tmp, in_ch * sp * sizeof(float)); + IOSurfaceUnlock(k->ioIn, 0, NULL); + free(tmp); + NSError *e = nil; + id mdl = (__bridge id)k->model; + id req = (__bridge id)k->request; + ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); + float *tmp2 = (float*)malloc(out_ch * sp * sizeof(float)); + IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL); + memcpy(tmp2, IOSurfaceGetBaseAddress(k->ioOut), out_ch * sp * sizeof(float)); + IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL); + for (int t = 0; t < sp; t++) + for (int c = 0; c < out_ch; c++) + out[t*out_ch + c] = tmp2[c*sp + t]; + free(tmp2); +} + +// === Checkpoint: save/restore training state for exec() restart === +#define CKPT_PATH "/tmp/ane_train_ckpt.bin" + +typedef struct { + int step; + float loss; + int D, H, S, total_steps; + float lr; + double cum_compile_ms, cum_train_ms, cum_wall_ms; + int cum_steps, cum_batches; +} CkptHeader; + +static void save_checkpoint(const char *path, int step, float loss, + int D, int H, int S, int total_steps, float lr, + const float *W1, const float *W2, + double cc, double ct, double cw, int cs, int cb) { + FILE *f = fopen(path, "wb"); + CkptHeader hdr = {step, loss, D, H, S, total_steps, lr, cc, ct, cw, cs, cb}; + fwrite(&hdr, sizeof(hdr), 1, f); + fwrite(W1, sizeof(float), H * D, f); + fwrite(W2, sizeof(float), D * H, f); + fclose(f); +} + +static bool load_checkpoint(const char *path, CkptHeader *hdr, + float *W1, float *W2, int H, int D) { + FILE *f = fopen(path, "rb"); + if (!f) return false; + fread(hdr, sizeof(CkptHeader), 1, f); + fread(W1, sizeof(float), H * D, f); + fread(W2, sizeof(float), D * H, f); + fclose(f); + return true; +} + +#define MAX_COMPILES 100 +#define KERNELS_PER_STEP 4 +#define ACCUM_STEPS 10 + +// === Pipeline: background compile via GCD === +typedef struct { + Kern *k1_fwd, *k2_fwd, *k1_bwd, *k2_bwd; + float *W1, *W2; + int D, H, S; + bool ok; + double compile_ms; +} PipelineCompile; + +static double tb_to_ms(uint64_t elapsed, mach_timebase_info_data_t tb) { + return (double)elapsed * tb.numer / tb.denom / 1e6; +} + +static mach_timebase_info_data_t g_tb; +// Serial queue ensures ANE compiles don't overlap with each other +static dispatch_queue_t g_compile_queue; + +int main(int argc, char *argv[]) { + @autoreleasepool { + setbuf(stdout, NULL); + ane_init(); + mach_timebase_info(&g_tb); + g_compile_queue = dispatch_queue_create("ane.compile", DISPATCH_QUEUE_SERIAL); + + int D = 64, H = 128, S = 16; + int total_steps = 2000; + float lr = 1.0f; + int start_step = 0; + bool resuming = false; + + float *W1 = (float*)malloc(H * D * sizeof(float)); + float *W2 = (float*)malloc(D * H * sizeof(float)); + + if (argc > 1 && strcmp(argv[1], "--resume") == 0) { + CkptHeader hdr; + if (load_checkpoint(CKPT_PATH, &hdr, W1, W2, H, D)) { + start_step = hdr.step; + total_steps = hdr.total_steps; + lr = hdr.lr; + resuming = true; + printf("[RESUMED at step %d, loss=%.6f, compiles reset]\n", start_step, hdr.loss); + } + } + + // Cumulative stats (restored from checkpoint if resuming) + double cum_compile_ms = 0, cum_train_ms = 0, cum_wall_ms = 0; + int cum_steps = 0, cum_batches = 0; + if (resuming) { + CkptHeader hdr2; + FILE *f = fopen(CKPT_PATH, "rb"); + if (f) { fread(&hdr2, sizeof(hdr2), 1, f); fclose(f); + cum_compile_ms = hdr2.cum_compile_ms; + cum_train_ms = hdr2.cum_train_ms; + cum_wall_ms = hdr2.cum_wall_ms; + cum_steps = hdr2.cum_steps; + cum_batches = hdr2.cum_batches; + } + } + + // FLOPs calculation + // Forward: W1[H,D] @ x[D,S] = 2*H*D*S, W2[D,H] @ h[H,S] = 2*D*H*S → total fwd = 4*D*H*S + // Backward dx: W2^T[H,D] @ dy[D,S] = 2*H*D*S, W1^T[D,H] @ dh[H,S] = 2*D*H*S → total bwd = 4*D*H*S + // dW (CPU): dW2[D,H] = dy[D,S] @ h^T[S,H] = 2*D*S*H, dW1 same → total dW = 4*D*H*S + // ANE FLOPs per step = 8*D*H*S (fwd + bwd on ANE) + // CPU FLOPs per step = 4*D*H*S (dW accumulation) + // Total FLOPs per step = 12*D*H*S + double ane_flops_per_step = 8.0 * D * H * S; + double cpu_flops_per_step = 4.0 * D * H * S; + double total_flops_per_step = ane_flops_per_step + cpu_flops_per_step; + double weight_bytes = (H*D + D*H) * 2.0; // FP16 weights on ANE + + if (!resuming) { + for (int i = 0; i < H*D; i++) W1[i] = 0.01f * sinf(i * 1.3f + 0.7f); + for (int i = 0; i < D*H; i++) W2[i] = 0.01f * cosf(i * 0.9f + 1.1f); + printf("=== ANE Training: Pipeline Parallel + Grad Accumulation ===\n"); + printf("x:[%d,%d] -> W1:[%d,%d] -> ReLU -> W2:[%d,%d] -> y:[%d,%d]\n", S,D, H,D, D,H, S,D); + printf("Accum %d steps per recompile | Pipeline: compile overlaps ANE eval\n", ACCUM_STEPS); + printf("ANE FP16 peak: 15.8 TFLOPS (M4) | Weights: %.1f KB\n\n", weight_bytes/1024.0); + printf("FLOPs/step: ANE=%.0f (fwd+bwd) CPU=%.0f (dW) Total=%.0f\n", + ane_flops_per_step, cpu_flops_per_step, total_flops_per_step); + printf("Steps: %d, LR: %.4f, exec() budget: %d compiles\n\n", + total_steps, lr, MAX_COMPILES); + } + + float *x = (float*)calloc(S * D, sizeof(float)); + float *y_target = (float*)calloc(S * D, sizeof(float)); + for (int t = 0; t < S; t++) + for (int i = 0; i < D; i++) { + float v = sinf((t * D + i) * 0.1f); + x[t*D + i] = v; + y_target[t*D + i] = v; + } + + float *h = (float*)malloc(S * H * sizeof(float)); + float *h_relu = (float*)malloc(S * H * sizeof(float)); + float *y = (float*)malloc(S * D * sizeof(float)); + float *dy = (float*)malloc(S * D * sizeof(float)); + float *dh_relu = (float*)malloc(S * H * sizeof(float)); + float *dh = (float*)malloc(S * H * sizeof(float)); + float *dx_layer = (float*)malloc(S * D * sizeof(float)); + + Kern *k1_fwd = NULL, *k2_fwd = NULL; + Kern *k1_bwd = NULL, *k2_bwd = NULL; + float last_loss = 999.0f; + + // Stats + double total_compile_ms = 0, total_train_ms = 0, total_wall_ms = 0; + double total_hidden_compile_ms = 0; // compile time hidden by pipeline + int total_batches = 0; + int total_steps_done = 0; + uint64_t t_wall_start = mach_absolute_time(); + + // First compile is synchronous (no pipeline yet) + { + uint64_t t0 = mach_absolute_time(); + k1_fwd = compile_kern_with_blob(build_blob(W1, H, D), D, H, S); + k2_fwd = compile_kern_with_blob(build_blob(W2, D, H), H, D, S); + k2_bwd = compile_kern_with_blob(build_blob_transposed(W2, D, H), D, H, S); + k1_bwd = compile_kern_with_blob(build_blob_transposed(W1, H, D), H, D, S); + double cms = tb_to_ms(mach_absolute_time() - t0, g_tb); + total_compile_ms += cms; + if (!k1_fwd || !k2_fwd || !k1_bwd || !k2_bwd) { + printf("Initial compile failed!\n"); return 1; + } + printf("Initial compile: %.0fms\n", cms); + } + + int step = start_step; + while (step < total_steps) { + // Check compile budget + if (g_compile_count + KERNELS_PER_STEP > MAX_COMPILES) { + free_kern(k1_fwd); free_kern(k2_fwd); + free_kern(k1_bwd); free_kern(k2_bwd); + save_checkpoint(CKPT_PATH, step, last_loss, D, H, S, total_steps, lr, W1, W2, + cum_compile_ms + total_compile_ms, cum_train_ms + total_train_ms, + cum_wall_ms + tb_to_ms(mach_absolute_time() - t_wall_start, g_tb), + cum_steps + total_steps_done, cum_batches + total_batches); + double wall = tb_to_ms(mach_absolute_time() - t_wall_start, g_tb); + printf("[exec() restart at step %d, %d compiles, loss=%.6f, wall=%.0fms]\n", + step, g_compile_count, last_loss, wall); + fflush(stdout); + execl(argv[0], argv[0], "--resume", NULL); + perror("execl failed"); return 1; + } + + // === Run ACCUM_STEPS with current kernels === + float *aW1 = (float*)calloc(H * D, sizeof(float)); + float *aW2 = (float*)calloc(D * H, sizeof(float)); + int steps_this_batch = 0; + + // Pipeline: start compiling NEXT batch's kernels in background + // We'll apply gradients first, then launch compile with updated W + // But for pipeline, we compile AHEAD: while running batch N, compile for N+1 + // So we need to update weights BEFORE launching background compile + + uint64_t t_batch = mach_absolute_time(); + for (int a = 0; a < ACCUM_STEPS && step < total_steps; a++, step++) { + ane_eval_k(k1_fwd, x, h, D, H, S); + for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0; + ane_eval_k(k2_fwd, h_relu, y, H, D, S); + + float loss = 0; + for (int i = 0; i < S*D; i++) { + float diff = y[i] - y_target[i]; + loss += diff * diff; + dy[i] = 2.0f * diff / (S * D); + } + loss /= (S * D); + last_loss = loss; + + ane_eval_k(k2_bwd, dy, dh_relu, D, H, S); + for (int i = 0; i < S*H; i++) dh[i] = h[i] > 0 ? dh_relu[i] : 0; + ane_eval_k(k1_bwd, dh, dx_layer, H, D, S); + + for (int t = 0; t < S; t++) + for (int i = 0; i < D; i++) + for (int j = 0; j < H; j++) + aW2[i*H + j] += dy[t*D + i] * h_relu[t*H + j]; + for (int t = 0; t < S; t++) + for (int i = 0; i < H; i++) + for (int j = 0; j < D; j++) + aW1[i*D + j] += dh[t*H + i] * x[t*D + j]; + + steps_this_batch++; + } + double batch_ms = tb_to_ms(mach_absolute_time() - t_batch, g_tb); + total_train_ms += batch_ms; + + // Apply accumulated gradients + float scale = 1.0f / steps_this_batch; + for (int i = 0; i < H*D; i++) W1[i] -= lr * aW1[i] * scale; + for (int i = 0; i < D*H; i++) W2[i] -= lr * aW2[i] * scale; + free(aW1); free(aW2); + + total_steps_done += steps_this_batch; + total_batches++; + + // Print progress + double step_ms = batch_ms / steps_this_batch; + double ane_gflops = (ane_flops_per_step * steps_this_batch) / (batch_ms * 1e6); + double total_gflops = (total_flops_per_step * steps_this_batch) / (batch_ms * 1e6); + + if (total_batches % 5 == 1 || total_batches <= 2 || step >= total_steps) { + printf("step %-5d loss=%-10.6f %5.1fms/step ANE=%.2f GFLOPS total=%.2f GFLOPS compiles=%d\n", + step - steps_this_batch, last_loss, step_ms, ane_gflops, total_gflops, g_compile_count); + } + + // Pipeline: launch background compile with updated weights, + // then immediately start NEXT batch's ANE evals with OLD kernels + // while compile runs concurrently on GCD queue + bool can_pipeline = (step < total_steps) && (g_compile_count + KERNELS_PER_STEP <= MAX_COMPILES); + + if (can_pipeline) { + // Snapshot weights for background compile + PipelineCompile *pc = calloc(1, sizeof(PipelineCompile)); + pc->W1 = (float*)malloc(H * D * sizeof(float)); + pc->W2 = (float*)malloc(D * H * sizeof(float)); + memcpy(pc->W1, W1, H * D * sizeof(float)); + memcpy(pc->W2, W2, D * H * sizeof(float)); + pc->D = D; pc->H = H; pc->S = S; + + dispatch_semaphore_t sem = dispatch_semaphore_create(0); + + dispatch_async(g_compile_queue, ^{ + @autoreleasepool { + uint64_t t0 = mach_absolute_time(); + pc->k1_fwd = compile_kern_with_blob(build_blob(pc->W1, pc->H, pc->D), pc->D, pc->H, pc->S); + pc->k2_fwd = compile_kern_with_blob(build_blob(pc->W2, pc->D, pc->H), pc->H, pc->D, pc->S); + pc->k2_bwd = compile_kern_with_blob(build_blob_transposed(pc->W2, pc->D, pc->H), pc->D, pc->H, pc->S); + pc->k1_bwd = compile_kern_with_blob(build_blob_transposed(pc->W1, pc->H, pc->D), pc->H, pc->D, pc->S); + pc->compile_ms = tb_to_ms(mach_absolute_time() - t0, g_tb); + pc->ok = pc->k1_fwd && pc->k2_fwd && pc->k1_bwd && pc->k2_bwd; + dispatch_semaphore_signal(sem); + } + }); + + // === While compile runs in background, do ANOTHER batch with OLD kernels === + if (step < total_steps && k1_fwd && k2_fwd && k1_bwd && k2_bwd) { + float *aW1b = (float*)calloc(H * D, sizeof(float)); + float *aW2b = (float*)calloc(D * H, sizeof(float)); + int steps_overlap = 0; + uint64_t t_overlap = mach_absolute_time(); + + for (int a = 0; a < ACCUM_STEPS && step < total_steps; a++, step++) { + ane_eval_k(k1_fwd, x, h, D, H, S); + for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0; + ane_eval_k(k2_fwd, h_relu, y, H, D, S); + + float loss = 0; + for (int i = 0; i < S*D; i++) { + float diff = y[i] - y_target[i]; + loss += diff * diff; + dy[i] = 2.0f * diff / (S * D); + } + loss /= (S * D); + last_loss = loss; + + ane_eval_k(k2_bwd, dy, dh_relu, D, H, S); + for (int i = 0; i < S*H; i++) dh[i] = h[i] > 0 ? dh_relu[i] : 0; + ane_eval_k(k1_bwd, dh, dx_layer, H, D, S); + + for (int t = 0; t < S; t++) + for (int i = 0; i < D; i++) + for (int j = 0; j < H; j++) + aW2b[i*H + j] += dy[t*D + i] * h_relu[t*H + j]; + for (int t = 0; t < S; t++) + for (int i = 0; i < H; i++) + for (int j = 0; j < D; j++) + aW1b[i*D + j] += dh[t*H + i] * x[t*D + j]; + steps_overlap++; + } + double overlap_ms = tb_to_ms(mach_absolute_time() - t_overlap, g_tb); + total_train_ms += overlap_ms; + total_steps_done += steps_overlap; + total_batches++; + + // Apply these gradients with reduced LR (stale weights — 1 batch behind) + float sc = 0.5f / steps_overlap; // half LR for stale batch + for (int i = 0; i < H*D; i++) W1[i] -= lr * aW1b[i] * sc; + for (int i = 0; i < D*H; i++) W2[i] -= lr * aW2b[i] * sc; + free(aW1b); free(aW2b); + + if (total_batches % 5 == 1) { + double sm = overlap_ms / steps_overlap; + printf("step %-5d loss=%-10.6f %5.1fms/step (overlapped with compile) compiles=%d\n", + step - steps_overlap, last_loss, sm, g_compile_count); + } + } + + // Wait for compile to finish + dispatch_semaphore_wait(sem, DISPATCH_TIME_FOREVER); + total_compile_ms += pc->compile_ms; + total_hidden_compile_ms += pc->compile_ms; // all hidden behind train + + free_kern(k1_fwd); free_kern(k2_fwd); + free_kern(k1_bwd); free_kern(k2_bwd); + + if (pc->ok) { + k1_fwd = pc->k1_fwd; k2_fwd = pc->k2_fwd; + k1_bwd = pc->k1_bwd; k2_bwd = pc->k2_bwd; + } else { + k1_fwd = k2_fwd = k1_bwd = k2_bwd = NULL; + } + free(pc->W1); free(pc->W2); free(pc); + } else if (step < total_steps) { + // Synchronous compile (no budget for pipeline) + uint64_t t0 = mach_absolute_time(); + free_kern(k1_fwd); free_kern(k2_fwd); + free_kern(k1_bwd); free_kern(k2_bwd); + k1_fwd = compile_kern_with_blob(build_blob(W1, H, D), D, H, S); + k2_fwd = compile_kern_with_blob(build_blob(W2, D, H), H, D, S); + k2_bwd = compile_kern_with_blob(build_blob_transposed(W2, D, H), D, H, S); + k1_bwd = compile_kern_with_blob(build_blob_transposed(W1, H, D), H, D, S); + double cms = tb_to_ms(mach_absolute_time() - t0, g_tb); + total_compile_ms += cms; + if (!k1_fwd || !k2_fwd || !k1_bwd || !k2_bwd) { + save_checkpoint(CKPT_PATH, step, last_loss, D, H, S, total_steps, lr, W1, W2, + cum_compile_ms + total_compile_ms, cum_train_ms + total_train_ms, + cum_wall_ms + tb_to_ms(mach_absolute_time() - t_wall_start, g_tb), + cum_steps + total_steps_done, cum_batches + total_batches); + fflush(stdout); + execl(argv[0], argv[0], "--resume", NULL); + perror("execl failed"); return 1; + } + } + + if (last_loss < 1e-6f) { printf("\nConverged at step %d!\n", step); break; } + } + + total_wall_ms = tb_to_ms(mach_absolute_time() - t_wall_start, g_tb); + // Add cumulative from previous exec() runs + total_compile_ms += cum_compile_ms; + total_train_ms += cum_train_ms; + total_wall_ms += cum_wall_ms; + total_steps_done += cum_steps; + total_batches += cum_batches; + + // === Final output === + printf("\nFinal output vs target (first 8):\n"); + if (k1_fwd && k2_fwd) { + ane_eval_k(k1_fwd, x, h, D, H, S); + for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0; + ane_eval_k(k2_fwd, h_relu, y, H, D, S); + } + printf(" y: "); for (int i = 0; i < 8; i++) printf("%.4f ", y[i]); printf("\n"); + printf(" target: "); for (int i = 0; i < 8; i++) printf("%.4f ", y_target[i]); printf("\n"); + + // === Efficiency Report === + printf("\n=== Efficiency Report ===\n"); + printf("Total steps: %d\n", total_steps_done); + printf("Total batches: %d (accum %d steps each)\n", total_batches, ACCUM_STEPS); + printf("Wall time: %.0f ms\n", total_wall_ms); + printf("Compile time: %.0f ms (%.1f%%)\n", total_compile_ms, 100.0*total_compile_ms/total_wall_ms); + printf("Train time: %.0f ms (%.1f%%)\n", total_train_ms, 100.0*total_train_ms/total_wall_ms); + printf("Overhead: %.0f ms (%.1f%%)\n", + total_wall_ms - total_compile_ms - total_train_ms, + 100.0*(total_wall_ms - total_compile_ms - total_train_ms)/total_wall_ms); + printf("\n"); + printf("Avg compile: %.1f ms per batch (4 kernels)\n", total_compile_ms / total_batches); + printf("Avg train: %.2f ms per step (ANE fwd+bwd + CPU dW)\n", total_train_ms / total_steps_done); + printf("Avg wall/step: %.2f ms\n", total_wall_ms / total_steps_done); + printf("\n"); + double ane_total_flops = ane_flops_per_step * total_steps_done; + double cpu_total_flops = cpu_flops_per_step * total_steps_done; + printf("ANE FLOPs total: %.3f MFLOP (%.2f GFLOPS sustained)\n", + ane_total_flops / 1e6, ane_total_flops / (total_train_ms * 1e6)); + printf("CPU FLOPs total: %.3f MFLOP (%.2f GFLOPS sustained)\n", + cpu_total_flops / 1e6, cpu_total_flops / (total_train_ms * 1e6)); + printf("Total FLOPs: %.3f MFLOP (%.2f GFLOPS sustained)\n", + (ane_total_flops + cpu_total_flops) / 1e6, + (ane_total_flops + cpu_total_flops) / (total_train_ms * 1e6)); + printf("\n"); + printf("ANE utilization: %.4f%% of 15.8 TFLOPS peak\n", + 100.0 * ane_total_flops / (total_train_ms * 1e6) / 15800.0); + printf("Weight params: %d (%.1f KB FP16)\n", + H*D + D*H, weight_bytes / 1024.0); + printf("Compile amortization: %.1f ms compile / %d steps = %.2f ms/step overhead\n", + total_compile_ms / total_batches, ACCUM_STEPS, + total_compile_ms / total_batches / ACCUM_STEPS); + printf("Compile fraction: %.1f%% of wall time\n", 100.0 * total_compile_ms / total_wall_ms); + printf("Train fraction: %.1f%% of wall time (useful work)\n", 100.0 * total_train_ms / total_wall_ms); + + free_kern(k1_fwd); free_kern(k2_fwd); free_kern(k1_bwd); free_kern(k2_bwd); + free(W1); free(W2); free(x); free(y_target); + free(h); free(h_relu); free(y); free(dy); free(dh_relu); free(dh); free(dx_layer); + unlink(CKPT_PATH); + } + return 0; +} diff --git a/training/tiny_train_old.m b/training/tiny_train_old.m new file mode 100644 index 0000000..c22a90c --- /dev/null +++ b/training/tiny_train_old.m @@ -0,0 +1,309 @@ +// tiny_train.m — Train a 2-layer linear model on ANE (forward AND backward) +// y = W2 @ relu(W1 @ x), MSE loss, SGD update +// Forward: ANE conv with baked weights +// Backward dx: ANE conv with transposed baked weights +// Backward dW: CPU (outer product, memory-bound) +#import +#import +#import +#import +#import +#import +#include + +static Class g_D, g_I, g_AR, g_AIO; + +static void ane_init(void) { + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + g_I = NSClassFromString(@"_ANEInMemoryModel"); + g_AR = NSClassFromString(@"_ANERequest"); + g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); +} + +static IOSurfaceRef make_surface(size_t bytes) { + return IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); +} + +static NSData *build_blob(const float *w, int rows, int cols) { + int wsize = rows * cols * 2; + int total = 128 + wsize; + uint8_t *buf = (uint8_t*)calloc(total, 1); + buf[0] = 0x01; buf[4] = 0x02; + buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE; + buf[68] = 0x01; + *(uint32_t*)(buf+72) = wsize; + *(uint32_t*)(buf+80) = 128; + _Float16 *fp16 = (_Float16*)(buf + 128); + for (int i = 0; i < rows * cols; i++) fp16[i] = (_Float16)w[i]; + return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; +} + +// Build blob with TRANSPOSED weights: W[rows,cols] → W^T[cols,rows] +static NSData *build_blob_transposed(const float *w, int rows, int cols) { + int wsize = cols * rows * 2; + int total = 128 + wsize; + uint8_t *buf = (uint8_t*)calloc(total, 1); + buf[0] = 0x01; buf[4] = 0x02; + buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE; + buf[68] = 0x01; + *(uint32_t*)(buf+72) = wsize; + *(uint32_t*)(buf+80) = 128; + _Float16 *fp16 = (_Float16*)(buf + 128); + for (int i = 0; i < rows; i++) + for (int j = 0; j < cols; j++) + fp16[j * rows + i] = (_Float16)w[i * cols + j]; // transpose + return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; +} + +static NSString *gen_conv_mil(int in_ch, int out_ch, int sp) { + return [NSString stringWithFormat: + @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " + "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"9.0\"}})]\n{\n" + " func main(tensor x) {\n" + " string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n" + " tensor x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n" + " tensor W = const()[name = string(\"W\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n" + " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" + " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" + " int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n" + " tensor y16 = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n" + " string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n" + " tensor y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n" + " } -> (y);\n}\n", + in_ch, sp, in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp, out_ch, sp]; +} + +typedef struct { + id model; + IOSurfaceRef ioIn, ioOut; + id request; + NSString *tmpDir; +} Kern; + +static Kern *compile_kern_with_blob(NSData *blob, int in_ch, int out_ch, int sp) { + NSString *mil = gen_conv_mil(in_ch, out_ch, sp); + NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding]; + NSDictionary *wd = @{@"@model_path/weights/weight.bin":@{@"offset":@0,@"data":blob}}; + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), milData, wd, nil); + if (!desc) return NULL; + id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); + id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); + NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; + NSFileManager *fm = [NSFileManager defaultManager]; + [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil]; + [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + [blob writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; + NSError *e = nil; + if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) return NULL; + if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) return NULL; + size_t inB = in_ch * sp * 4, outB = out_ch * sp * 4; + IOSurfaceRef ioI = make_surface(inB), ioO = make_surface(outB); + id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI); + id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO); + id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wI], @[@0], @[wO], @[@0], nil, nil, @0); + Kern *k = calloc(1, sizeof(Kern)); + k->model = mdl; k->ioIn = ioI; k->ioOut = ioO; k->request = req; k->tmpDir = td; + return k; +} + +static void free_kern(Kern *k) { + if (!k) return; + NSError *e = nil; + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(k->model, @selector(unloadWithQoS:error:), 21, &e); + CFRelease(k->ioIn); CFRelease(k->ioOut); + [[NSFileManager defaultManager] removeItemAtPath:k->tmpDir error:nil]; + free(k); +} + +// ANE eval: input [S, in_ch] row-major ↔ [in_ch, S] channels-first +static void ane_eval(Kern *k, const float *in, float *out, int in_ch, int out_ch, int sp) { + float *tmp = (float*)malloc(in_ch * sp * sizeof(float)); + for (int t = 0; t < sp; t++) + for (int c = 0; c < in_ch; c++) + tmp[c*sp + t] = in[t*in_ch + c]; + IOSurfaceLock(k->ioIn, 0, NULL); + memcpy(IOSurfaceGetBaseAddress(k->ioIn), tmp, in_ch * sp * sizeof(float)); + IOSurfaceUnlock(k->ioIn, 0, NULL); + free(tmp); + NSError *e = nil; + ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + k->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, k->request, &e); + float *tmp2 = (float*)malloc(out_ch * sp * sizeof(float)); + IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL); + memcpy(tmp2, IOSurfaceGetBaseAddress(k->ioOut), out_ch * sp * sizeof(float)); + IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL); + for (int t = 0; t < sp; t++) + for (int c = 0; c < out_ch; c++) + out[t*out_ch + c] = tmp2[c*sp + t]; + free(tmp2); +} + +int main(int argc, char *argv[]) { + @autoreleasepool { + ane_init(); + mach_timebase_info_data_t tb; + mach_timebase_info(&tb); + + int D = 64, H = 128, S = 16; + int steps = 25; // 4 kernels × 25 = 100 compiles, under 119 limit + float lr = 0.5f; + int recompile_every = 1; // recompile every step for correct gradients + + float *W1 = (float*)malloc(H * D * sizeof(float)); + float *W2 = (float*)malloc(D * H * sizeof(float)); + for (int i = 0; i < H*D; i++) W1[i] = 0.01f * sinf(i * 1.3f + 0.7f); + for (int i = 0; i < D*H; i++) W2[i] = 0.01f * cosf(i * 0.9f + 1.1f); + + float *x = (float*)calloc(S * D, sizeof(float)); + float *y_target = (float*)calloc(S * D, sizeof(float)); + for (int t = 0; t < S; t++) + for (int i = 0; i < D; i++) { + float v = sinf((t * D + i) * 0.1f); + x[t*D + i] = v; + y_target[t*D + i] = v; + } + + printf("=== Tiny 2-Layer ANE Training (Forward + Backward on ANE) ===\n"); + printf("x:[%d,%d] → W1:[%d,%d] → ReLU → W2:[%d,%d] → y:[%d,%d]\n", S,D, H,D, D,H, S,D); + printf("Forward: ANE conv | Backward dx: ANE conv(W^T) | Backward dW: CPU\n"); + printf("Steps: %d, LR: %.4f, Recompile every %d steps\n\n", steps, lr, recompile_every); + + float *h = (float*)malloc(S * H * sizeof(float)); + float *h_relu = (float*)malloc(S * H * sizeof(float)); + float *y = (float*)malloc(S * D * sizeof(float)); + float *dy = (float*)malloc(S * D * sizeof(float)); + float *dh_relu = (float*)malloc(S * H * sizeof(float)); + float *dh = (float*)malloc(S * H * sizeof(float)); + float *dx_layer = (float*)malloc(S * D * sizeof(float)); // not used for update but proves backward works + float *dW1 = (float*)calloc(H * D, sizeof(float)); + float *dW2 = (float*)calloc(D * H, sizeof(float)); + + // 4 ANE kernels: 2 forward + 2 backward (transposed weights) + Kern *k1_fwd = NULL, *k2_fwd = NULL; // W1: [H,D]→conv(D→H), W2: [D,H]→conv(H→D) + Kern *k1_bwd = NULL, *k2_bwd = NULL; // W1^T: [D,H]→conv(H→D), W2^T: [H,D]→conv(D→H) + bool on_ane = true; + + printf("%-6s %-12s %-10s %-6s\n", "Step", "MSE Loss", "ms/step", "Backend"); + printf("--------------------------------------\n"); + + for (int step = 0; step < steps; step++) { + uint64_t t0 = mach_absolute_time(); + + if (on_ane && step % recompile_every == 0) { + free_kern(k1_fwd); free_kern(k2_fwd); + free_kern(k1_bwd); free_kern(k2_bwd); + k1_fwd = k2_fwd = k1_bwd = k2_bwd = NULL; + @autoreleasepool { + k1_fwd = compile_kern_with_blob(build_blob(W1, H, D), D, H, S); + k2_fwd = compile_kern_with_blob(build_blob(W2, D, H), H, D, S); + // Backward: dx = W^T @ dy → conv with transposed weight + // W2^T: [H,D] as conv weight, input dy [1,D,1,S] → output dh [1,H,1,S] + k2_bwd = compile_kern_with_blob(build_blob_transposed(W2, D, H), D, H, S); + // W1^T: [D,H] as conv weight, input dh [1,H,1,S] → output dx [1,D,1,S] + k1_bwd = compile_kern_with_blob(build_blob_transposed(W1, H, D), H, D, S); + } + if (!k1_fwd || !k2_fwd || !k1_bwd || !k2_bwd) { + printf("ANE limit at step %d, continuing on CPU\n", step); + free_kern(k1_fwd); free_kern(k2_fwd); + free_kern(k1_bwd); free_kern(k2_bwd); + k1_fwd = k2_fwd = k1_bwd = k2_bwd = NULL; + on_ane = false; + } + } + + if (on_ane) { + // === Forward on ANE === + ane_eval(k1_fwd, x, h, D, H, S); + for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0; + ane_eval(k2_fwd, h_relu, y, H, D, S); + } else { + for (int t = 0; t < S; t++) + for (int i = 0; i < H; i++) { + float s = 0; for (int j = 0; j < D; j++) s += W1[i*D+j] * x[t*D+j]; + h[t*H+i] = s; + } + for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0; + for (int t = 0; t < S; t++) + for (int i = 0; i < D; i++) { + float s = 0; for (int j = 0; j < H; j++) s += W2[i*H+j] * h_relu[t*H+j]; + y[t*D+i] = s; + } + } + + // MSE loss + dL/dy + float loss = 0; + for (int i = 0; i < S*D; i++) { + float diff = y[i] - y_target[i]; + loss += diff * diff; + dy[i] = 2.0f * diff / (S * D); + } + loss /= (S * D); + + if (on_ane) { + // === Backward dx on ANE === + // dh_relu = W2^T @ dy (ANE conv with transposed W2) + ane_eval(k2_bwd, dy, dh_relu, D, H, S); + // ReLU backward (CPU, element-wise) + for (int i = 0; i < S*H; i++) dh[i] = h[i] > 0 ? dh_relu[i] : 0; + // dx = W1^T @ dh (ANE conv with transposed W1) + ane_eval(k1_bwd, dh, dx_layer, H, D, S); + } else { + memset(dh_relu, 0, S * H * sizeof(float)); + for (int t = 0; t < S; t++) + for (int j = 0; j < H; j++) + for (int i = 0; i < D; i++) + dh_relu[t*H + j] += W2[i*H + j] * dy[t*D + i]; + for (int i = 0; i < S*H; i++) dh[i] = h[i] > 0 ? dh_relu[i] : 0; + } + + // dW on CPU (outer products — memory-bound, not worth ANE) + memset(dW2, 0, D * H * sizeof(float)); + for (int t = 0; t < S; t++) + for (int i = 0; i < D; i++) + for (int j = 0; j < H; j++) + dW2[i*H + j] += dy[t*D + i] * h_relu[t*H + j]; + memset(dW1, 0, H * D * sizeof(float)); + for (int t = 0; t < S; t++) + for (int i = 0; i < H; i++) + for (int j = 0; j < D; j++) + dW1[i*D + j] += dh[t*H + i] * x[t*D + j]; + + // SGD + for (int i = 0; i < H*D; i++) W1[i] -= lr * dW1[i]; + for (int i = 0; i < D*H; i++) W2[i] -= lr * dW2[i]; + + double ms = (double)(mach_absolute_time() - t0) * tb.numer / tb.denom / 1e6; + + if (step % 1 == 0 || step == steps - 1) + printf("%-6d %-12.6f %-10.1f %-6s\n", step, loss, ms, on_ane ? "ANE" : "CPU"); + + if (loss < 1e-6f) { printf("\nConverged at step %d!\n", step); break; } + } + + printf("\nFinal output vs target (first 8):\n"); + if (on_ane && k1_fwd && k2_fwd) { + ane_eval(k1_fwd, x, h, D, H, S); + for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0; + ane_eval(k2_fwd, h_relu, y, H, D, S); + } + printf(" y: "); for (int i = 0; i < 8; i++) printf("%.4f ", y[i]); printf("\n"); + printf(" target: "); for (int i = 0; i < 8; i++) printf("%.4f ", y_target[i]); printf("\n"); + + free_kern(k1_fwd); free_kern(k2_fwd); free_kern(k1_bwd); free_kern(k2_bwd); + free(W1); free(W2); free(x); free(y_target); + free(h); free(h_relu); free(y); free(dy); free(dh_relu); free(dh); free(dx_layer); free(dW1); free(dW2); + printf("\nDone.\n"); + } + return 0; +} diff --git a/training/train.m b/training/train.m new file mode 100644 index 0000000..6fd4a86 --- /dev/null +++ b/training/train.m @@ -0,0 +1,103 @@ +// train.m — Stories110M training loop on ANE +// Usage: ./train [seq_len] [steps] [lr] [--cpu] +#import +#import +#include +#include +#include +#include +#include "backward.h" + +static mach_timebase_info_data_t g_tb; +static double ticksToMs(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } + +int main(int argc, char *argv[]) { + @autoreleasepool { + mach_timebase_info(&g_tb); + + if (argc < 2) { + fprintf(stderr, "Usage: %s [seq_len=16] [steps=100] [lr=1e-4] [--cpu]\n", argv[0]); + return 1; + } + + int seq_len = argc > 2 ? atoi(argv[2]) : 16; + int steps = argc > 3 ? atoi(argv[3]) : 100; + float lr = argc > 4 ? atof(argv[4]) : 1e-4f; + bool use_ane = true; + for (int i = 1; i < argc; i++) + if (strcmp(argv[i], "--cpu") == 0) use_ane = false; + + printf("=== Stories110M ANE Training ===\n"); + printf("Seq len: %d, Steps: %d, LR: %.2e, Backend: %s\n\n", + seq_len, steps, lr, use_ane ? "ANE" : "CPU"); + + Model m = {0}; + printf("Loading weights...\n"); + if (model_load_weights(&m, argv[1]) != 0) return 1; + + if (use_ane) { + if (model_compile_kernels(&m, seq_len) != 0) { + fprintf(stderr, "ANE kernel compilation failed, falling back to CPU\n"); + use_ane = false; + } + } + if (!use_ane) m.seq_len = seq_len; + + model_alloc_training(&m); + + // Training tokens: simple repeating pattern to overfit on + int *train_tokens = (int*)malloc(seq_len * sizeof(int)); + for (int i = 0; i < seq_len; i++) + train_tokens[i] = (i * 7 + 13) % 256 + 1; + + printf("\nTraining tokens (first 16): "); + for (int i = 0; i < 16 && i < seq_len; i++) printf("%d ", train_tokens[i]); + printf("...\n\n"); + + printf("%-6s %-10s %-12s %-10s %-10s\n", "Step", "Loss", "GradNorm", "ms/step", "tok/s"); + printf("------------------------------------------------------\n"); + + int recompile_interval = 1; // Recompile ANE kernels every N steps + for (int step = 0; step < steps; step++) { + uint64_t t0 = mach_absolute_time(); + + float loss = model_forward(&m, train_tokens, use_ane); + if (isnan(loss) || isinf(loss)) { + printf("NaN/Inf loss at step %d, stopping.\n", step); + break; + } + + model_backward(&m, train_tokens); + model_clip_gradients(&m, 1.0f); + model_adam_step(&m, lr, 0.9f, 0.999f, 1e-8f); + + // Recompile ANE kernels with updated weights + if (use_ane && (step + 1) % recompile_interval == 0) { + if (model_recompile_kernels(&m) != 0) { + printf("Recompile failed at step %d, switching to CPU\n", step); + use_ane = false; + } + } + + double ms = ticksToMs(mach_absolute_time() - t0); + double tps = (seq_len - 1) / (ms / 1000.0); + + if (step % 10 == 0 || step == steps - 1) { + double gnorm = 0; + int d2 = m.cfg.dim; + for (int i = 0; i < d2*d2; i++) gnorm += (double)m.grad_wq[0][i]*m.grad_wq[0][i]; + gnorm = sqrt(gnorm); + printf("%-6d %-10.4f %-12.4f %-10.1f %-10.1f\n", step, loss, gnorm, ms, tps); + } + + if (loss < 0.01f) { + printf("\nConverged at step %d! Loss: %.6f\n", step, loss); + break; + } + } + + free(train_tokens); + printf("\nDone.\n"); + } + return 0; +} diff --git a/training/train_large.m b/training/train_large.m new file mode 100644 index 0000000..55c1cf8 --- /dev/null +++ b/training/train_large.m @@ -0,0 +1,1005 @@ +// train_large.m — Train a single transformer layer FULLY on ANE +// 7 ANE kernels per step: +// Forward: kFwdAttn (QKV+SDPA+Wo, taps Q,K,V,attn_out) + kFwdFFN (W1+W3+SiLU+W2, taps h1,h3,silu_out) +// Backward: kFFNBwd (W2^T+SiLU_bwd+W1^T+W3^T) + kSdpaBwd1 (Wo^T+SDPA) + kSdpaBwd2 + kQKVb (Wq^T+Wk^T+Wv^T) +// CPU: RMSNorm (fwd+bwd), residuals, loss, dW accumulation (cblas), SGD update +// NO CPU recompute of Q,K,V,h1,h3 — all exposed via forward taps +#import +#import +#import +#import +#import +#import +#import +#include +#include +#include + +#define DIM 768 +#define HIDDEN 2048 +#define HEADS 12 +#define HD (DIM/HEADS) +#define SEQ 512 +#define ACCUM_STEPS 100 +#define MAX_COMPILES 100 +#define NUM_KERNELS 6 +#define CKPT_PATH "/tmp/ane_large_ckpt.bin" + +static Class g_D, g_I, g_AR, g_AIO; +static mach_timebase_info_data_t g_tb; +static int g_compile_count = 0; + +static void ane_init(void) { + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + g_I = NSClassFromString(@"_ANEInMemoryModel"); + g_AR = NSClassFromString(@"_ANERequest"); + g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); +} +static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } +static IOSurfaceRef make_surface(size_t bytes) { + return IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); +} +static NSData *build_blob(const float *w, int rows, int cols) { + int ws=rows*cols*2, tot=128+ws; + uint8_t *b=(uint8_t*)calloc(tot,1); + b[0]=1;b[4]=2;b[64]=0xEF;b[65]=0xBE;b[66]=0xAD;b[67]=0xDE;b[68]=1; + *(uint32_t*)(b+72)=ws;*(uint32_t*)(b+80)=128; + _Float16 *fp16=(_Float16*)(b+128); + for(int i=0;i({{\"coremlc-component-MIL\", \"3510.2.1\"}, " \ + "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " \ + "{\"coremltools-version\", \"9.0\"}})]\n{\n" +#define CONV_CONST \ + " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" \ + " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" \ + " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" \ + " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" \ + " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" + +// SDPA forward + taps: x_in → rmsnorm → QKV+SDPA+Wo → concat(o_out, Q, K, V, attn_out, xnorm) fp16 +static NSString *gen_sdpa_fwd_taps(void) { + float sc = 1.0f/sqrtf((float)HD); + float invd = 1.0f/(float)DIM; + NSMutableString *m = [NSMutableString string]; + [m appendString:MIL_HDR]; + [m appendFormat:@" func main(tensor x) {\n", DIM, SEQ]; + // --- RMSNorm: x → xn --- + [m appendFormat:@" tensor sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ]; + [m appendFormat:@" tensor rax = const()[name=string(\"rax\"), val=tensor([1])];\n"]; + [m appendFormat:@" bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"]; + [m appendFormat:@" tensor ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ]; + [m appendFormat:@" fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", invd]; + [m appendFormat:@" tensor ss2 = mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ]; + [m appendFormat:@" fp16 eps = const()[name=string(\"eps\"), val=fp16(0.00001)];\n"]; + [m appendFormat:@" tensor ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ]; + [m appendFormat:@" fp16 nhalf = const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"]; + [m appendFormat:@" tensor rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ]; + [m appendFormat:@" tensor xr = mul(x=x,y=rrms)[name=string(\"xr\")];\n", DIM, SEQ]; + [m appendFormat:@" tensor rw = const()[name=string(\"rw\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/rms1.bin\"), offset=uint64(64)))];\n", DIM, DIM]; + [m appendFormat:@" tensor xn = mul(x=xr,y=rw)[name=string(\"xn\")];\n", DIM, SEQ]; + // --- QKV + SDPA + Wo (operates on xn) --- + [m appendString:@CONV_CONST]; + [m appendFormat:@" tensor Wq = const()[name=string(\"Wq\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wq.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor Wk = const()[name=string(\"Wk\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wk.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor Wv = const()[name=string(\"Wv\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wv.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor Wo = const()[name=string(\"Wo\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wo.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor qf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wq,x=xn)[name=string(\"cq\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor kf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wk,x=xn)[name=string(\"ck\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor vf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wv,x=xn)[name=string(\"cv\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor qsh = const()[name=string(\"qsh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,HD,SEQ]; + [m appendString:@" tensor pm = const()[name=string(\"pm\"), val=tensor([0,1,3,2])];\n"]; + [m appendFormat:@" tensor q4 = reshape(shape=qsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor q = transpose(perm=pm,x=q4)[name=string(\"tq\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor k4 = reshape(shape=qsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor k = transpose(perm=pm,x=k4)[name=string(\"tk\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor v4 = reshape(shape=qsh,x=vf)[name=string(\"rv\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor v = transpose(perm=pm,x=v4)[name=string(\"tv\")];\n", HEADS,SEQ,HD]; + [m appendString:@" bool tx = const()[name=string(\"tx\"), val=bool(false)];\n"]; + [m appendString:@" bool ty = const()[name=string(\"ty\"), val=bool(true)];\n"]; + [m appendFormat:@" tensor sc1 = matmul(transpose_x=tx,transpose_y=ty,x=q,y=k)[name=string(\"mm1\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc]; + [m appendFormat:@" tensor sc2 = mul(x=sc1,y=scv)[name=string(\"scl\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor cm = const()[name=string(\"cm\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/mask.bin\"), offset=uint64(64)))];\n", SEQ,SEQ,SEQ,SEQ]; + [m appendFormat:@" tensor ms = add(x=sc2,y=cm)[name=string(\"msk\")];\n", HEADS,SEQ,SEQ]; + [m appendString:@" int32 sax = const()[name=string(\"sax\"), val=int32(-1)];\n"]; + [m appendFormat:@" tensor aw = softmax(axis=sax,x=ms)[name=string(\"sm\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor a4 = matmul(transpose_x=tx,transpose_y=tx,x=aw,y=v)[name=string(\"mm2\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor at = transpose(perm=pm,x=a4)[name=string(\"ta\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor os = const()[name=string(\"os\"), val=tensor([1,%d,1,%d])];\n", DIM,SEQ]; + [m appendFormat:@" tensor af = reshape(shape=os,x=at)[name=string(\"ra\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor oo = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wo,x=af)[name=string(\"co\")];\n", DIM,SEQ]; + [m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"]; + [m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"]; + [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(oo,qf,kf,vf,af,xn))[name=string(\"cat\")];\n", 6*DIM,SEQ]; + [m appendString:@" } -> (out);\n}\n"]; + return m; +} + +// FFN forward + taps: x2 → rmsnorm → FFN → concat(ffn_out, h1, h3, silu_out, x2norm) fp16 +static NSString *gen_ffn_fwd_taps(void) { + float invd = 1.0f/(float)DIM; + NSMutableString *m = [NSMutableString string]; + [m appendString:MIL_HDR]; + [m appendFormat:@" func main(tensor x) {\n", DIM, SEQ]; + // --- RMSNorm: x → xn --- + [m appendFormat:@" tensor sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ]; + [m appendFormat:@" tensor rax = const()[name=string(\"rax\"), val=tensor([1])];\n"]; + [m appendFormat:@" bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"]; + [m appendFormat:@" tensor ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ]; + [m appendFormat:@" fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", invd]; + [m appendFormat:@" tensor ss2 = mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ]; + [m appendFormat:@" fp16 eps = const()[name=string(\"eps\"), val=fp16(0.00001)];\n"]; + [m appendFormat:@" tensor ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ]; + [m appendFormat:@" fp16 nhalf = const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"]; + [m appendFormat:@" tensor rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ]; + [m appendFormat:@" tensor xr = mul(x=x,y=rrms)[name=string(\"xr\")];\n", DIM, SEQ]; + [m appendFormat:@" tensor rw = const()[name=string(\"rw\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/rms2.bin\"), offset=uint64(64)))];\n", DIM, DIM]; + [m appendFormat:@" tensor xn = mul(x=xr,y=rw)[name=string(\"xn\")];\n", DIM, SEQ]; + // --- FFN (operates on xn) --- + [m appendString:@CONV_CONST]; + [m appendFormat:@" tensor W1 = const()[name=string(\"W1\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w1.bin\"), offset=uint64(64)))];\n", HIDDEN,DIM,HIDDEN,DIM]; + [m appendFormat:@" tensor W3 = const()[name=string(\"W3\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w3.bin\"), offset=uint64(64)))];\n", HIDDEN,DIM,HIDDEN,DIM]; + [m appendFormat:@" tensor W2 = const()[name=string(\"W2\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w2.bin\"), offset=uint64(64)))];\n", DIM,HIDDEN,DIM,HIDDEN]; + [m appendFormat:@" tensor h1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1,x=xn)[name=string(\"c1\")];\n", HIDDEN,SEQ]; + [m appendFormat:@" tensor h3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3,x=xn)[name=string(\"c3\")];\n", HIDDEN,SEQ]; + [m appendFormat:@" tensor sig = sigmoid(x=h1)[name=string(\"sg\")];\n", HIDDEN,SEQ]; + [m appendFormat:@" tensor silu = mul(x=h1,y=sig)[name=string(\"si\")];\n", HIDDEN,SEQ]; + [m appendFormat:@" tensor gate = mul(x=silu,y=h3)[name=string(\"gt\")];\n", HIDDEN,SEQ]; + [m appendFormat:@" tensor y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2,x=gate)[name=string(\"c2\")];\n", DIM,SEQ]; + [m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"]; + [m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"]; + [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(y,h1,h3,gate,xn))[name=string(\"cat\")];\n", 2*DIM+3*HIDDEN,SEQ]; + [m appendString:@" } -> (out);\n}\n"]; + return m; +} + +// Fused FFN backward: concat(dffn,h1,h3) → concat(dx,dh1,dh3) fp16 +static NSString *gen_ffn_bwd(void) { + NSMutableString *m = [NSMutableString string]; + [m appendString:MIL_HDR]; + [m appendFormat:@" func main(tensor x) {\n", DIM+2*HIDDEN, SEQ]; + [m appendString:@CONV_CONST]; + [m appendString:@" tensor bd = const()[name=string(\"bd\"), val=tensor([0,0,0,0])];\n"]; + [m appendFormat:@" tensor sd = const()[name=string(\"sd\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; + [m appendFormat:@" tensor dffn = slice_by_size(x=x,begin=bd,size=sd)[name=string(\"s0\")];\n", DIM, SEQ]; + [m appendFormat:@" tensor b1 = const()[name=string(\"b1\"), val=tensor([0,%d,0,0])];\n", DIM]; + [m appendFormat:@" tensor s1 = const()[name=string(\"s1\"), val=tensor([1,%d,1,%d])];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor h1 = slice_by_size(x=x,begin=b1,size=s1)[name=string(\"s1x\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor b3 = const()[name=string(\"b3\"), val=tensor([0,%d,0,0])];\n", DIM+HIDDEN]; + [m appendFormat:@" tensor h3 = slice_by_size(x=x,begin=b3,size=s1)[name=string(\"s3x\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor W2t = const()[name=string(\"W2t\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w2t.bin\"), offset=uint64(64)))];\n", HIDDEN, DIM, HIDDEN, DIM]; + [m appendFormat:@" tensor dsilu = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2t,x=dffn)[name=string(\"cw2\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor sig = sigmoid(x=h1)[name=string(\"sg\")];\n", HIDDEN, SEQ]; + [m appendString:@" fp16 one = const()[name=string(\"one\"), val=fp16(1.0)];\n"]; + [m appendFormat:@" tensor oms = sub(x=one,y=sig)[name=string(\"oms\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor homs = mul(x=h1,y=oms)[name=string(\"homs\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor brk = add(x=one,y=homs)[name=string(\"brk\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor dsd = mul(x=sig,y=brk)[name=string(\"dsd\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor t1 = mul(x=dsilu,y=h3)[name=string(\"t1\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor dh1 = mul(x=t1,y=dsd)[name=string(\"dh1\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor slh = mul(x=h1,y=sig)[name=string(\"slh\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor dh3 = mul(x=dsilu,y=slh)[name=string(\"dh3\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor W1t = const()[name=string(\"W1t\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w1t.bin\"), offset=uint64(64)))];\n", DIM, HIDDEN, DIM, HIDDEN]; + [m appendFormat:@" tensor W3t = const()[name=string(\"W3t\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w3t.bin\"), offset=uint64(64)))];\n", DIM, HIDDEN, DIM, HIDDEN]; + [m appendFormat:@" tensor dx1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1t,x=dh1)[name=string(\"cw1\")];\n", DIM, SEQ]; + [m appendFormat:@" tensor dx3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3t,x=dh3)[name=string(\"cw3\")];\n", DIM, SEQ]; + [m appendFormat:@" tensor dx = add(x=dx1,y=dx3)[name=string(\"adx\")];\n", DIM, SEQ]; + [m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"]; + [m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"]; + [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(dx,dh1,dh3))[name=string(\"cat\")];\n", DIM+2*HIDDEN, SEQ]; + [m appendString:@" } -> (out);\n}\n"]; + return m; +} + +// Fused QKV backward: concat(dq,dk,dv) → dx fp16 +static NSString *gen_qkvb(void) { + NSMutableString *m = [NSMutableString string]; + [m appendString:MIL_HDR]; + [m appendFormat:@" func main(tensor x) {\n", 3*DIM, SEQ]; + [m appendString:@CONV_CONST]; + [m appendFormat:@" tensor sz = const()[name=string(\"sz\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; + [m appendString:@" tensor b0 = const()[name=string(\"b0\"), val=tensor([0,0,0,0])];\n"]; + [m appendFormat:@" tensor dq = slice_by_size(x=x,begin=b0,size=sz)[name=string(\"s0\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor b1 = const()[name=string(\"b1\"), val=tensor([0,%d,0,0])];\n", DIM]; + [m appendFormat:@" tensor dk = slice_by_size(x=x,begin=b1,size=sz)[name=string(\"s1\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor b2 = const()[name=string(\"b2\"), val=tensor([0,%d,0,0])];\n", 2*DIM]; + [m appendFormat:@" tensor dv = slice_by_size(x=x,begin=b2,size=sz)[name=string(\"s2\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor Wqt = const()[name=string(\"Wqt\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wqt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor Wkt = const()[name=string(\"Wkt\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wkt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor Wvt = const()[name=string(\"Wvt\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wvt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor dxq = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wqt,x=dq)[name=string(\"cq\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor dxk = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wkt,x=dk)[name=string(\"ck\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor dxv = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wvt,x=dv)[name=string(\"cv\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor dxqk = add(x=dxq,y=dxk)[name=string(\"aqk\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor out = add(x=dxqk,y=dxv)[name=string(\"out\")];\n", DIM,SEQ]; + [m appendString:@" } -> (out);\n}\n"]; + return m; +} + +// SDPA backward part 1 + Wo^T: concat(Q,K,V,dx2) → Wo^T(dx2) → concat(dV, probs_flat, dp_flat) fp16 +// SCORE_CH: channels needed for flattened attention scores [HEADS,SEQ,SEQ] → [HEADS*SEQ, SEQ] +#define SCORE_CH (HEADS*SEQ) + +static NSString *gen_sdpa_bwd1(void) { + float sc = 1.0f/sqrtf((float)HD); + NSMutableString *m = [NSMutableString string]; + [m appendString:MIL_HDR]; + [m appendFormat:@" func main(tensor x) {\n", 4*DIM, SEQ]; + [m appendString:@CONV_CONST]; + [m appendFormat:@" tensor sz = const()[name=string(\"sz\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; + [m appendString:@" tensor b0 = const()[name=string(\"b0\"), val=tensor([0,0,0,0])];\n"]; + [m appendFormat:@" tensor qf = slice_by_size(x=x,begin=b0,size=sz)[name=string(\"s0\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor b1 = const()[name=string(\"b1\"), val=tensor([0,%d,0,0])];\n", DIM]; + [m appendFormat:@" tensor kf = slice_by_size(x=x,begin=b1,size=sz)[name=string(\"s1\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor b2 = const()[name=string(\"b2\"), val=tensor([0,%d,0,0])];\n", 2*DIM]; + [m appendFormat:@" tensor vf = slice_by_size(x=x,begin=b2,size=sz)[name=string(\"s2\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor b3 = const()[name=string(\"b3\"), val=tensor([0,%d,0,0])];\n", 3*DIM]; + [m appendFormat:@" tensor dx2f = slice_by_size(x=x,begin=b3,size=sz)[name=string(\"s3\")];\n", DIM,SEQ]; + // Wo^T backward: dx2 → dattn + [m appendFormat:@" tensor Wot = const()[name=string(\"Wot\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wot.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor df = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wot,x=dx2f)[name=string(\"cwo\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor rsh = const()[name=string(\"rsh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,HD,SEQ]; + [m appendString:@" tensor pm = const()[name=string(\"pm\"), val=tensor([0,1,3,2])];\n"]; + [m appendFormat:@" tensor qr = reshape(shape=rsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor q = transpose(perm=pm,x=qr)[name=string(\"tq\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor kr = reshape(shape=rsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor k = transpose(perm=pm,x=kr)[name=string(\"tk\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor vr = reshape(shape=rsh,x=vf)[name=string(\"rv\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor v = transpose(perm=pm,x=vr)[name=string(\"tv\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor dr = reshape(shape=rsh,x=df)[name=string(\"rd\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor da = transpose(perm=pm,x=dr)[name=string(\"td\")];\n", HEADS,SEQ,HD]; + [m appendString:@" bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"]; + [m appendString:@" bool bT = const()[name=string(\"bT\"), val=bool(true)];\n"]; + [m appendFormat:@" tensor sc1 = matmul(transpose_x=bF,transpose_y=bT,x=q,y=k)[name=string(\"mm1\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc]; + [m appendFormat:@" tensor sc2 = mul(x=sc1,y=scv)[name=string(\"scl\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor cm = const()[name=string(\"cm\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/mask.bin\"), offset=uint64(64)))];\n", SEQ,SEQ,SEQ,SEQ]; + [m appendFormat:@" tensor ms = add(x=sc2,y=cm)[name=string(\"msk\")];\n", HEADS,SEQ,SEQ]; + [m appendString:@" int32 sax = const()[name=string(\"sax\"), val=int32(-1)];\n"]; + [m appendFormat:@" tensor probs = softmax(axis=sax,x=ms)[name=string(\"sm\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor dv4 = matmul(transpose_x=bT,transpose_y=bF,x=probs,y=da)[name=string(\"dv\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor dp4 = matmul(transpose_x=bF,transpose_y=bT,x=da,y=v)[name=string(\"dp\")];\n", HEADS,SEQ,SEQ]; + // Flatten dv back to [1,DIM,1,SEQ] + [m appendFormat:@" tensor dvt = transpose(perm=pm,x=dv4)[name=string(\"dvt\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor dvs = const()[name=string(\"dvs\"), val=tensor([1,%d,1,%d])];\n", DIM,SEQ]; + [m appendFormat:@" tensor dvf = reshape(shape=dvs,x=dvt)[name=string(\"dvf\")];\n", DIM,SEQ]; + // Flatten probs [1,H,S,S] → [1,H*S,1,S] and dp [1,H,S,S] → [1,H*S,1,S] + [m appendFormat:@" tensor scs = const()[name=string(\"scs\"), val=tensor([1,%d,1,%d])];\n", SCORE_CH,SEQ]; + [m appendFormat:@" tensor pf = reshape(shape=scs,x=probs)[name=string(\"pf\")];\n", SCORE_CH,SEQ]; + [m appendFormat:@" tensor dpf = reshape(shape=scs,x=dp4)[name=string(\"dpf\")];\n", SCORE_CH,SEQ]; + [m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"]; + [m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"]; + [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(dvf,pf,dpf))[name=string(\"cat\")];\n", DIM+2*SCORE_CH,SEQ]; + [m appendString:@" } -> (out);\n}\n"]; + return m; +} + +// SDPA backward part 2: concat(probs[SCORE_CH],dp[SCORE_CH],Q[DIM],K[DIM]) → concat(dQ,dK) fp16 +static NSString *gen_sdpa_bwd2(void) { + float sc = 1.0f/sqrtf((float)HD); + int bwd2_in = 2*SCORE_CH + 2*DIM; + NSMutableString *m = [NSMutableString string]; + [m appendString:MIL_HDR]; + [m appendFormat:@" func main(tensor x) {\n", bwd2_in, SEQ]; + // Slice probs + [m appendFormat:@" tensor sz_sc = const()[name=string(\"szsc\"), val=tensor([1,%d,1,%d])];\n", SCORE_CH, SEQ]; + [m appendString:@" tensor b0 = const()[name=string(\"b0\"), val=tensor([0,0,0,0])];\n"]; + [m appendFormat:@" tensor pf = slice_by_size(x=x,begin=b0,size=sz_sc)[name=string(\"s0\")];\n", SCORE_CH,SEQ]; + // Slice dp + [m appendFormat:@" tensor b1 = const()[name=string(\"b1\"), val=tensor([0,%d,0,0])];\n", SCORE_CH]; + [m appendFormat:@" tensor dpf = slice_by_size(x=x,begin=b1,size=sz_sc)[name=string(\"s1\")];\n", SCORE_CH,SEQ]; + // Slice Q + [m appendFormat:@" tensor sz_d = const()[name=string(\"szd\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; + [m appendFormat:@" tensor b2 = const()[name=string(\"b2\"), val=tensor([0,%d,0,0])];\n", 2*SCORE_CH]; + [m appendFormat:@" tensor qf = slice_by_size(x=x,begin=b2,size=sz_d)[name=string(\"s2\")];\n", DIM,SEQ]; + // Slice K + [m appendFormat:@" tensor b3 = const()[name=string(\"b3\"), val=tensor([0,%d,0,0])];\n", 2*SCORE_CH+DIM]; + [m appendFormat:@" tensor kf = slice_by_size(x=x,begin=b3,size=sz_d)[name=string(\"s3\")];\n", DIM,SEQ]; + // Reshape to multi-head + [m appendFormat:@" tensor ssh = const()[name=string(\"ssh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor probs = reshape(shape=ssh,x=pf)[name=string(\"rp\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor dp = reshape(shape=ssh,x=dpf)[name=string(\"rdp\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor rsh = const()[name=string(\"rsh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,HD,SEQ]; + [m appendString:@" tensor pm = const()[name=string(\"pm\"), val=tensor([0,1,3,2])];\n"]; + [m appendFormat:@" tensor qr = reshape(shape=rsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor q = transpose(perm=pm,x=qr)[name=string(\"tq\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor kr = reshape(shape=rsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor k = transpose(perm=pm,x=kr)[name=string(\"tk\")];\n", HEADS,SEQ,HD]; + // Softmax grad: ds = probs * (dp - sum(probs*dp)) * scale + [m appendFormat:@" tensor pdp = mul(x=probs,y=dp)[name=string(\"pdp\")];\n", HEADS,SEQ,SEQ]; + [m appendString:@" tensor rax = const()[name=string(\"rax\"), val=tensor([-1])];\n"]; + [m appendString:@" bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"]; + [m appendFormat:@" tensor spdp = reduce_sum(x=pdp,axes=rax,keep_dims=kd)[name=string(\"rs\")];\n", HEADS,SEQ]; + [m appendFormat:@" tensor dps = sub(x=dp,y=spdp)[name=string(\"dps\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor ds0 = mul(x=probs,y=dps)[name=string(\"ds0\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc]; + [m appendFormat:@" tensor ds = mul(x=ds0,y=scv)[name=string(\"ds\")];\n", HEADS,SEQ,SEQ]; + [m appendString:@" bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"]; + [m appendString:@" bool bT = const()[name=string(\"bT\"), val=bool(true)];\n"]; + [m appendFormat:@" tensor dq4 = matmul(transpose_x=bF,transpose_y=bF,x=ds,y=k)[name=string(\"dq\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor dk4 = matmul(transpose_x=bT,transpose_y=bF,x=ds,y=q)[name=string(\"dk\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor dqt = transpose(perm=pm,x=dq4)[name=string(\"dqt\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor dkt = transpose(perm=pm,x=dk4)[name=string(\"dkt\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor fs = const()[name=string(\"fs\"), val=tensor([1,%d,1,%d])];\n", DIM,SEQ]; + [m appendFormat:@" tensor dqf = reshape(shape=fs,x=dqt)[name=string(\"dqf\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor dkf = reshape(shape=fs,x=dkt)[name=string(\"dkf\")];\n", DIM,SEQ]; + [m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"]; + [m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"]; + [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(dqf,dkf))[name=string(\"cat\")];\n", 2*DIM,SEQ]; + [m appendString:@" } -> (out);\n}\n"]; + return m; +} + +// ===== Weight builders ===== +static NSData *g_mask_blob = nil; +static NSData *get_mask_blob(void) { + if (!g_mask_blob) { + _Float16 *mask = (_Float16*)calloc(SEQ*SEQ, sizeof(_Float16)); + for(int t=0;tmodel = CFBridgingRetain(mdl); + k->ioIn = make_surface(ic_bytes); + k->ioOut = make_surface(oc_bytes); + id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), k->ioIn); + id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), k->ioOut); + k->request = CFBridgingRetain(((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wI], @[@0], @[wO], @[@0], nil, nil, @0)); + k->tmpDir = CFBridgingRetain(td); + return k; + } +} +static void free_kern(Kern *k) { + if (!k) return; + id mdl = (__bridge id)k->model; NSError *e = nil; + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); + CFRelease(k->ioIn); CFRelease(k->ioOut); + [[NSFileManager defaultManager] removeItemAtPath:(__bridge id)k->tmpDir error:nil]; + CFRelease(k->model); CFRelease(k->request); CFRelease(k->tmpDir); + free(k); +} +static void ane_eval(Kern *k) { + id mdl = (__bridge id)k->model; id req = (__bridge id)k->request; NSError *e = nil; + ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); +} + +// ===== Vectorized conversion helpers (NEON) ===== +#include +static void cvt_f16_f32(float *dst, const _Float16 *src, int n) { + int i = 0; + for (; i+7 < n; i += 8) { + float16x8_t h = vld1q_f16((const __fp16*)(src+i)); + vst1q_f32(dst+i, vcvt_f32_f16(vget_low_f16(h))); + vst1q_f32(dst+i+4, vcvt_f32_f16(vget_high_f16(h))); + } + for (; i < n; i++) dst[i] = (float)src[i]; +} +static void cvt_f32_f16(_Float16 *dst, const float *src, int n) { + int i = 0; + for (; i+7 < n; i += 8) { + float16x8_t h = vcombine_f16(vcvt_f16_f32(vld1q_f32(src+i)), + vcvt_f16_f32(vld1q_f32(src+i+4))); + vst1q_f16((__fp16*)(dst+i), h); + } + for (; i < n; i++) dst[i] = (_Float16)src[i]; +} + +// ===== IOSurface I/O helpers (channel-first, no transpose) ===== +// All CPU buffers are [C,S] channel-first matching IOSurface [1,C,1,S] +// Write fp32 [C,S] → fp16 [1,C,1,S] (just type conversion, no transpose) +static void io_write_fp16(IOSurfaceRef s, const float *data, int channels, int sp) { + IOSurfaceLock(s, 0, NULL); + _Float16 *dst = (_Float16*)IOSurfaceGetBaseAddress(s); + cvt_f32_f16(dst, data, channels * sp); + IOSurfaceUnlock(s, 0, NULL); +} +// Write fp32 [C,S] → fp32 [1,C,1,S] (just memcpy) +static void io_write_fp32(IOSurfaceRef s, const float *data, int channels, int sp) { + IOSurfaceLock(s, 0, NULL); + memcpy(IOSurfaceGetBaseAddress(s), data, channels * sp * sizeof(float)); + IOSurfaceUnlock(s, 0, NULL); +} +// Read fp16 [1,C,1,S] → fp32 [C,S] at channel offset (just type conversion) +static void io_read_fp16(IOSurfaceRef s, float *data, int ch_off, int channels, int sp) { + IOSurfaceLock(s, kIOSurfaceLockReadOnly, NULL); + _Float16 *src = (_Float16*)IOSurfaceGetBaseAddress(s) + ch_off * sp; + cvt_f16_f32(data, src, channels * sp); + IOSurfaceUnlock(s, kIOSurfaceLockReadOnly, NULL); +} +// Read fp32 [1,C,1,S] → fp32 [C,S] (just memcpy) +static void io_read_fp32(IOSurfaceRef s, float *data, int channels, int sp) { + IOSurfaceLock(s, kIOSurfaceLockReadOnly, NULL); + memcpy(data, IOSurfaceGetBaseAddress(s), channels * sp * sizeof(float)); + IOSurfaceUnlock(s, kIOSurfaceLockReadOnly, NULL); +} +// Write multiple fp32 [C,S] arrays concatenated along channel dim as fp16 +static void io_write_multi_fp16(IOSurfaceRef s, int sp, int n, ...) { + IOSurfaceLock(s, 0, NULL); + _Float16 *dst = (_Float16*)IOSurfaceGetBaseAddress(s); + va_list ap; va_start(ap, n); + int ch_off = 0; + for (int i=0; im); free(s->v); } +static void adam_update(float *w, const float *g, AdamState *s, int t, float lr, float b1, float b2, float eps) { + float bc1 = 1.0f - powf(b1, t), bc2 = 1.0f - powf(b2, t); + for (size_t i=0; in; i++) { + s->m[i] = b1*s->m[i] + (1-b1)*g[i]; + s->v[i] = b2*s->v[i] + (1-b2)*g[i]*g[i]; + float mh = s->m[i]/bc1, vh = s->v[i]/bc2; + w[i] -= lr * mh / (sqrtf(vh) + eps); + } +} + +int main(int argc, char *argv[]) { + @autoreleasepool { + setbuf(stdout, NULL); + ane_init(); + mach_timebase_info(&g_tb); + + int total_steps = 400; + float lr = 1e-3f; + float adam_b1=0.9f, adam_b2=0.999f, adam_eps=1e-8f; + int adam_t = 0; + int start_step = 0; + + size_t wq_sz = DIM*DIM, wo_sz = DIM*DIM; + size_t w1_sz = HIDDEN*DIM, w2_sz = DIM*HIDDEN, w3_sz = HIDDEN*DIM; + size_t total_params = 4*wq_sz + w1_sz + w2_sz + w3_sz; + + float *Wq=malloc(wq_sz*4), *Wk=malloc(wq_sz*4), *Wv=malloc(wq_sz*4), *Wo=malloc(wo_sz*4); + float *W1=malloc(w1_sz*4), *W2=malloc(w2_sz*4), *W3=malloc(w3_sz*4); + float *rms1_w=malloc(DIM*4), *rms2_w=malloc(DIM*4); + + // Adam optimizer states (m and v for each weight) + AdamState aWq=adam_alloc(wq_sz), aWk=adam_alloc(wq_sz), aWv=adam_alloc(wq_sz), aWo=adam_alloc(wo_sz); + AdamState aW1=adam_alloc(w1_sz), aW2=adam_alloc(w2_sz), aW3=adam_alloc(w3_sz); + AdamState arms1=adam_alloc(DIM), arms2=adam_alloc(DIM); + + double cum_compile=0, cum_train=0, cum_wall=0; + int cum_steps=0, cum_batches=0; + + bool resuming = false; + if (argc > 1 && strcmp(argv[1], "--resume") == 0) { + FILE *f = fopen(CKPT_PATH, "rb"); + if (f) { + CkptHdr h; fread(&h, sizeof(h), 1, f); + start_step=h.step; total_steps=h.total_steps; lr=h.lr; + cum_compile=h.cum_compile; cum_train=h.cum_train; cum_wall=h.cum_wall; + cum_steps=h.cum_steps; cum_batches=h.cum_batches; adam_t=h.adam_t; + fread(Wq,4,wq_sz,f); fread(Wk,4,wq_sz,f); fread(Wv,4,wq_sz,f); fread(Wo,4,wo_sz,f); + fread(W1,4,w1_sz,f); fread(W2,4,w2_sz,f); fread(W3,4,w3_sz,f); + fread(rms1_w,4,DIM,f); fread(rms2_w,4,DIM,f); + // Adam state + fread(aWq.m,4,wq_sz,f);fread(aWq.v,4,wq_sz,f); + fread(aWk.m,4,wq_sz,f);fread(aWk.v,4,wq_sz,f); + fread(aWv.m,4,wq_sz,f);fread(aWv.v,4,wq_sz,f); + fread(aWo.m,4,wo_sz,f);fread(aWo.v,4,wo_sz,f); + fread(aW1.m,4,w1_sz,f);fread(aW1.v,4,w1_sz,f); + fread(aW2.m,4,w2_sz,f);fread(aW2.v,4,w2_sz,f); + fread(aW3.m,4,w3_sz,f);fread(aW3.v,4,w3_sz,f); + fread(arms1.m,4,DIM,f);fread(arms1.v,4,DIM,f); + fread(arms2.m,4,DIM,f);fread(arms2.v,4,DIM,f); + fclose(f); + resuming = true; + printf("[RESUMED step %d, loss=%.6f]\n", start_step, h.loss); + } + } + if (!resuming) { + srand48(42); + float scale_d=1.0f/sqrtf(DIM), scale_h=1.0f/sqrtf(HIDDEN); + for(size_t i=0;i