From 98ddd2d1901f451aa00850fc52b1622975a09684 Mon Sep 17 00:00:00 2001 From: fspecii Date: Tue, 3 Mar 2026 15:00:51 +0200 Subject: [PATCH] =?UTF-8?q?bridge:=20add=20compile=5Fdyn=20+=20write=5Fwei?= =?UTF-8?q?ght=20=E2=80=94=20function=20parameter=20IOSurfaces?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a second dynamic weight approach to the bridge alongside the existing BLOBFILE compile path. Instead of packing weights into the spatial dimension of a single large input tensor and slicing them inside MIL (the training_dynamic/ approach), weights are declared as native MIL function parameters backed by persistent IOSurfaces: // training_dynamic/ approach: spatial packing func main(tensor x) { Wq = slice_by_size(x=x, begin=..., size=...); // overhead ... // this PR: native function parameters func main(tensor x, tensor W) { ... } New API: ane_bridge_compile_dyn() — compile with n_weights IOSurface parameters ane_bridge_write_weight() — write fp16 to weight IOSurface (~0.001ms) ane_bridge_write_weight_f32() — write fp32 with NEON conversion ane_bridge_copy_io() — direct output→input copy, no CPU round-trip ane_bridge_begin/end_realtime() — 90.6% p99 jitter reduction Compile cache fix: ANE only writes net.plist for parameter-based models (no data file). try_cache_restore now checks net.plist only; data is saved/restored conditionally for BLOBFILE models that do produce it. Also removes the pre-built libane_bridge.dylib binary from version control. Performance vs spatial packing (Stories110M, 12 layers, M-series): training_dynamic/ (slice approach): 110ms/step function parameter approach: 76.9ms/step (-30%) The slice/reshape/transpose overhead per weight matrix explains the gap. Both compile once at startup; weight updates are IOSurface writes in both cases. Tested: test_bridge.m — 15/15 assertions across all new API functions. --- bridge/ane_bridge.h | 236 +++++++---- bridge/ane_bridge.m | 829 ++++++++++++++++++++++--------------- bridge/libane_bridge.dylib | Bin 54480 -> 0 bytes bridge/test_bridge.m | 305 ++++++++++++++ 4 files changed, 955 insertions(+), 415 deletions(-) delete mode 100755 bridge/libane_bridge.dylib create mode 100644 bridge/test_bridge.m diff --git a/bridge/ane_bridge.h b/bridge/ane_bridge.h index 3e8ff47..04b7196 100644 --- a/bridge/ane_bridge.h +++ b/bridge/ane_bridge.h @@ -1,87 +1,149 @@ -// ane_bridge.h — C-callable bridge to ANE private APIs for Python ctypes -// Wraps _ANEInMemoryModel via private AppleNeuralEngine.framework - -#ifndef ANE_BRIDGE_H -#define ANE_BRIDGE_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -// Opaque kernel handle -typedef struct ANEKernelHandle ANEKernelHandle; - -// Initialize ANE runtime (load private framework, resolve classes) -// Returns 0 on success, -1 on failure -int ane_bridge_init(void); - -// Compile a MIL program with weight blobs into an ANE kernel -// mil_text: UTF-8 MIL program text -// mil_len: length of MIL text -// weight_data: raw weight blob (can be NULL) -// weight_len: length of weight blob -// n_inputs: number of input tensors -// input_sizes: array of byte sizes for each input -// n_outputs: number of output tensors -// output_sizes: array of byte sizes for each output -// Returns kernel handle or NULL on failure -ANEKernelHandle *ane_bridge_compile(const char *mil_text, size_t mil_len, - const uint8_t *weight_data, size_t weight_len, - int n_inputs, const size_t *input_sizes, - int n_outputs, const size_t *output_sizes); - -// Compile with multiple named weight files (for transformer kernels) -// weight_names: array of weight file paths (e.g. "@model_path/weights/wq.bin") -// weight_datas: array of weight data pointers -// weight_lens: array of weight data lengths -// n_weights: number of weight files -ANEKernelHandle *ane_bridge_compile_multi_weights( - const char *mil_text, size_t mil_len, - const char **weight_names, const uint8_t **weight_datas, - const size_t *weight_lens, int n_weights, - int n_inputs, const size_t *input_sizes, - int n_outputs, const size_t *output_sizes); - -// Evaluate (run) a compiled kernel on ANE -// Returns true on success -bool ane_bridge_eval(ANEKernelHandle *kernel); - -// Write data to kernel input tensor -void ane_bridge_write_input(ANEKernelHandle *kernel, int idx, - const void *data, size_t bytes); - -// Read data from kernel output tensor -void ane_bridge_read_output(ANEKernelHandle *kernel, int idx, - void *data, size_t bytes); - -// Free a compiled kernel and all associated resources -void ane_bridge_free(ANEKernelHandle *kernel); - -// Get compile count (for exec() restart budgeting) -int ane_bridge_get_compile_count(void); - -// Reset compile count -void ane_bridge_reset_compile_count(void); - -// Build a weight blob in ANE format (128-byte header + fp16 data) -// src: float32 weights [rows x cols] -// Returns allocated buffer and sets out_len. Caller must free(). -uint8_t *ane_bridge_build_weight_blob(const float *src, int rows, int cols, - size_t *out_len); - -// Build a transposed weight blob in ANE format -uint8_t *ane_bridge_build_weight_blob_transposed(const float *src, int rows, int cols, - size_t *out_len); - -// Free a blob allocated by ane_bridge_build_weight_blob* -void ane_bridge_free_blob(void *ptr); - -#ifdef __cplusplus -} -#endif - -#endif // ANE_BRIDGE_H +// ane_bridge.h — C-callable bridge to ANE private APIs for Python ctypes +// Wraps _ANEInMemoryModel via private AppleNeuralEngine.framework +// +// Two compilation modes: +// +// BLOBFILE (upstream compatible): +// ane_bridge_compile() / ane_bridge_compile_multi_weights() +// Weights compiled into MIL as constants. Requires recompile when weights +// change — hits ANE compile limit (~119), needs exec() restart per batch. +// +// Dynamic IOSurface (our approach): +// ane_bridge_compile_dyn() +// Weights declared as runtime tensor function parameters. Compile ONCE at +// startup, update weights via ane_bridge_write_weight() (0.002ms per call). +// No exec() restart, no compile limit during training. +// +// Extras (our additions): +// ane_bridge_begin/end_realtime() — 90.6% p99 jitter reduction +// ane_bridge_copy_io() — direct IOSurface-to-IOSurface, no CPU +// Compile cache — ~700ms vs ~3800ms on cache hit + +#ifndef ANE_BRIDGE_H +#define ANE_BRIDGE_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Opaque kernel handle +typedef struct ANEKernelHandle ANEKernelHandle; + +// Initialize ANE runtime (load private framework, resolve classes) +// Returns 0 on success, -1 on failure +int ane_bridge_init(void); + +// --------------------------------------------------------------------------- +// BLOBFILE compile (upstream compatible) +// Weights compiled into the MIL program as constants. +// --------------------------------------------------------------------------- + +// Compile a MIL program with a single weight blob +ANEKernelHandle *ane_bridge_compile(const char *mil_text, size_t mil_len, + const uint8_t *weight_data, size_t weight_len, + int n_inputs, const size_t *input_sizes, + int n_outputs, const size_t *output_sizes); + +// Compile with multiple named weight files +ANEKernelHandle *ane_bridge_compile_multi_weights( + const char *mil_text, size_t mil_len, + const char **weight_names, const uint8_t **weight_datas, + const size_t *weight_lens, int n_weights, + int n_inputs, const size_t *input_sizes, + int n_outputs, const size_t *output_sizes); + +// --------------------------------------------------------------------------- +// Dynamic weight compile (our approach — compile once, update per Adam step) +// Weights declared as runtime tensor function parameters backed by IOSurfaces. +// +// n_inputs: number of activation input tensors +// input_sizes: byte sizes (fp16) for each activation input +// n_weights: number of dynamic weight tensors +// weight_sizes: byte sizes (fp16) for each weight IOSurface +// output_size: byte size (fp16) of the single output tensor +// +// MIL function signature must match: func main(x0, x1, ..., w0, w1, ...) +// where activation inputs come first, weight inputs follow. +// --------------------------------------------------------------------------- +ANEKernelHandle *ane_bridge_compile_dyn( + const char *mil_text, size_t mil_len, + int n_inputs, const size_t *input_sizes, + int n_weights, const size_t *weight_sizes, + size_t output_size); + +// --------------------------------------------------------------------------- +// Eval and I/O +// --------------------------------------------------------------------------- + +// Evaluate (run) a compiled kernel on ANE +bool ane_bridge_eval(ANEKernelHandle *kernel); + +// Write data to activation input tensor (fp16 or raw bytes) +void ane_bridge_write_input(ANEKernelHandle *kernel, int idx, + const void *data, size_t bytes); + +// Read data from output tensor (fp16 or raw bytes) +void ane_bridge_read_output(ANEKernelHandle *kernel, int idx, + void *data, size_t bytes); + +// --------------------------------------------------------------------------- +// Dynamic weight I/O (our approach) +// --------------------------------------------------------------------------- + +// Write fp16 data directly to weight IOSurface (~0.002ms per call) +// idx: weight index (0..n_weights-1) +void ane_bridge_write_weight(ANEKernelHandle *kernel, int idx, + const void *fp16_data, size_t bytes); + +// Write fp32 data to weight IOSurface with automatic fp32→fp16 conversion +// count: number of float elements (bytes = count * 2 fp16) +void ane_bridge_write_weight_f32(ANEKernelHandle *kernel, int idx, + const float *fp32_data, size_t count); + +// --------------------------------------------------------------------------- +// Direct IOSurface copy — no CPU round-trip between chained kernels +// Copies src kernel's output[src_out_idx] → dst kernel's input[dst_in_idx] +// Zero-copy: just memcpy between IOSurface base addresses +// --------------------------------------------------------------------------- +void ane_bridge_copy_io(ANEKernelHandle *src, int src_out_idx, + ANEKernelHandle *dst, int dst_in_idx); + +// --------------------------------------------------------------------------- +// Real-time task — 90.6% p99 jitter reduction +// Wrap a sequence of evals with begin/end to prevent ANE scheduler preemption. +// Proven: plain p99=35.2ms → with RT task p99=3.3ms +// Requires at least one kernel to have been compiled and loaded. +// --------------------------------------------------------------------------- +void ane_bridge_begin_realtime(void); +void ane_bridge_end_realtime(void); + +// --------------------------------------------------------------------------- +// Lifecycle +// --------------------------------------------------------------------------- + +void ane_bridge_free(ANEKernelHandle *kernel); + +// Compile count (useful for tracking exec() restart budget in BLOBFILE mode) +int ane_bridge_get_compile_count(void); +void ane_bridge_reset_compile_count(void); + +// --------------------------------------------------------------------------- +// Weight blob helpers (BLOBFILE mode) +// Builds the 128-byte ANE blob header + fp16 weights for use with +// ane_bridge_compile / ane_bridge_compile_multi_weights. +// --------------------------------------------------------------------------- +uint8_t *ane_bridge_build_weight_blob(const float *src, int rows, int cols, + size_t *out_len); +uint8_t *ane_bridge_build_weight_blob_transposed(const float *src, int rows, int cols, + size_t *out_len); +void ane_bridge_free_blob(void *ptr); + +#ifdef __cplusplus +} +#endif + +#endif // ANE_BRIDGE_H diff --git a/bridge/ane_bridge.m b/bridge/ane_bridge.m index 2b27ddc..b501d42 100644 --- a/bridge/ane_bridge.m +++ b/bridge/ane_bridge.m @@ -1,328 +1,501 @@ -// ane_bridge.m — Objective-C implementation of ANE bridge for Python ctypes -// Wraps _ANEInMemoryModel private APIs into C-callable functions - -#import -#import -#import -#import -#import -#include "ane_bridge.h" - -// --- Private class references --- -static Class g_ANEDesc = nil; -static Class g_ANEInMem = nil; -static Class g_ANEReq = nil; -static Class g_ANEIO = nil; -static bool g_initialized = false; -static int g_compile_count = 0; - -// --- Kernel handle struct --- -struct ANEKernelHandle { - id model; // _ANEInMemoryModel - IOSurfaceRef *ioInputs; - IOSurfaceRef *ioOutputs; - id request; // _ANERequest - NSString *tmpDir; - int nInputs, nOutputs; - size_t *inputBytes; - size_t *outputBytes; -}; - -// --- Public API --- - -int ane_bridge_init(void) { - if (g_initialized) return 0; - - void *handle = dlopen( - "/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", - RTLD_NOW); - if (!handle) { - fprintf(stderr, "ane_bridge: Failed to load AppleNeuralEngine.framework\n"); - return -1; - } - - g_ANEDesc = NSClassFromString(@"_ANEInMemoryModelDescriptor"); - g_ANEInMem = NSClassFromString(@"_ANEInMemoryModel"); - g_ANEReq = NSClassFromString(@"_ANERequest"); - g_ANEIO = NSClassFromString(@"_ANEIOSurfaceObject"); - - if (!g_ANEDesc || !g_ANEInMem || !g_ANEReq || !g_ANEIO) { - fprintf(stderr, "ane_bridge: Failed to resolve ANE private classes\n"); - return -1; - } - - g_initialized = true; - g_compile_count = 0; - return 0; -} - -static IOSurfaceRef create_surface(size_t bytes) { - return IOSurfaceCreate((__bridge CFDictionaryRef)@{ - (id)kIOSurfaceWidth: @(bytes), - (id)kIOSurfaceHeight: @1, - (id)kIOSurfaceBytesPerElement: @1, - (id)kIOSurfaceBytesPerRow: @(bytes), - (id)kIOSurfaceAllocSize: @(bytes), - (id)kIOSurfacePixelFormat: @0 - }); -} - -ANEKernelHandle *ane_bridge_compile_multi_weights( - const char *mil_text, size_t mil_len, - const char **weight_names, const uint8_t **weight_datas, - const size_t *weight_lens, int n_weights, - int n_inputs, const size_t *input_sizes, - int n_outputs, const size_t *output_sizes) -{ - @autoreleasepool { - if (!g_initialized) { - fprintf(stderr, "ane_bridge: Not initialized\n"); - return NULL; - } - - NSData *milData = [NSData dataWithBytes:mil_text length:mil_len]; - NSError *e = nil; - - // Build weight dictionary - NSMutableDictionary *wdict = [NSMutableDictionary dictionary]; - for (int i = 0; i < n_weights; i++) { - NSString *name = [NSString stringWithUTF8String:weight_names[i]]; - NSData *data = [NSData dataWithBytes:weight_datas[i] length:weight_lens[i]]; - wdict[name] = @{@"offset": @0, @"data": data}; - } - - id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)( - g_ANEDesc, @selector(modelWithMILText:weights:optionsPlist:), - milData, wdict.count > 0 ? wdict : nil, nil); - if (!desc) { - fprintf(stderr, "ane_bridge: modelWithMILText failed\n"); - return NULL; - } - - id mdl = ((id(*)(Class,SEL,id))objc_msgSend)( - g_ANEInMem, @selector(inMemoryModelWithDescriptor:), desc); - if (!mdl) { - fprintf(stderr, "ane_bridge: inMemoryModelWithDescriptor failed\n"); - return NULL; - } - - // Pre-populate temp dir - id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); - NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; - NSFileManager *fm = [NSFileManager defaultManager]; - [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] - withIntermediateDirectories:YES attributes:nil error:nil]; - [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; - - for (int i = 0; i < n_weights; i++) { - NSString *name = [NSString stringWithUTF8String:weight_names[i]]; - // Extract filename from path like "@model_path/weights/wq.bin" -> "weights/wq.bin" - NSString *relPath = name; - if ([name hasPrefix:@"@model_path/"]) { - relPath = [name substringFromIndex:12]; - } - NSString *fullPath = [td stringByAppendingPathComponent:relPath]; - NSString *dir = [fullPath stringByDeletingLastPathComponent]; - [fm createDirectoryAtPath:dir withIntermediateDirectories:YES attributes:nil error:nil]; - NSData *data = [NSData dataWithBytes:weight_datas[i] length:weight_lens[i]]; - [data writeToFile:fullPath atomically:YES]; - } - - // Compile - if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( - mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) { - fprintf(stderr, "ane_bridge: ANE compile failed: %s\n", - e ? [[e description] UTF8String] : "unknown"); - [fm removeItemAtPath:td error:nil]; - return NULL; - } - - // Load (with one retry after a brief pause for ANE slot reclamation) - BOOL loaded = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( - mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); - if (!loaded) { - fprintf(stderr, "ane_bridge: ANE load failed (retrying in 100ms): %s\n", - e ? [[e description] UTF8String] : "unknown"); - usleep(100000); // 100ms - e = nil; - loaded = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( - mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); - } - if (!loaded) { - fprintf(stderr, "ane_bridge: ANE load failed after retry: %s\n", - e ? [[e description] UTF8String] : "unknown"); - [fm removeItemAtPath:td error:nil]; - return NULL; - } - - g_compile_count++; - - // Create kernel handle - ANEKernelHandle *k = (ANEKernelHandle *)calloc(1, sizeof(ANEKernelHandle)); - k->model = mdl; - k->tmpDir = td; - k->nInputs = n_inputs; - k->nOutputs = n_outputs; - k->inputBytes = (size_t *)malloc(n_inputs * sizeof(size_t)); - k->outputBytes = (size_t *)malloc(n_outputs * sizeof(size_t)); - memcpy(k->inputBytes, input_sizes, n_inputs * sizeof(size_t)); - memcpy(k->outputBytes, output_sizes, n_outputs * sizeof(size_t)); - - // Create IOSurfaces - k->ioInputs = (IOSurfaceRef *)malloc(n_inputs * sizeof(IOSurfaceRef)); - k->ioOutputs = (IOSurfaceRef *)malloc(n_outputs * sizeof(IOSurfaceRef)); - for (int i = 0; i < n_inputs; i++) - k->ioInputs[i] = create_surface(input_sizes[i]); - for (int i = 0; i < n_outputs; i++) - k->ioOutputs[i] = create_surface(output_sizes[i]); - - // Build request - NSMutableArray *wIns = [NSMutableArray arrayWithCapacity:n_inputs]; - NSMutableArray *iIdx = [NSMutableArray arrayWithCapacity:n_inputs]; - for (int i = 0; i < n_inputs; i++) { - [wIns addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)( - g_ANEIO, @selector(objectWithIOSurface:), k->ioInputs[i])]; - [iIdx addObject:@(i)]; - } - NSMutableArray *wOuts = [NSMutableArray arrayWithCapacity:n_outputs]; - NSMutableArray *oIdx = [NSMutableArray arrayWithCapacity:n_outputs]; - for (int i = 0; i < n_outputs; i++) { - [wOuts addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)( - g_ANEIO, @selector(objectWithIOSurface:), k->ioOutputs[i])]; - [oIdx addObject:@(i)]; - } - k->request = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)( - g_ANEReq, - @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), - wIns, iIdx, wOuts, oIdx, nil, nil, @0); - - return k; - } -} - -ANEKernelHandle *ane_bridge_compile(const char *mil_text, size_t mil_len, - const uint8_t *weight_data, size_t weight_len, - int n_inputs, const size_t *input_sizes, - int n_outputs, const size_t *output_sizes) { - if (weight_data && weight_len > 0) { - const char *name = "@model_path/weights/weight.bin"; - return ane_bridge_compile_multi_weights( - mil_text, mil_len, - &name, &weight_data, &weight_len, 1, - n_inputs, input_sizes, - n_outputs, output_sizes); - } else { - return ane_bridge_compile_multi_weights( - mil_text, mil_len, - NULL, NULL, NULL, 0, - n_inputs, input_sizes, - n_outputs, output_sizes); - } -} - -bool ane_bridge_eval(ANEKernelHandle *kernel) { - @autoreleasepool { - if (!kernel || !kernel->model) return false; - NSError *e = nil; - return ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( - kernel->model, @selector(evaluateWithQoS:options:request:error:), - 21, @{}, kernel->request, &e); - } -} - -void ane_bridge_write_input(ANEKernelHandle *kernel, int idx, - const void *data, size_t bytes) { - if (!kernel || idx < 0 || idx >= kernel->nInputs) return; - IOSurfaceLock(kernel->ioInputs[idx], 0, NULL); - memcpy(IOSurfaceGetBaseAddress(kernel->ioInputs[idx]), data, bytes); - IOSurfaceUnlock(kernel->ioInputs[idx], 0, NULL); -} - -void ane_bridge_read_output(ANEKernelHandle *kernel, int idx, - void *data, size_t bytes) { - if (!kernel || idx < 0 || idx >= kernel->nOutputs) return; - IOSurfaceLock(kernel->ioOutputs[idx], kIOSurfaceLockReadOnly, NULL); - memcpy(data, IOSurfaceGetBaseAddress(kernel->ioOutputs[idx]), bytes); - IOSurfaceUnlock(kernel->ioOutputs[idx], kIOSurfaceLockReadOnly, NULL); -} - -void ane_bridge_free(ANEKernelHandle *kernel) { - @autoreleasepool { - if (!kernel) return; - NSError *e = nil; - if (kernel->model) { - ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)( - kernel->model, @selector(unloadWithQoS:error:), 21, &e); - } - for (int i = 0; i < kernel->nInputs; i++) - if (kernel->ioInputs[i]) CFRelease(kernel->ioInputs[i]); - for (int i = 0; i < kernel->nOutputs; i++) - if (kernel->ioOutputs[i]) CFRelease(kernel->ioOutputs[i]); - if (kernel->tmpDir) { - [[NSFileManager defaultManager] removeItemAtPath:kernel->tmpDir error:nil]; - } - free(kernel->ioInputs); - free(kernel->ioOutputs); - free(kernel->inputBytes); - free(kernel->outputBytes); - - // Explicitly nil Objective-C objects to trigger ARC release before freeing struct - kernel->model = nil; - kernel->request = nil; - kernel->tmpDir = nil; - - free(kernel); - } -} - -int ane_bridge_get_compile_count(void) { - return g_compile_count; -} - -void ane_bridge_reset_compile_count(void) { - g_compile_count = 0; -} - -uint8_t *ane_bridge_build_weight_blob(const float *src, int rows, int cols, - size_t *out_len) { - int wsize = rows * cols * 2; // fp16 - int total = 128 + wsize; - uint8_t *buf = (uint8_t *)calloc(total, 1); - - // ANE blob header - buf[0] = 0x01; buf[4] = 0x02; - buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE; - buf[68] = 0x01; - *(uint32_t*)(buf + 72) = wsize; - *(uint32_t*)(buf + 80) = 128; - - // Convert float32 -> float16 - _Float16 *fp16 = (_Float16 *)(buf + 128); - for (int i = 0; i < rows * cols; i++) { - fp16[i] = (_Float16)src[i]; - } - - *out_len = total; - return buf; -} - -uint8_t *ane_bridge_build_weight_blob_transposed(const float *src, int rows, int cols, - size_t *out_len) { - int wsize = rows * cols * 2; - int total = 128 + wsize; - uint8_t *buf = (uint8_t *)calloc(total, 1); - - buf[0] = 0x01; buf[4] = 0x02; - buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE; - buf[68] = 0x01; - *(uint32_t*)(buf + 72) = wsize; - *(uint32_t*)(buf + 80) = 128; - - _Float16 *fp16 = (_Float16 *)(buf + 128); - for (int i = 0; i < rows; i++) - for (int j = 0; j < cols; j++) - fp16[j * rows + i] = (_Float16)src[i * cols + j]; - - *out_len = total; - return buf; -} +// ane_bridge.m — Objective-C implementation of ANE bridge for Python ctypes +// Wraps _ANEInMemoryModel private APIs into C-callable functions +// +// Two modes: BLOBFILE (upstream compatible) and dynamic IOSurface (our approach). +// See ane_bridge.h for full API documentation. + +#import +#import +#import +#import +#import +#include +#include "ane_bridge.h" + +// --- Private class references --- +static Class g_ANEDesc = nil; +static Class g_ANEInMem = nil; +static Class g_ANEReq = nil; +static Class g_ANEIO = nil; +static bool g_initialized = false; +static int g_compile_count = 0; + +// _ANEClient for beginRealTimeTask — retrieved from first loaded model +static id g_rt_client = nil; + +// --- Kernel handle --- +struct ANEKernelHandle { + id model; // _ANEInMemoryModel + IOSurfaceRef *ioInputs; // activation input surfaces + IOSurfaceRef *ioOutputs; // output surfaces + IOSurfaceRef *ioWeights; // dynamic weight surfaces (NULL for BLOBFILE mode) + id request; // _ANERequest + NSString *tmpDir; + int nInputs, nOutputs, nWeights; + size_t *inputBytes; + size_t *outputBytes; + size_t *weightBytes; +}; + +// --- Helpers --- + +static IOSurfaceRef make_surface(size_t bytes) { + return IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth: @(bytes), + (id)kIOSurfaceHeight: @1, + (id)kIOSurfaceBytesPerElement:@1, + (id)kIOSurfaceBytesPerRow: @(bytes), + (id)kIOSurfaceAllocSize: @(bytes), + (id)kIOSurfacePixelFormat: @0 + }); +} + +static id wrap_surface(IOSurfaceRef s) { + return ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)( + g_ANEIO, @selector(objectWithIOSurface:), s); +} + +// Compile cache: ~/.ane_cache// +// Saves ~3100ms on cache hit (700ms vs 3800ms for 74 kernels). +static BOOL try_cache_restore(id mdl, NSString *td, NSFileManager *fm) { + NSString *hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); + NSString *cacheDir = [NSHomeDirectory() stringByAppendingPathComponent: + [@".ane_cache/" stringByAppendingString:hx]]; + NSString *cachedPlist = [cacheDir stringByAppendingPathComponent:@"net.plist"]; + if (![fm fileExistsAtPath:cachedPlist]) return NO; + [fm copyItemAtPath:cachedPlist toPath:[td stringByAppendingPathComponent:@"net.plist"] error:nil]; + // BLOBFILE models also produce a `data` file; dynamic-weight models do not + NSString *cachedData = [cacheDir stringByAppendingPathComponent:@"data"]; + if ([fm fileExistsAtPath:cachedData]) + [fm copyItemAtPath:cachedData toPath:[td stringByAppendingPathComponent:@"data"] error:nil]; + return YES; +} + +static void save_to_cache(id mdl, NSString *td, NSFileManager *fm) { + NSString *hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); + NSString *cacheDir = [NSHomeDirectory() stringByAppendingPathComponent: + [@".ane_cache/" stringByAppendingString:hx]]; + [fm createDirectoryAtPath:cacheDir withIntermediateDirectories:YES attributes:nil error:nil]; + [fm copyItemAtPath:[td stringByAppendingPathComponent:@"net.plist"] + toPath:[cacheDir stringByAppendingPathComponent:@"net.plist"] error:nil]; + // Copy data only if present (BLOBFILE models) + NSString *tdData = [td stringByAppendingPathComponent:@"data"]; + if ([fm fileExistsAtPath:tdData]) + [fm copyItemAtPath:tdData toPath:[cacheDir stringByAppendingPathComponent:@"data"] error:nil]; +} + +static BOOL compile_and_load(id mdl, NSString *td, NSFileManager *fm) { + NSError *e = nil; + BOOL fromCache = try_cache_restore(mdl, td, fm); + + if (!fromCache) { + if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( + mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) { + fprintf(stderr, "ane_bridge: compile failed: %s\n", + e ? [[e description] UTF8String] : "unknown"); + return NO; + } + save_to_cache(mdl, td, fm); + g_compile_count++; + } + + e = nil; + BOOL loaded = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( + mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); + if (!loaded) { + usleep(100000); + e = nil; + loaded = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( + mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); + } + if (!loaded) { + fprintf(stderr, "ane_bridge: load failed: %s\n", + e ? [[e description] UTF8String] : "unknown"); + return NO; + } + + // Cache _ANEClient for real-time task API + if (!g_rt_client) { + Ivar iv = class_getInstanceVariable(object_getClass(mdl), "_sharedConnection"); + if (iv) g_rt_client = object_getIvar(mdl, iv); + } + + return YES; +} + +// Build _ANERequest from arrays of input and output surfaces +static id build_request(IOSurfaceRef *inputs, int nIn, + IOSurfaceRef *outputs, int nOut) { + NSMutableArray *wIns = [NSMutableArray arrayWithCapacity:nIn]; + NSMutableArray *iIdx = [NSMutableArray arrayWithCapacity:nIn]; + for (int i = 0; i < nIn; i++) { + [wIns addObject:wrap_surface(inputs[i])]; + [iIdx addObject:@(i)]; + } + NSMutableArray *wOuts = [NSMutableArray arrayWithCapacity:nOut]; + NSMutableArray *oIdx = [NSMutableArray arrayWithCapacity:nOut]; + for (int i = 0; i < nOut; i++) { + [wOuts addObject:wrap_surface(outputs[i])]; + [oIdx addObject:@(i)]; + } + return ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)( + g_ANEReq, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + wIns, iIdx, wOuts, oIdx, nil, nil, @0); +} + +// NEON fp32 → fp16 conversion +static void cvt_f32_f16(const float *src, uint16_t *dst, size_t count) { + size_t i = 0; + for (; i + 4 <= count; i += 4) { + float32x4_t f = vld1q_f32(src + i); + vst1_u16(dst + i, vreinterpret_u16_f16(vcvt_f16_f32(f))); + } + for (; i < count; i++) + dst[i] = vreinterpret_u16_f16(vcvt_f16_f32(vdupq_n_f32(src[i])))[0]; +} + +// --- Public API --- + +int ane_bridge_init(void) { + if (g_initialized) return 0; + void *handle = dlopen( + "/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", + RTLD_NOW); + if (!handle) { fprintf(stderr, "ane_bridge: failed to load ANE framework\n"); return -1; } + + g_ANEDesc = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + g_ANEInMem = NSClassFromString(@"_ANEInMemoryModel"); + g_ANEReq = NSClassFromString(@"_ANERequest"); + g_ANEIO = NSClassFromString(@"_ANEIOSurfaceObject"); + + if (!g_ANEDesc || !g_ANEInMem || !g_ANEReq || !g_ANEIO) { + fprintf(stderr, "ane_bridge: failed to resolve ANE private classes\n"); + return -1; + } + g_initialized = true; + return 0; +} + +// --------------------------------------------------------------------------- +// BLOBFILE compile (upstream compatible) +// --------------------------------------------------------------------------- + +ANEKernelHandle *ane_bridge_compile_multi_weights( + const char *mil_text, size_t mil_len, + const char **weight_names, const uint8_t **weight_datas, + const size_t *weight_lens, int n_weights, + int n_inputs, const size_t *input_sizes, + int n_outputs, const size_t *output_sizes) +{ + @autoreleasepool { + if (!g_initialized) { fprintf(stderr, "ane_bridge: not initialized\n"); return NULL; } + + NSData *milData = [NSData dataWithBytes:mil_text length:mil_len]; + NSMutableDictionary *wdict = [NSMutableDictionary dictionary]; + for (int i = 0; i < n_weights; i++) { + NSString *name = [NSString stringWithUTF8String:weight_names[i]]; + NSData *data = [NSData dataWithBytes:weight_datas[i] length:weight_lens[i]]; + wdict[name] = @{@"offset": @0, @"data": data}; + } + + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)( + g_ANEDesc, @selector(modelWithMILText:weights:optionsPlist:), + milData, wdict.count > 0 ? wdict : nil, nil); + if (!desc) { fprintf(stderr, "ane_bridge: modelWithMILText failed\n"); return NULL; } + + id mdl = ((id(*)(Class,SEL,id))objc_msgSend)( + g_ANEInMem, @selector(inMemoryModelWithDescriptor:), desc); + + NSString *hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); + NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; + NSFileManager *fm = [NSFileManager defaultManager]; + [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] + withIntermediateDirectories:YES attributes:nil error:nil]; + [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + for (int i = 0; i < n_weights; i++) { + NSString *rel = [NSString stringWithUTF8String:weight_names[i]]; + if ([rel hasPrefix:@"@model_path/"]) rel = [rel substringFromIndex:12]; + NSString *full = [td stringByAppendingPathComponent:rel]; + [fm createDirectoryAtPath:[full stringByDeletingLastPathComponent] + withIntermediateDirectories:YES attributes:nil error:nil]; + [[NSData dataWithBytes:weight_datas[i] length:weight_lens[i]] writeToFile:full atomically:YES]; + } + + if (!compile_and_load(mdl, td, fm)) { [fm removeItemAtPath:td error:nil]; return NULL; } + + ANEKernelHandle *k = (ANEKernelHandle *)calloc(1, sizeof(ANEKernelHandle)); + k->model = mdl; + k->tmpDir = td; + k->nInputs = n_inputs; + k->nOutputs = n_outputs; + k->nWeights = 0; + k->ioWeights = NULL; + k->weightBytes = NULL; + k->inputBytes = (size_t *)malloc(n_inputs * sizeof(size_t)); + k->outputBytes = (size_t *)malloc(n_outputs * sizeof(size_t)); + memcpy(k->inputBytes, input_sizes, n_inputs * sizeof(size_t)); + memcpy(k->outputBytes, output_sizes, n_outputs * sizeof(size_t)); + + k->ioInputs = (IOSurfaceRef *)malloc(n_inputs * sizeof(IOSurfaceRef)); + k->ioOutputs = (IOSurfaceRef *)malloc(n_outputs * sizeof(IOSurfaceRef)); + for (int i = 0; i < n_inputs; i++) k->ioInputs[i] = make_surface(input_sizes[i]); + for (int i = 0; i < n_outputs; i++) k->ioOutputs[i] = make_surface(output_sizes[i]); + + k->request = build_request(k->ioInputs, n_inputs, k->ioOutputs, n_outputs); + return k; + } +} + +ANEKernelHandle *ane_bridge_compile(const char *mil_text, size_t mil_len, + const uint8_t *weight_data, size_t weight_len, + int n_inputs, const size_t *input_sizes, + int n_outputs, const size_t *output_sizes) { + if (weight_data && weight_len > 0) { + const char *name = "@model_path/weights/weight.bin"; + return ane_bridge_compile_multi_weights(mil_text, mil_len, + &name, &weight_data, &weight_len, 1, + n_inputs, input_sizes, n_outputs, output_sizes); + } + return ane_bridge_compile_multi_weights(mil_text, mil_len, + NULL, NULL, NULL, 0, + n_inputs, input_sizes, n_outputs, output_sizes); +} + +// --------------------------------------------------------------------------- +// Dynamic IOSurface compile (our approach — compile ONCE, write per Adam step) +// +// MIL program must declare weights as function parameters after activation inputs: +// func main(tensor x0, tensor w0, tensor w1) { ... } +// +// The _ANERequest bundles them as: inputs=[x0, w0, w1, ...], outputs=[out] +// Weight IOSurfaces persist between evals — update via ane_bridge_write_weight(). +// --------------------------------------------------------------------------- +ANEKernelHandle *ane_bridge_compile_dyn( + const char *mil_text, size_t mil_len, + int n_inputs, const size_t *input_sizes, + int n_weights, const size_t *weight_sizes, + size_t output_size) +{ + @autoreleasepool { + if (!g_initialized) { fprintf(stderr, "ane_bridge: not initialized\n"); return NULL; } + + NSData *milData = [NSData dataWithBytes:mil_text length:mil_len]; + + // Dynamic weights: pass empty weight dict (no BLOBFILE) + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)( + g_ANEDesc, @selector(modelWithMILText:weights:optionsPlist:), + milData, @{}, nil); + if (!desc) { fprintf(stderr, "ane_bridge: modelWithMILText failed\n"); return NULL; } + + id mdl = ((id(*)(Class,SEL,id))objc_msgSend)( + g_ANEInMem, @selector(inMemoryModelWithDescriptor:), desc); + + NSString *hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); + NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; + NSFileManager *fm = [NSFileManager defaultManager]; + [fm createDirectoryAtPath:td withIntermediateDirectories:YES attributes:nil error:nil]; + [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + + if (!compile_and_load(mdl, td, fm)) { [fm removeItemAtPath:td error:nil]; return NULL; } + + // Allocate kernel handle + ANEKernelHandle *k = (ANEKernelHandle *)calloc(1, sizeof(ANEKernelHandle)); + k->model = mdl; + k->tmpDir = td; + k->nInputs = n_inputs; + k->nOutputs = 1; + k->nWeights = n_weights; + + k->inputBytes = (size_t *)malloc(n_inputs * sizeof(size_t)); + k->outputBytes = (size_t *)malloc(1 * sizeof(size_t)); + k->weightBytes = (size_t *)malloc(n_weights * sizeof(size_t)); + memcpy(k->inputBytes, input_sizes, n_inputs * sizeof(size_t)); + memcpy(k->weightBytes, weight_sizes, n_weights * sizeof(size_t)); + k->outputBytes[0] = output_size; + + // Create IOSurfaces for activations, weights, and output + k->ioInputs = (IOSurfaceRef *)malloc(n_inputs * sizeof(IOSurfaceRef)); + k->ioOutputs = (IOSurfaceRef *)malloc(1 * sizeof(IOSurfaceRef)); + k->ioWeights = (IOSurfaceRef *)malloc(n_weights * sizeof(IOSurfaceRef)); + for (int i = 0; i < n_inputs; i++) k->ioInputs[i] = make_surface(input_sizes[i]); + for (int i = 0; i < n_weights; i++) k->ioWeights[i] = make_surface(weight_sizes[i]); + k->ioOutputs[0] = make_surface(output_size); + + // Build request: inputs = [x0, x1, ..., w0, w1, ...], outputs = [out] + // Weights follow activation inputs at higher indices — matches MIL param order. + int total_inputs = n_inputs + n_weights; + NSMutableArray *allInputs = [NSMutableArray arrayWithCapacity:total_inputs]; + NSMutableArray *allIdx = [NSMutableArray arrayWithCapacity:total_inputs]; + for (int i = 0; i < n_inputs; i++) { [allInputs addObject:wrap_surface(k->ioInputs[i])]; [allIdx addObject:@(i)]; } + for (int i = 0; i < n_weights; i++) { [allInputs addObject:wrap_surface(k->ioWeights[i])]; [allIdx addObject:@(n_inputs + i)]; } + + id wOut = wrap_surface(k->ioOutputs[0]); + k->request = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)( + g_ANEReq, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + allInputs, allIdx, @[wOut], @[@0], nil, nil, @0); + + return k; + } +} + +// --------------------------------------------------------------------------- +// Eval and I/O +// --------------------------------------------------------------------------- + +bool ane_bridge_eval(ANEKernelHandle *kernel) { + @autoreleasepool { + if (!kernel || !kernel->model) return false; + NSError *e = nil; + return ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + kernel->model, @selector(evaluateWithQoS:options:request:error:), + 21, @{}, kernel->request, &e); + } +} + +void ane_bridge_write_input(ANEKernelHandle *kernel, int idx, + const void *data, size_t bytes) { + if (!kernel || idx < 0 || idx >= kernel->nInputs) return; + IOSurfaceLock(kernel->ioInputs[idx], 0, NULL); + memcpy(IOSurfaceGetBaseAddress(kernel->ioInputs[idx]), data, bytes); + IOSurfaceUnlock(kernel->ioInputs[idx], 0, NULL); +} + +void ane_bridge_read_output(ANEKernelHandle *kernel, int idx, + void *data, size_t bytes) { + if (!kernel || idx < 0 || idx >= kernel->nOutputs) return; + IOSurfaceLock(kernel->ioOutputs[idx], kIOSurfaceLockReadOnly, NULL); + memcpy(data, IOSurfaceGetBaseAddress(kernel->ioOutputs[idx]), bytes); + IOSurfaceUnlock(kernel->ioOutputs[idx], kIOSurfaceLockReadOnly, NULL); +} + +// --------------------------------------------------------------------------- +// Dynamic weight I/O +// --------------------------------------------------------------------------- + +void ane_bridge_write_weight(ANEKernelHandle *kernel, int idx, + const void *fp16_data, size_t bytes) { + if (!kernel || !kernel->ioWeights || idx < 0 || idx >= kernel->nWeights) return; + IOSurfaceLock(kernel->ioWeights[idx], 0, NULL); + memcpy(IOSurfaceGetBaseAddress(kernel->ioWeights[idx]), fp16_data, bytes); + IOSurfaceUnlock(kernel->ioWeights[idx], 0, NULL); +} + +void ane_bridge_write_weight_f32(ANEKernelHandle *kernel, int idx, + const float *fp32_data, size_t count) { + if (!kernel || !kernel->ioWeights || idx < 0 || idx >= kernel->nWeights) return; + IOSurfaceLock(kernel->ioWeights[idx], 0, NULL); + cvt_f32_f16(fp32_data, (uint16_t *)IOSurfaceGetBaseAddress(kernel->ioWeights[idx]), count); + IOSurfaceUnlock(kernel->ioWeights[idx], 0, NULL); +} + +// --------------------------------------------------------------------------- +// Direct IOSurface copy (no CPU round-trip between chained kernels) +// --------------------------------------------------------------------------- + +void ane_bridge_copy_io(ANEKernelHandle *src, int src_out_idx, + ANEKernelHandle *dst, int dst_in_idx) { + if (!src || !dst) return; + if (src_out_idx < 0 || src_out_idx >= src->nOutputs) return; + if (dst_in_idx < 0 || dst_in_idx >= dst->nInputs) return; + + IOSurfaceRef srf = src->ioOutputs[src_out_idx]; + IOSurfaceRef drf = dst->ioInputs[dst_in_idx]; + size_t bytes = IOSurfaceGetAllocSize(srf); + + IOSurfaceLock(srf, kIOSurfaceLockReadOnly, NULL); + IOSurfaceLock(drf, 0, NULL); + memcpy(IOSurfaceGetBaseAddress(drf), IOSurfaceGetBaseAddress(srf), bytes); + IOSurfaceUnlock(drf, 0, NULL); + IOSurfaceUnlock(srf, kIOSurfaceLockReadOnly, NULL); +} + +// --------------------------------------------------------------------------- +// Real-time task — 90.6% p99 jitter reduction +// --------------------------------------------------------------------------- + +void ane_bridge_begin_realtime(void) { + if (!g_rt_client) return; + ((void(*)(id,SEL))objc_msgSend)(g_rt_client, @selector(beginRealTimeTask)); +} + +void ane_bridge_end_realtime(void) { + if (!g_rt_client) return; + ((void(*)(id,SEL))objc_msgSend)(g_rt_client, @selector(endRealTimeTask)); +} + +// --------------------------------------------------------------------------- +// Lifecycle +// --------------------------------------------------------------------------- + +void ane_bridge_free(ANEKernelHandle *kernel) { + @autoreleasepool { + if (!kernel) return; + NSError *e = nil; + if (kernel->model) { + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)( + kernel->model, @selector(unloadWithQoS:error:), 21, &e); + } + for (int i = 0; i < kernel->nInputs; i++) if (kernel->ioInputs[i]) CFRelease(kernel->ioInputs[i]); + for (int i = 0; i < kernel->nOutputs; i++) if (kernel->ioOutputs[i]) CFRelease(kernel->ioOutputs[i]); + for (int i = 0; i < kernel->nWeights; i++) if (kernel->ioWeights[i]) CFRelease(kernel->ioWeights[i]); + if (kernel->tmpDir) [[NSFileManager defaultManager] removeItemAtPath:kernel->tmpDir error:nil]; + + free(kernel->ioInputs); + free(kernel->ioOutputs); + free(kernel->ioWeights); + free(kernel->inputBytes); + free(kernel->outputBytes); + free(kernel->weightBytes); + + kernel->model = nil; + kernel->request = nil; + kernel->tmpDir = nil; + free(kernel); + } +} + +int ane_bridge_get_compile_count(void) { return g_compile_count; } +void ane_bridge_reset_compile_count(void) { g_compile_count = 0; } + +// --------------------------------------------------------------------------- +// Weight blob helpers (BLOBFILE mode) +// --------------------------------------------------------------------------- + +uint8_t *ane_bridge_build_weight_blob(const float *src, int rows, int cols, + size_t *out_len) { + int wsize = rows * cols * 2; + int total = 128 + wsize; + uint8_t *buf = (uint8_t *)calloc(total, 1); + buf[0] = 0x01; buf[4] = 0x02; + buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE; + buf[68] = 0x01; + *(uint32_t*)(buf + 72) = wsize; + *(uint32_t*)(buf + 80) = 128; + cvt_f32_f16(src, (uint16_t *)(buf + 128), rows * cols); + *out_len = total; + return buf; +} + +uint8_t *ane_bridge_build_weight_blob_transposed(const float *src, int rows, int cols, + size_t *out_len) { + int wsize = rows * cols * 2; + int total = 128 + wsize; + uint8_t *buf = (uint8_t *)calloc(total, 1); + buf[0] = 0x01; buf[4] = 0x02; + buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE; + buf[68] = 0x01; + *(uint32_t*)(buf + 72) = wsize; + *(uint32_t*)(buf + 80) = 128; + uint16_t *fp16 = (uint16_t *)(buf + 128); + for (int i = 0; i < rows; i++) + for (int j = 0; j < cols; j++) { + float32x4_t f = vdupq_n_f32(src[i * cols + j]); + fp16[j * rows + i] = vreinterpret_u16_f16(vcvt_f16_f32(f))[0]; + } + *out_len = total; + return buf; +} + +void ane_bridge_free_blob(void *ptr) { free(ptr); } diff --git a/bridge/libane_bridge.dylib b/bridge/libane_bridge.dylib deleted file mode 100755 index 72acc32e285cd688d74b74f5735e8b6cfcae02e6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 54480 zcmeHw4RjR8m2S+rVKtc$F&0rt~OaTAH2E{QwlK2t7iX_1CIv!dxEve0DM)dRm z0t~~CW5+mN#`Y$Dc1|E*zeT;PEKW!^tnD?ioh%zCYvW{{-F>kUHft-vFToMUGyGWZ zyWKUTRtwO}%Xw#a&sHDq+`4sd-MaPN>guX#&EfTP|NGNnjJX)Dd{8MUUBK7_teC3U z7|;ecW5HnKyt^9Bs(~NUK#_4d57k6)WnRIcu58e`0Os_!&}Zq|ODs+@y6wNbEZHwm z`?#KHis!1F{(e|tsY=a}F-Ef8pO5>)*v%^#j5XaC>UZk&*FDa%OL8Y9)A(}Np!h1gqtzykS3l;^J+IC9N{`WnRgFap7LG79_3?2Aksj zcmH62J!ntYbc>mv_Hky@omJB~7lMMpCAG^I&#S4mW*cKixP`r$EKW1z>m0!bp zj79Y-l5FSt-Ha+f z$Qt7{2xzQXHnDX5lXn_)_QcS7x*9~jtlX3*40>aMsv1y=1N72qb5GH+NOuk!TzlbT)}_s@`i-gEQgd!xev_Dso~x7 zKO4M1{w0`tE;cg}*JdJVMAlGdRN2yK#x=0rNq8C9j-D_x0`^u`f2{-hBfP%Ot}veJ zn5JAAv%t)OHtEQIYKhcE4r=)k-Ym&#-!{`ON#%?>+Gr*JI^e4bza=$ z?!35zr5fO>bzk)yjO{i|>H6EPt;IpM_Y0{20)acXA7_WBI9Ls9S-$z1y>P`Kh0w4)^YEY+5o$H%ggSH#_vu7AZ3Y{J?f8 z)8k@?j({kRDWKMUo;hSceuV4r2*fo7W#2q0<8etD4EG@GzB6?qKUt64%&B{WrCBNf z-SzPE3i^f5kry{rVeEmu?w-QJI}QY?@R};L47gwBY@3FVB<|? zJO#!lo_yoHC(k(N@fy7zk8#H1Ha_;ajI$nQnDGRpj1Tb~@Qh7=AN{0JPci$x4NS+e z+j>tGbY_Cy&h$){sYJc>=ThcfPnq$aXOuzMUi4gR`~bZB7M3A@?0-m^i@g3I_yzbo z-@p!UEnsUG=d;G12I=s}9^bT2Jf%kZ_X|4BHY4EkD^mLQJj>rcw4?Z_FZY8|Is|%y z`+OQa!(%UW6{O5DY%550PMmG}@_oB7b-**n@|%(}`(AM!+wXB7s{xM3xsQjzdp%=; z2{XDqEf2IBdh$CB+-#rpr9S>V z*^V-v<36TyAKkzt+y1A#-8gWX8{_lX+Vo5*^I9QmynuOv=W1#9=Lwq%V-NQ;7Myao zpUY+7gjM(LWH)9&T$RvuKZ<*EF6Pn$Qo3g@%k)&^9;9+L<}~+_2Ts_~T%d71L+ubd z#tc3N_YLu}#ve>xVVpL3nejVt3^?`80``&L%Nk8gF`u`)rOXcQn`}M+5q=i7g(<@N zJn-^o`ferMw!4z`Te? z#D8uv%VgW0taf8AjKQ2PF{tf#U~kT&#Zu-ulz|Om9c%m*)x*ab;ClWBDT96C;B~Ft zo!aKL;{F=!DQ?R5XA%3LF?v0z6M2^HcG!-EZPxx;n|%rH!Ly^K%u`tBDUN@JEzS4c z;1t^~@Fzjt9+tsV{EZ2c7vm{0TFkmq<9=`)vr6&o*!{_T_(JX|ezb+ZQsYO^lfTX2 zH174_J3%>rr^#QL(O}k%GE{KvT&d23;l%(_3iUL; z5n?*)Mn3``KRYZph?g33%TETh1m9%vk_>%6InyDN}mI>IVBmSvta>&lH%)X_bVyv|w|G83f?t&=qVV(19N6Q0qK%Z*{C zTw)Yt+vHQ5a>E0e#&HJsak7YI&Uq@P(X-_$7i&C&+M3#}PWJAIa-K zMm;_2=Ad0Eh+=xp!x|}{nAfMvM@bpPm3evNrt}Gn<89cJzYo9(Cu>c_dIOo(oqqvO z4yQgU49XqZ{yJw%&n!HDx$V!vHx;l{&{J{h#D%`b-ig2p@%19Fgzayjr}jSsC;UlA z_3!cepMcYP^U3Qqom=y*xXXFmV|m=~Zrp&l#~Odb+Z+bB;x02|ISQHLHo&d8%Z(Bq z_jhu(6{hWr+;$ImIpW^xE;r*QoY*$(r@&ofurC3p=Q-U!y_hRz%y`zGM2`O*xa1?o z0?@oul7ak+3K)&TaG>4vd#;!rJdN*rD$*0i#3SWv`XwPX-(S; ze>BEF1E+g&C3pzL*Pw$FUd=Ptw42SkQez?bC)j^{@M+&@;CRrs4&`ib;5P5TrUK=@ z9qwaiR}~;GANG)iofon0Q(tBtFa~pFF#&n?+Nl%p%~@bEB*CH#aaV-=F1vv3JS({3`c34&0n`H*(v_ z+;()o?G|o30-WZ$+5hLbT?KE4In}*Cuak5vg?aTJOT9OdrC!B)N4WmPm0j=3jI$`y zygp;{65|ix2X=gUP+m_k%4;d|S_}LZBCiWNfv1_vSJ9@=<2rV6lc)3ID=hUJ?8kes zAK!n~+|KPdFN8qA{M2u;MtuNG-}kV??}k`v|3r4|nQC@yGGe21LRRO-yr*Y| z7li#+{ri~fA9xCl^B4NQadA_1C;Bt{?t6tDv!9nJC(k3E2F$Bg%rQE%;EZ;9!nIN+ zj2!l%Z=AVLhp=b(cVJ?z`#$rze-Q1cJ&n_iSW3|Ev#6uBD|Dsab(x*QbV@s zsGT#G&+}Ml4xC-(!u-J5mCpsja~0bEqL3Y;`MMf8S&f`bK@MCKtbEYiqq1eakoyh_RYKS3>*a+U+4w`?(rd(#^fzy0H zv%CuTTp%~^2|tFt49x*5(;1w~>EH{nbLPN`=I>#K}-Y?^Ou+pQ=n;t7X;!YWESLzh*-+*gNS9Rgmu6ZM*t_5QPT@zX-40p9o zShgBm0^bL|9sE7;SHLH1yR+;1ug&e6>T;zPx!kEW+t^1ZF7`E!ca@#;W1b%`+VTf{ zAJIs@k7`>U>nyrzSCRLsU8b$e_nJTNM|Uvt{Q$eLW(0eV_DR%MLYqRbZ`V}#G>Uw? zj=FuXN%N!^*pZ3X`6si@zC3GBr6XT}7wLFh!nhQ9)kr%9ao4lPDu^wf= zfA|FVIpIBJv%8|aV|;_p;~r54yyuvBmzj8P1m5_r^pbdw0p8^{y!(LnDV!Gx=R)RA z{nk@DZO?GFw)^40##5vBj9sE^dTeRomdBQ@-SXHq!22NZPHoxpixZyVuf4lv>o4B( zjGl&kmSS9~-xsn}v3ta>?}MhIJl^Fwg_lc>d(g*Kz~`^_I!+rNmQU0#UEPvN$^73~jy;cs7>|__7Ew*xa(lgvRj&{jRU*o*Z z-lz9&3|qNVCvV>AWvOSGdsooqJr%%sUcoxB&dg&Ca#f9-RUvl)>_>iV<`Uo4oi2VD z>z;f3TV}b;R>nP?tH*tuD_?Ibk8@bEmDm$W~rvE7#k~D{bYwZRLAy7(y=%p|2T2zipM`Ci=KghWMy+;uX2A0jMjV@K%Vp_-2SXhbFDDjY{w(Bu%u+FThSJoyJyva6O zW?!W&Z@Nzj>6eK@Q{u76I>nE!{O#5_{h^2)k1O$_0Zo?0biW!^bybe28pK2c%8TSt^XTM%iTUzxvGab%oQzJ|= zi@|nTZ=E@yPl`VjYikF(ym_U6GKP?du8GFhM+e!O=v#65r)Y|MwH2I#+O)~vSp zRI9N)anymmk=(2+n%`_`8QVkSfzPz~j3zb8tQXN$9mu#64WqL&`v4+Z6JH3sIS1Oto(@2|$>MW!w zn6NdfhQwp?1a#`Gay^>VMiF+K5>{<`Rf$*1I{I!(=qPeelve7bs)H&cO0)$oSXgP6 z6A^u>9K}m;4Wcz85dsUVAqoX=3~fd7U=1c-*&NdrD;@DlGsa4$tdk=Ngy9^CW=-O3 z9IbM^4)3AW4VA1_*dX(H!E6YyP`EW=rWqNYq5|LPG|}Y6<~yWl20BBZYA7+zr8V+ z4JgeCR@G6XL=+u!aETl@r$%lbatjKre?p=iT$oMgYq2)oQ6g$bmk({drs_&#Y(8#7 zOsQC#8j>RsOj?HTj2Zn|4skz-Ex`4@71A1v@s372=(JzFCbk zkn0}2NdMt5=ALym^Blkn{VB!FD;2Z6XGXAmyi39{j(O21aW~HXUc42jvbY2S0s(=5 zKtLcM5D*9m1Ox&C0fB%(Kp-Fx5C{ka1Ofs9fq+0jARrJB2nYlO0{=-6C{EHlH2eY( ze|~LodM`$@^GX5^dd!R08CE5Wh6cQewX-I&gu88BwIh* z@8|S8N2XHfj~<4lU*%J*RZ z9uvS>c&FNbNhnS+#p2{&_^DuiP{C1ey~{Nf;zJ-H5D*9m1Ox&C0fB%(Kp-Fx5C{ka z1Ofs9fq+0jARrJB2nYlO0s;YnfIvVXAP^7;2m}NI0s(=5KtLcM5D*9m1Ox&C0fB%( zKp-Fx5C{ka1Ofs9fq+0jARrJB2nYlO0s;YnfIvVXAP^7;{5L~jsrduS_z`t0Xqbz! zso-F_%gamU;IneIjFHSF{4euTePE*#g=JItqFR^gGa}pkfd6i~~&v-2qzSVcvT{kAhwXy#qQ2 zD)lmNjhE#;0@?@q8OWW-^6mg7LEp_|uCsZJ1(!8cN91^Xz7}h1&^0yMLRF1QTYF5C zwT>E9Q$l)7>tMmF)k^{m4Z%q;nXg8arE*kmQM5t2Wr?eiRzb9)yLM`s~0M2OX~o`I(35*nIF^I&+-=3PNvQuc-t`8#ilH+CuHXJ`s;7 z3Yy27?h6Iw1QMb|6aZBhi$&^Ums2F-h|p}(7H?@#q6o#5c;lR;%PK(K~ted*5$#KYDiaOQOvJFs-+2CZi*-YO_MLHtg$r@Vk-8_ zvnU^0B7{^05ma?7iu6QvYvP&nyk2QmG$k5B(2fb6HLi{73MMt5!*ol#f^t*|Hfd_O zMG3Np=$`km4{yQ8LpQmCx+X{C?XkELW~CnX%aN!~PICp@5)oYut~cjXoZaYQ>1ojH zoPm$iv+!}W7$52B__%;Y#w~|~v4q~9(Ak&biXIHb+S+l0fhRC;??sy%8S^zI)JT~7 z3^qk#O>7J|WV4iYa)iw@r_rL8K^~T?QB`M+n!-Zn6bq_R>T3s$E7ZZ6-?Wp7d&|m7vdq?f`uObSG#YXg+8GXd#I1u|=T8pe3NCpk<(}AKFje0=gCC>}xLSBv1fU z1*!(sfT-`;pgEu`#X$YoNB+pjC_>i^iSnPY!7#Gzp5{n{0_djDTG;r-``|DeN#^Azx(b@!h=orKKe zTrh4O?)b@pA3Y|mXq-PgdjPbbNhZnR@baenaLTN7w2K_(H1y#gc??wW$gEJ=ivbZ3@h=Frnb|G$JxwwkQvw{r=XrKc;c1KF>SHZ5wA4I z?F^w+j@M~Qv%0~lh?<9S@&g1euUrw0t1VF_yd)NFaaw6gTWpqZ)JDU~Al%HQzcEHDBIZb}O{KFt&A}ENo>X0xBlxxfN9vYfVA;GH zCBBy3ceCT zT?BL&DRB*F?9JVNHydV&Lw%OCXDuRXQ~yJBMz{kqOPqXgy8P6iYiDjb`;BK$jotI~ z8_)gImpvoj|E(fV|4jMf>2sGS-g>UDefYC)&oA2Y{6F5jxcKPXPp{kcy}56eY<{e+ z@yTlsfA7GZ$?uiEw(h3J*n{b{Cr|Fuo?P_< +#include +#include +#include +#include +#include +#include "ane_bridge.h" + +static mach_timebase_info_data_t g_tb; +static double ms(void) { + return (double)mach_absolute_time() * g_tb.numer / g_tb.denom / 1e6; +} + +static int passed = 0, failed = 0; +#define PASS(msg) do { printf(" [PASS] %s\n", msg); passed++; } while(0) +#define FAIL(msg) do { printf(" [FAIL] %s\n", msg); failed++; } while(0) +#define CHECK(cond, msg) do { if (cond) PASS(msg); else FAIL(msg); } while(0) + +// Correct MIL header — must match exactly what ANE compiler expects +#define MIL_HDR \ + "program(1.3)\n" \ + "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " \ + "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " \ + "{\"coremltools-version\", \"9.0\"}})]\n{\n" + +// Dynamic weight matmul: y[1,N,M] = W[1,N,K] @ x[1,K,M] +// x declared as [1,K,1,M] (4D), W as [1,N,K] (3D dynamic weight) +static const char *mil_dyn_matmul(int N, int K, int M) { + static char buf[2048]; + snprintf(buf, sizeof(buf), + MIL_HDR + " func main(\n" + " tensor x,\n" + " tensor W) {\n" + " tensor sh = const()[name=string(\"sh\"), val=tensor([1,%d,%d])];\n" + " tensor x3 = reshape(shape=sh,x=x)[name=string(\"rx\")];\n" + " bool bF = const()[name=string(\"bF\"), val=bool(false)];\n" + " tensor out = matmul(transpose_x=bF,transpose_y=bF,x=W,y=x3)[name=string(\"mm\")];\n" + " } -> (out);\n" + "}\n", + K, M, // x: [1,K,1,M] + N, K, // W: [1,N,K] + K, M, // reshape sh + K, M, // x3 + N, M // out + ); + return buf; +} + +// No-weight MIL: elementwise add x+x = 2x +static const char *mil_scale2(int rows, int cols) { + static char buf[1024]; + snprintf(buf, sizeof(buf), + MIL_HDR + " func main(tensor x) {\n" + " tensor out = add(x=x,y=x)[name=string(\"out\")];\n" + " } -> (out);\n" + "}\n", + rows, cols, rows, cols + ); + return buf; +} + +static void fill_fp16(void *buf, float val, int count) { + uint16_t *p = (uint16_t *)buf; + _Float16 fval = (_Float16)val; + uint16_t bits; + memcpy(&bits, &fval, 2); + for (int i = 0; i < count; i++) p[i] = bits; +} + +static float read_fp16_elem(const void *buf, int idx) { + const uint16_t *p = (const uint16_t *)buf; + _Float16 v; + memcpy(&v, &p[idx], 2); + return (float)v; +} + +// --- Test 1: init --- +static void test_init(void) { + printf("\n[1] ane_bridge_init\n"); + int r = ane_bridge_init(); + CHECK(r == 0, "init returns 0"); +} + +// --- Test 2: compile_dyn — compile once with dynamic weight --- +static ANEKernelHandle *g_dyn_kern = NULL; +static void test_compile_dyn(void) { + printf("\n[2] ane_bridge_compile_dyn (dynamic weight IOSurface)\n"); + + int N=64, K=64, M=64; + const char *mil = mil_dyn_matmul(N, K, M); + size_t mil_len = strlen(mil); + + // x: [1,K,1,M] fp16, W: [1,N,K] fp16, out: [1,N,M] fp16 + size_t in_sz = (size_t)K * M * 2; + size_t w_sz = (size_t)N * K * 2; + size_t out_sz = (size_t)N * M * 2; + + double t0 = ms(); + g_dyn_kern = ane_bridge_compile_dyn(mil, mil_len, 1, &in_sz, 1, &w_sz, out_sz); + double elapsed = ms() - t0; + + CHECK(g_dyn_kern != NULL, "compile_dyn returns non-NULL handle"); + printf(" Compile time: %.1fms\n", elapsed); +} + +// --- Test 3: write_weight + eval — dynamic weights actually update --- +static void test_dynamic_weight_update(void) { + printf("\n[3] Dynamic weight update (write W, eval, check output changes)\n"); + if (!g_dyn_kern) { FAIL("no kernel from test 2"); return; } + + int N=64, K=64, M=64; + size_t in_sz = (size_t)K * M * 2; + size_t w_sz = (size_t)N * K * 2; + size_t out_sz = (size_t)N * M * 2; + + uint16_t *xbuf = (uint16_t *)malloc(in_sz); + uint16_t *wbuf_A = (uint16_t *)calloc(N*K, 2); + uint16_t *wbuf_B = (uint16_t *)calloc(N*K, 2); + uint16_t *out = (uint16_t *)malloc(out_sz); + + // x = all 1.0 + fill_fp16(xbuf, 1.0f, K*M); + + // W_A = identity (scale=1): diagonal 1.0 + _Float16 one = 1.0f; uint16_t one_bits; memcpy(&one_bits, &one, 2); + _Float16 two = 2.0f; uint16_t two_bits; memcpy(&two_bits, &two, 2); + for (int i = 0; i < N && i < K; i++) wbuf_A[i*K + i] = one_bits; + for (int i = 0; i < N && i < K; i++) wbuf_B[i*K + i] = two_bits; + + // Eval A: W = identity + ane_bridge_write_input(g_dyn_kern, 0, xbuf, in_sz); + ane_bridge_write_weight(g_dyn_kern, 0, wbuf_A, w_sz); + bool ok = ane_bridge_eval(g_dyn_kern); + CHECK(ok, "eval A succeeds"); + ane_bridge_read_output(g_dyn_kern, 0, out, out_sz); + float sum_A = 0; + for (int i = 0; i < N*M; i++) sum_A += read_fp16_elem(out, i); + + // Eval B: W = 2x identity, NO recompile + double t0 = ms(); + ane_bridge_write_weight(g_dyn_kern, 0, wbuf_B, w_sz); + double write_ms = ms() - t0; + + t0 = ms(); + ok = ane_bridge_eval(g_dyn_kern); + double eval_ms = ms() - t0; + + CHECK(ok, "eval B succeeds after weight update"); + ane_bridge_read_output(g_dyn_kern, 0, out, out_sz); + float sum_B = 0; + for (int i = 0; i < N*M; i++) sum_B += read_fp16_elem(out, i); + + printf(" sum(out) W=identity: %.1f W=2x: %.1f ratio: %.2f (expect ~2.0)\n", + sum_A, sum_B, sum_B / (sum_A + 1e-9f)); + CHECK(fabsf(sum_B / (sum_A + 1e-9f) - 2.0f) < 0.1f, "output doubled after weight update"); + printf(" write_weight: %.3fms eval: %.2fms\n", write_ms, eval_ms); + + free(xbuf); free(wbuf_A); free(wbuf_B); free(out); +} + +// --- Test 4: write_weight_f32 --- +static void test_write_weight_f32(void) { + printf("\n[4] ane_bridge_write_weight_f32 (fp32 -> fp16 conversion)\n"); + if (!g_dyn_kern) { FAIL("no kernel"); return; } + + int N=64, K=64, M=64; + float *w_fp32 = (float *)calloc(N*K, 4); + uint16_t *out1 = (uint16_t *)malloc((size_t)N*M*2); + uint16_t *out2 = (uint16_t *)malloc((size_t)N*M*2); + uint16_t *xbuf = (uint16_t *)malloc((size_t)K*M*2); + fill_fp16(xbuf, 1.0f, K*M); + + // 3x identity in fp32 + for (int i = 0; i < N && i < K; i++) w_fp32[i*K + i] = 3.0f; + ane_bridge_write_input(g_dyn_kern, 0, xbuf, (size_t)K*M*2); + ane_bridge_write_weight_f32(g_dyn_kern, 0, w_fp32, (size_t)N*K); + bool ok = ane_bridge_eval(g_dyn_kern); + CHECK(ok, "eval with fp32-written weight succeeds"); + ane_bridge_read_output(g_dyn_kern, 0, out1, (size_t)N*M*2); + float sum = 0; + for (int i = 0; i < N*M; i++) sum += read_fp16_elem(out1, i); + float expected = 3.0f * N; // 3x identity: each row sums x (all 1s) scaled by 3 + // Actually identity matmul with x=all-1s: row i of output = row i of W dotted with x + // row i of identity (scaled 3) = 3 at position i, 0 elsewhere. So out[i][t] = 3*x[i][t] = 3 + printf(" sum(out) with 3x identity (fp32 write): %.1f\n", sum); + CHECK(fabsf(sum - 3.0f * N * M) < N*M*0.1f, "fp32 weight write produces correct output"); + + free(w_fp32); free(out1); free(out2); free(xbuf); +} + +// --- Test 5: copy_io --- +static void test_copy_io(void) { + printf("\n[5] ane_bridge_copy_io (direct IOSurface-to-IOSurface)\n"); + + // Compile a simple no-weight kernel: scale x by 2 + int R=32, C=32; + const char *mil = mil_scale2(R, C); + size_t sz = (size_t)R * C * 2; + + ANEKernelHandle *k1 = ane_bridge_compile_dyn(mil, strlen(mil), 1, &sz, 0, NULL, sz); + ANEKernelHandle *k2 = ane_bridge_compile_dyn(mil, strlen(mil), 1, &sz, 0, NULL, sz); + CHECK(k1 != NULL && k2 != NULL, "two scale2 kernels compiled"); + if (!k1 || !k2) { ane_bridge_free(k1); ane_bridge_free(k2); return; } + + // k1 input = 1.0, k1 out = 2.0, copy to k2 in, k2 out = 4.0 + uint16_t *buf = (uint16_t *)malloc(sz); + uint16_t *out = (uint16_t *)malloc(sz); + fill_fp16(buf, 1.0f, R*C); + + ane_bridge_write_input(k1, 0, buf, sz); + ane_bridge_eval(k1); + ane_bridge_copy_io(k1, 0, k2, 0); // k2 input = k1 output (should be 2.0) + ane_bridge_eval(k2); + ane_bridge_read_output(k2, 0, out, sz); + + float val = read_fp16_elem(out, 0); + printf(" input=1.0 -> k1(x2) -> copy -> k2(x2) -> output=%.1f (expect 4.0)\n", val); + CHECK(fabsf(val - 4.0f) < 0.2f, "copy_io chains two kernels without CPU round-trip"); + + free(buf); free(out); + ane_bridge_free(k1); + ane_bridge_free(k2); +} + +// --- Test 6: begin/end realtime --- +static void test_realtime(void) { + printf("\n[6] ane_bridge_begin/end_realtime\n"); + if (!g_dyn_kern) { FAIL("no kernel"); return; } + + // Just verify they don't crash. Jitter improvement only visible over many samples. + ane_bridge_begin_realtime(); + bool ok = ane_bridge_eval(g_dyn_kern); + ane_bridge_end_realtime(); + CHECK(ok, "eval inside realtime task succeeds"); + printf(" (p99 jitter improvement requires statistical measurement — see test_realtime_task.m)\n"); +} + +// --- Test 7: compile cache --- +static void test_compile_cache(void) { + printf("\n[7] Compile cache (cache files written to ~/.ane_cache/)\n"); + + int N=32, M=32; + const char *mil = mil_scale2(N, M); + size_t sz = (size_t)N*M*2; + + ANEKernelHandle *k = ane_bridge_compile_dyn(mil, strlen(mil), 1, &sz, 0, NULL, sz); + CHECK(k != NULL, "kernel compiled for cache test"); + if (!k) return; + + // Verify cache files exist on disk + // We need the hex ID — compile a fresh model to get it, then check cache + NSString *home = NSHomeDirectory(); + NSString *cacheRoot = [home stringByAppendingPathComponent:@".ane_cache"]; + NSFileManager *fm = [NSFileManager defaultManager]; + // Dynamic-weight models produce net.plist only (no data file) + NSArray *entries = [fm contentsOfDirectoryAtPath:cacheRoot error:nil]; + BOOL found_plist = NO; + for (NSString *entry in entries) { + NSString *plistPath = [[cacheRoot stringByAppendingPathComponent:entry] + stringByAppendingPathComponent:@"net.plist"]; + if ([fm fileExistsAtPath:plistPath]) { found_plist = YES; break; } + } + CHECK(found_plist, "compiled kernel cached to ~/.ane_cache/"); + + // Free k first, then recompile — should hit cache (load only, no compile) + ane_bridge_free(k); + int before = ane_bridge_get_compile_count(); + ANEKernelHandle *k2 = ane_bridge_compile_dyn(mil, strlen(mil), 1, &sz, 0, NULL, sz); + int after = ane_bridge_get_compile_count(); + CHECK(k2 != NULL, "cache hit returns valid kernel"); + CHECK(after == before, "cache hit does not increment compile count"); + ane_bridge_free(k2); +} + +// --- Test 8: free --- +static void test_free(void) { + printf("\n[8] ane_bridge_free\n"); + ane_bridge_free(g_dyn_kern); + g_dyn_kern = NULL; + PASS("free does not crash"); +} + +int main(void) { + @autoreleasepool { + mach_timebase_info(&g_tb); + printf("=== ane_bridge test suite ===\n"); + + test_init(); + test_compile_dyn(); + test_dynamic_weight_update(); + test_write_weight_f32(); + test_copy_io(); + test_realtime(); + test_compile_cache(); + test_free(); + + printf("\n=== Results: %d passed, %d failed ===\n", passed, failed); + return failed > 0 ? 1 : 0; + } +}