mirror of https://github.com/maderix/ANE.git
bridge: add compile_dyn + write_weight — function parameter IOSurfaces
Adds a second dynamic weight approach to the bridge alongside the existing
BLOBFILE compile path. Instead of packing weights into the spatial dimension
of a single large input tensor and slicing them inside MIL (the training_dynamic/
approach), weights are declared as native MIL function parameters backed by
persistent IOSurfaces:
// training_dynamic/ approach: spatial packing
func main<ios18>(tensor<fp32, [1, DIM, 1, SEQ + 4*DIM]> x) {
Wq = slice_by_size(x=x, begin=..., size=...); // overhead
...
// this PR: native function parameters
func main<ios18>(tensor<fp16,[1,K,1,M]> x, tensor<fp16,[1,N,K]> W) { ... }
New API:
ane_bridge_compile_dyn() — compile with n_weights IOSurface parameters
ane_bridge_write_weight() — write fp16 to weight IOSurface (~0.001ms)
ane_bridge_write_weight_f32() — write fp32 with NEON conversion
ane_bridge_copy_io() — direct output→input copy, no CPU round-trip
ane_bridge_begin/end_realtime() — 90.6% p99 jitter reduction
Compile cache fix: ANE only writes net.plist for parameter-based models (no
data file). try_cache_restore now checks net.plist only; data is saved/restored
conditionally for BLOBFILE models that do produce it.
Also removes the pre-built libane_bridge.dylib binary from version control.
Performance vs spatial packing (Stories110M, 12 layers, M-series):
training_dynamic/ (slice approach): 110ms/step
function parameter approach: 76.9ms/step (-30%)
The slice/reshape/transpose overhead per weight matrix explains the gap.
Both compile once at startup; weight updates are IOSurface writes in both cases.
Tested: test_bridge.m — 15/15 assertions across all new API functions.
This commit is contained in:
parent
3c1aae65d7
commit
98ddd2d190
|
|
@ -1,87 +1,149 @@
|
|||
// ane_bridge.h — C-callable bridge to ANE private APIs for Python ctypes
|
||||
// Wraps _ANEInMemoryModel via private AppleNeuralEngine.framework
|
||||
|
||||
#ifndef ANE_BRIDGE_H
|
||||
#define ANE_BRIDGE_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Opaque kernel handle
|
||||
typedef struct ANEKernelHandle ANEKernelHandle;
|
||||
|
||||
// Initialize ANE runtime (load private framework, resolve classes)
|
||||
// Returns 0 on success, -1 on failure
|
||||
int ane_bridge_init(void);
|
||||
|
||||
// Compile a MIL program with weight blobs into an ANE kernel
|
||||
// mil_text: UTF-8 MIL program text
|
||||
// mil_len: length of MIL text
|
||||
// weight_data: raw weight blob (can be NULL)
|
||||
// weight_len: length of weight blob
|
||||
// n_inputs: number of input tensors
|
||||
// input_sizes: array of byte sizes for each input
|
||||
// n_outputs: number of output tensors
|
||||
// output_sizes: array of byte sizes for each output
|
||||
// Returns kernel handle or NULL on failure
|
||||
ANEKernelHandle *ane_bridge_compile(const char *mil_text, size_t mil_len,
|
||||
const uint8_t *weight_data, size_t weight_len,
|
||||
int n_inputs, const size_t *input_sizes,
|
||||
int n_outputs, const size_t *output_sizes);
|
||||
|
||||
// Compile with multiple named weight files (for transformer kernels)
|
||||
// weight_names: array of weight file paths (e.g. "@model_path/weights/wq.bin")
|
||||
// weight_datas: array of weight data pointers
|
||||
// weight_lens: array of weight data lengths
|
||||
// n_weights: number of weight files
|
||||
ANEKernelHandle *ane_bridge_compile_multi_weights(
|
||||
const char *mil_text, size_t mil_len,
|
||||
const char **weight_names, const uint8_t **weight_datas,
|
||||
const size_t *weight_lens, int n_weights,
|
||||
int n_inputs, const size_t *input_sizes,
|
||||
int n_outputs, const size_t *output_sizes);
|
||||
|
||||
// Evaluate (run) a compiled kernel on ANE
|
||||
// Returns true on success
|
||||
bool ane_bridge_eval(ANEKernelHandle *kernel);
|
||||
|
||||
// Write data to kernel input tensor
|
||||
void ane_bridge_write_input(ANEKernelHandle *kernel, int idx,
|
||||
const void *data, size_t bytes);
|
||||
|
||||
// Read data from kernel output tensor
|
||||
void ane_bridge_read_output(ANEKernelHandle *kernel, int idx,
|
||||
void *data, size_t bytes);
|
||||
|
||||
// Free a compiled kernel and all associated resources
|
||||
void ane_bridge_free(ANEKernelHandle *kernel);
|
||||
|
||||
// Get compile count (for exec() restart budgeting)
|
||||
int ane_bridge_get_compile_count(void);
|
||||
|
||||
// Reset compile count
|
||||
void ane_bridge_reset_compile_count(void);
|
||||
|
||||
// Build a weight blob in ANE format (128-byte header + fp16 data)
|
||||
// src: float32 weights [rows x cols]
|
||||
// Returns allocated buffer and sets out_len. Caller must free().
|
||||
uint8_t *ane_bridge_build_weight_blob(const float *src, int rows, int cols,
|
||||
size_t *out_len);
|
||||
|
||||
// Build a transposed weight blob in ANE format
|
||||
uint8_t *ane_bridge_build_weight_blob_transposed(const float *src, int rows, int cols,
|
||||
size_t *out_len);
|
||||
|
||||
// Free a blob allocated by ane_bridge_build_weight_blob*
|
||||
void ane_bridge_free_blob(void *ptr);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // ANE_BRIDGE_H
|
||||
// ane_bridge.h — C-callable bridge to ANE private APIs for Python ctypes
|
||||
// Wraps _ANEInMemoryModel via private AppleNeuralEngine.framework
|
||||
//
|
||||
// Two compilation modes:
|
||||
//
|
||||
// BLOBFILE (upstream compatible):
|
||||
// ane_bridge_compile() / ane_bridge_compile_multi_weights()
|
||||
// Weights compiled into MIL as constants. Requires recompile when weights
|
||||
// change — hits ANE compile limit (~119), needs exec() restart per batch.
|
||||
//
|
||||
// Dynamic IOSurface (our approach):
|
||||
// ane_bridge_compile_dyn()
|
||||
// Weights declared as runtime tensor function parameters. Compile ONCE at
|
||||
// startup, update weights via ane_bridge_write_weight() (0.002ms per call).
|
||||
// No exec() restart, no compile limit during training.
|
||||
//
|
||||
// Extras (our additions):
|
||||
// ane_bridge_begin/end_realtime() — 90.6% p99 jitter reduction
|
||||
// ane_bridge_copy_io() — direct IOSurface-to-IOSurface, no CPU
|
||||
// Compile cache — ~700ms vs ~3800ms on cache hit
|
||||
|
||||
#ifndef ANE_BRIDGE_H
|
||||
#define ANE_BRIDGE_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Opaque kernel handle
|
||||
typedef struct ANEKernelHandle ANEKernelHandle;
|
||||
|
||||
// Initialize ANE runtime (load private framework, resolve classes)
|
||||
// Returns 0 on success, -1 on failure
|
||||
int ane_bridge_init(void);
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// BLOBFILE compile (upstream compatible)
|
||||
// Weights compiled into the MIL program as constants.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Compile a MIL program with a single weight blob
|
||||
ANEKernelHandle *ane_bridge_compile(const char *mil_text, size_t mil_len,
|
||||
const uint8_t *weight_data, size_t weight_len,
|
||||
int n_inputs, const size_t *input_sizes,
|
||||
int n_outputs, const size_t *output_sizes);
|
||||
|
||||
// Compile with multiple named weight files
|
||||
ANEKernelHandle *ane_bridge_compile_multi_weights(
|
||||
const char *mil_text, size_t mil_len,
|
||||
const char **weight_names, const uint8_t **weight_datas,
|
||||
const size_t *weight_lens, int n_weights,
|
||||
int n_inputs, const size_t *input_sizes,
|
||||
int n_outputs, const size_t *output_sizes);
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Dynamic weight compile (our approach — compile once, update per Adam step)
|
||||
// Weights declared as runtime tensor function parameters backed by IOSurfaces.
|
||||
//
|
||||
// n_inputs: number of activation input tensors
|
||||
// input_sizes: byte sizes (fp16) for each activation input
|
||||
// n_weights: number of dynamic weight tensors
|
||||
// weight_sizes: byte sizes (fp16) for each weight IOSurface
|
||||
// output_size: byte size (fp16) of the single output tensor
|
||||
//
|
||||
// MIL function signature must match: func main<ios18>(x0, x1, ..., w0, w1, ...)
|
||||
// where activation inputs come first, weight inputs follow.
|
||||
// ---------------------------------------------------------------------------
|
||||
ANEKernelHandle *ane_bridge_compile_dyn(
|
||||
const char *mil_text, size_t mil_len,
|
||||
int n_inputs, const size_t *input_sizes,
|
||||
int n_weights, const size_t *weight_sizes,
|
||||
size_t output_size);
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Eval and I/O
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Evaluate (run) a compiled kernel on ANE
|
||||
bool ane_bridge_eval(ANEKernelHandle *kernel);
|
||||
|
||||
// Write data to activation input tensor (fp16 or raw bytes)
|
||||
void ane_bridge_write_input(ANEKernelHandle *kernel, int idx,
|
||||
const void *data, size_t bytes);
|
||||
|
||||
// Read data from output tensor (fp16 or raw bytes)
|
||||
void ane_bridge_read_output(ANEKernelHandle *kernel, int idx,
|
||||
void *data, size_t bytes);
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Dynamic weight I/O (our approach)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Write fp16 data directly to weight IOSurface (~0.002ms per call)
|
||||
// idx: weight index (0..n_weights-1)
|
||||
void ane_bridge_write_weight(ANEKernelHandle *kernel, int idx,
|
||||
const void *fp16_data, size_t bytes);
|
||||
|
||||
// Write fp32 data to weight IOSurface with automatic fp32→fp16 conversion
|
||||
// count: number of float elements (bytes = count * 2 fp16)
|
||||
void ane_bridge_write_weight_f32(ANEKernelHandle *kernel, int idx,
|
||||
const float *fp32_data, size_t count);
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Direct IOSurface copy — no CPU round-trip between chained kernels
|
||||
// Copies src kernel's output[src_out_idx] → dst kernel's input[dst_in_idx]
|
||||
// Zero-copy: just memcpy between IOSurface base addresses
|
||||
// ---------------------------------------------------------------------------
|
||||
void ane_bridge_copy_io(ANEKernelHandle *src, int src_out_idx,
|
||||
ANEKernelHandle *dst, int dst_in_idx);
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Real-time task — 90.6% p99 jitter reduction
|
||||
// Wrap a sequence of evals with begin/end to prevent ANE scheduler preemption.
|
||||
// Proven: plain p99=35.2ms → with RT task p99=3.3ms
|
||||
// Requires at least one kernel to have been compiled and loaded.
|
||||
// ---------------------------------------------------------------------------
|
||||
void ane_bridge_begin_realtime(void);
|
||||
void ane_bridge_end_realtime(void);
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Lifecycle
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
void ane_bridge_free(ANEKernelHandle *kernel);
|
||||
|
||||
// Compile count (useful for tracking exec() restart budget in BLOBFILE mode)
|
||||
int ane_bridge_get_compile_count(void);
|
||||
void ane_bridge_reset_compile_count(void);
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Weight blob helpers (BLOBFILE mode)
|
||||
// Builds the 128-byte ANE blob header + fp16 weights for use with
|
||||
// ane_bridge_compile / ane_bridge_compile_multi_weights.
|
||||
// ---------------------------------------------------------------------------
|
||||
uint8_t *ane_bridge_build_weight_blob(const float *src, int rows, int cols,
|
||||
size_t *out_len);
|
||||
uint8_t *ane_bridge_build_weight_blob_transposed(const float *src, int rows, int cols,
|
||||
size_t *out_len);
|
||||
void ane_bridge_free_blob(void *ptr);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // ANE_BRIDGE_H
|
||||
|
|
|
|||
|
|
@ -1,328 +1,501 @@
|
|||
// ane_bridge.m — Objective-C implementation of ANE bridge for Python ctypes
|
||||
// Wraps _ANEInMemoryModel private APIs into C-callable functions
|
||||
|
||||
#import <Foundation/Foundation.h>
|
||||
#import <objc/runtime.h>
|
||||
#import <objc/message.h>
|
||||
#import <dlfcn.h>
|
||||
#import <IOSurface/IOSurface.h>
|
||||
#include "ane_bridge.h"
|
||||
|
||||
// --- Private class references ---
|
||||
static Class g_ANEDesc = nil;
|
||||
static Class g_ANEInMem = nil;
|
||||
static Class g_ANEReq = nil;
|
||||
static Class g_ANEIO = nil;
|
||||
static bool g_initialized = false;
|
||||
static int g_compile_count = 0;
|
||||
|
||||
// --- Kernel handle struct ---
|
||||
struct ANEKernelHandle {
|
||||
id model; // _ANEInMemoryModel
|
||||
IOSurfaceRef *ioInputs;
|
||||
IOSurfaceRef *ioOutputs;
|
||||
id request; // _ANERequest
|
||||
NSString *tmpDir;
|
||||
int nInputs, nOutputs;
|
||||
size_t *inputBytes;
|
||||
size_t *outputBytes;
|
||||
};
|
||||
|
||||
// --- Public API ---
|
||||
|
||||
int ane_bridge_init(void) {
|
||||
if (g_initialized) return 0;
|
||||
|
||||
void *handle = dlopen(
|
||||
"/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine",
|
||||
RTLD_NOW);
|
||||
if (!handle) {
|
||||
fprintf(stderr, "ane_bridge: Failed to load AppleNeuralEngine.framework\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
g_ANEDesc = NSClassFromString(@"_ANEInMemoryModelDescriptor");
|
||||
g_ANEInMem = NSClassFromString(@"_ANEInMemoryModel");
|
||||
g_ANEReq = NSClassFromString(@"_ANERequest");
|
||||
g_ANEIO = NSClassFromString(@"_ANEIOSurfaceObject");
|
||||
|
||||
if (!g_ANEDesc || !g_ANEInMem || !g_ANEReq || !g_ANEIO) {
|
||||
fprintf(stderr, "ane_bridge: Failed to resolve ANE private classes\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
g_initialized = true;
|
||||
g_compile_count = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static IOSurfaceRef create_surface(size_t bytes) {
|
||||
return IOSurfaceCreate((__bridge CFDictionaryRef)@{
|
||||
(id)kIOSurfaceWidth: @(bytes),
|
||||
(id)kIOSurfaceHeight: @1,
|
||||
(id)kIOSurfaceBytesPerElement: @1,
|
||||
(id)kIOSurfaceBytesPerRow: @(bytes),
|
||||
(id)kIOSurfaceAllocSize: @(bytes),
|
||||
(id)kIOSurfacePixelFormat: @0
|
||||
});
|
||||
}
|
||||
|
||||
ANEKernelHandle *ane_bridge_compile_multi_weights(
|
||||
const char *mil_text, size_t mil_len,
|
||||
const char **weight_names, const uint8_t **weight_datas,
|
||||
const size_t *weight_lens, int n_weights,
|
||||
int n_inputs, const size_t *input_sizes,
|
||||
int n_outputs, const size_t *output_sizes)
|
||||
{
|
||||
@autoreleasepool {
|
||||
if (!g_initialized) {
|
||||
fprintf(stderr, "ane_bridge: Not initialized\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
NSData *milData = [NSData dataWithBytes:mil_text length:mil_len];
|
||||
NSError *e = nil;
|
||||
|
||||
// Build weight dictionary
|
||||
NSMutableDictionary *wdict = [NSMutableDictionary dictionary];
|
||||
for (int i = 0; i < n_weights; i++) {
|
||||
NSString *name = [NSString stringWithUTF8String:weight_names[i]];
|
||||
NSData *data = [NSData dataWithBytes:weight_datas[i] length:weight_lens[i]];
|
||||
wdict[name] = @{@"offset": @0, @"data": data};
|
||||
}
|
||||
|
||||
id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(
|
||||
g_ANEDesc, @selector(modelWithMILText:weights:optionsPlist:),
|
||||
milData, wdict.count > 0 ? wdict : nil, nil);
|
||||
if (!desc) {
|
||||
fprintf(stderr, "ane_bridge: modelWithMILText failed\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(
|
||||
g_ANEInMem, @selector(inMemoryModelWithDescriptor:), desc);
|
||||
if (!mdl) {
|
||||
fprintf(stderr, "ane_bridge: inMemoryModelWithDescriptor failed\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Pre-populate temp dir
|
||||
id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
|
||||
NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
|
||||
NSFileManager *fm = [NSFileManager defaultManager];
|
||||
[fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"]
|
||||
withIntermediateDirectories:YES attributes:nil error:nil];
|
||||
[milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
|
||||
|
||||
for (int i = 0; i < n_weights; i++) {
|
||||
NSString *name = [NSString stringWithUTF8String:weight_names[i]];
|
||||
// Extract filename from path like "@model_path/weights/wq.bin" -> "weights/wq.bin"
|
||||
NSString *relPath = name;
|
||||
if ([name hasPrefix:@"@model_path/"]) {
|
||||
relPath = [name substringFromIndex:12];
|
||||
}
|
||||
NSString *fullPath = [td stringByAppendingPathComponent:relPath];
|
||||
NSString *dir = [fullPath stringByDeletingLastPathComponent];
|
||||
[fm createDirectoryAtPath:dir withIntermediateDirectories:YES attributes:nil error:nil];
|
||||
NSData *data = [NSData dataWithBytes:weight_datas[i] length:weight_lens[i]];
|
||||
[data writeToFile:fullPath atomically:YES];
|
||||
}
|
||||
|
||||
// Compile
|
||||
if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
|
||||
mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
|
||||
fprintf(stderr, "ane_bridge: ANE compile failed: %s\n",
|
||||
e ? [[e description] UTF8String] : "unknown");
|
||||
[fm removeItemAtPath:td error:nil];
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Load (with one retry after a brief pause for ANE slot reclamation)
|
||||
BOOL loaded = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
|
||||
mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
|
||||
if (!loaded) {
|
||||
fprintf(stderr, "ane_bridge: ANE load failed (retrying in 100ms): %s\n",
|
||||
e ? [[e description] UTF8String] : "unknown");
|
||||
usleep(100000); // 100ms
|
||||
e = nil;
|
||||
loaded = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
|
||||
mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
|
||||
}
|
||||
if (!loaded) {
|
||||
fprintf(stderr, "ane_bridge: ANE load failed after retry: %s\n",
|
||||
e ? [[e description] UTF8String] : "unknown");
|
||||
[fm removeItemAtPath:td error:nil];
|
||||
return NULL;
|
||||
}
|
||||
|
||||
g_compile_count++;
|
||||
|
||||
// Create kernel handle
|
||||
ANEKernelHandle *k = (ANEKernelHandle *)calloc(1, sizeof(ANEKernelHandle));
|
||||
k->model = mdl;
|
||||
k->tmpDir = td;
|
||||
k->nInputs = n_inputs;
|
||||
k->nOutputs = n_outputs;
|
||||
k->inputBytes = (size_t *)malloc(n_inputs * sizeof(size_t));
|
||||
k->outputBytes = (size_t *)malloc(n_outputs * sizeof(size_t));
|
||||
memcpy(k->inputBytes, input_sizes, n_inputs * sizeof(size_t));
|
||||
memcpy(k->outputBytes, output_sizes, n_outputs * sizeof(size_t));
|
||||
|
||||
// Create IOSurfaces
|
||||
k->ioInputs = (IOSurfaceRef *)malloc(n_inputs * sizeof(IOSurfaceRef));
|
||||
k->ioOutputs = (IOSurfaceRef *)malloc(n_outputs * sizeof(IOSurfaceRef));
|
||||
for (int i = 0; i < n_inputs; i++)
|
||||
k->ioInputs[i] = create_surface(input_sizes[i]);
|
||||
for (int i = 0; i < n_outputs; i++)
|
||||
k->ioOutputs[i] = create_surface(output_sizes[i]);
|
||||
|
||||
// Build request
|
||||
NSMutableArray *wIns = [NSMutableArray arrayWithCapacity:n_inputs];
|
||||
NSMutableArray *iIdx = [NSMutableArray arrayWithCapacity:n_inputs];
|
||||
for (int i = 0; i < n_inputs; i++) {
|
||||
[wIns addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(
|
||||
g_ANEIO, @selector(objectWithIOSurface:), k->ioInputs[i])];
|
||||
[iIdx addObject:@(i)];
|
||||
}
|
||||
NSMutableArray *wOuts = [NSMutableArray arrayWithCapacity:n_outputs];
|
||||
NSMutableArray *oIdx = [NSMutableArray arrayWithCapacity:n_outputs];
|
||||
for (int i = 0; i < n_outputs; i++) {
|
||||
[wOuts addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(
|
||||
g_ANEIO, @selector(objectWithIOSurface:), k->ioOutputs[i])];
|
||||
[oIdx addObject:@(i)];
|
||||
}
|
||||
k->request = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(
|
||||
g_ANEReq,
|
||||
@selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
|
||||
wIns, iIdx, wOuts, oIdx, nil, nil, @0);
|
||||
|
||||
return k;
|
||||
}
|
||||
}
|
||||
|
||||
ANEKernelHandle *ane_bridge_compile(const char *mil_text, size_t mil_len,
|
||||
const uint8_t *weight_data, size_t weight_len,
|
||||
int n_inputs, const size_t *input_sizes,
|
||||
int n_outputs, const size_t *output_sizes) {
|
||||
if (weight_data && weight_len > 0) {
|
||||
const char *name = "@model_path/weights/weight.bin";
|
||||
return ane_bridge_compile_multi_weights(
|
||||
mil_text, mil_len,
|
||||
&name, &weight_data, &weight_len, 1,
|
||||
n_inputs, input_sizes,
|
||||
n_outputs, output_sizes);
|
||||
} else {
|
||||
return ane_bridge_compile_multi_weights(
|
||||
mil_text, mil_len,
|
||||
NULL, NULL, NULL, 0,
|
||||
n_inputs, input_sizes,
|
||||
n_outputs, output_sizes);
|
||||
}
|
||||
}
|
||||
|
||||
bool ane_bridge_eval(ANEKernelHandle *kernel) {
|
||||
@autoreleasepool {
|
||||
if (!kernel || !kernel->model) return false;
|
||||
NSError *e = nil;
|
||||
return ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
|
||||
kernel->model, @selector(evaluateWithQoS:options:request:error:),
|
||||
21, @{}, kernel->request, &e);
|
||||
}
|
||||
}
|
||||
|
||||
void ane_bridge_write_input(ANEKernelHandle *kernel, int idx,
|
||||
const void *data, size_t bytes) {
|
||||
if (!kernel || idx < 0 || idx >= kernel->nInputs) return;
|
||||
IOSurfaceLock(kernel->ioInputs[idx], 0, NULL);
|
||||
memcpy(IOSurfaceGetBaseAddress(kernel->ioInputs[idx]), data, bytes);
|
||||
IOSurfaceUnlock(kernel->ioInputs[idx], 0, NULL);
|
||||
}
|
||||
|
||||
void ane_bridge_read_output(ANEKernelHandle *kernel, int idx,
|
||||
void *data, size_t bytes) {
|
||||
if (!kernel || idx < 0 || idx >= kernel->nOutputs) return;
|
||||
IOSurfaceLock(kernel->ioOutputs[idx], kIOSurfaceLockReadOnly, NULL);
|
||||
memcpy(data, IOSurfaceGetBaseAddress(kernel->ioOutputs[idx]), bytes);
|
||||
IOSurfaceUnlock(kernel->ioOutputs[idx], kIOSurfaceLockReadOnly, NULL);
|
||||
}
|
||||
|
||||
void ane_bridge_free(ANEKernelHandle *kernel) {
|
||||
@autoreleasepool {
|
||||
if (!kernel) return;
|
||||
NSError *e = nil;
|
||||
if (kernel->model) {
|
||||
((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(
|
||||
kernel->model, @selector(unloadWithQoS:error:), 21, &e);
|
||||
}
|
||||
for (int i = 0; i < kernel->nInputs; i++)
|
||||
if (kernel->ioInputs[i]) CFRelease(kernel->ioInputs[i]);
|
||||
for (int i = 0; i < kernel->nOutputs; i++)
|
||||
if (kernel->ioOutputs[i]) CFRelease(kernel->ioOutputs[i]);
|
||||
if (kernel->tmpDir) {
|
||||
[[NSFileManager defaultManager] removeItemAtPath:kernel->tmpDir error:nil];
|
||||
}
|
||||
free(kernel->ioInputs);
|
||||
free(kernel->ioOutputs);
|
||||
free(kernel->inputBytes);
|
||||
free(kernel->outputBytes);
|
||||
|
||||
// Explicitly nil Objective-C objects to trigger ARC release before freeing struct
|
||||
kernel->model = nil;
|
||||
kernel->request = nil;
|
||||
kernel->tmpDir = nil;
|
||||
|
||||
free(kernel);
|
||||
}
|
||||
}
|
||||
|
||||
int ane_bridge_get_compile_count(void) {
|
||||
return g_compile_count;
|
||||
}
|
||||
|
||||
void ane_bridge_reset_compile_count(void) {
|
||||
g_compile_count = 0;
|
||||
}
|
||||
|
||||
uint8_t *ane_bridge_build_weight_blob(const float *src, int rows, int cols,
|
||||
size_t *out_len) {
|
||||
int wsize = rows * cols * 2; // fp16
|
||||
int total = 128 + wsize;
|
||||
uint8_t *buf = (uint8_t *)calloc(total, 1);
|
||||
|
||||
// ANE blob header
|
||||
buf[0] = 0x01; buf[4] = 0x02;
|
||||
buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE;
|
||||
buf[68] = 0x01;
|
||||
*(uint32_t*)(buf + 72) = wsize;
|
||||
*(uint32_t*)(buf + 80) = 128;
|
||||
|
||||
// Convert float32 -> float16
|
||||
_Float16 *fp16 = (_Float16 *)(buf + 128);
|
||||
for (int i = 0; i < rows * cols; i++) {
|
||||
fp16[i] = (_Float16)src[i];
|
||||
}
|
||||
|
||||
*out_len = total;
|
||||
return buf;
|
||||
}
|
||||
|
||||
uint8_t *ane_bridge_build_weight_blob_transposed(const float *src, int rows, int cols,
|
||||
size_t *out_len) {
|
||||
int wsize = rows * cols * 2;
|
||||
int total = 128 + wsize;
|
||||
uint8_t *buf = (uint8_t *)calloc(total, 1);
|
||||
|
||||
buf[0] = 0x01; buf[4] = 0x02;
|
||||
buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE;
|
||||
buf[68] = 0x01;
|
||||
*(uint32_t*)(buf + 72) = wsize;
|
||||
*(uint32_t*)(buf + 80) = 128;
|
||||
|
||||
_Float16 *fp16 = (_Float16 *)(buf + 128);
|
||||
for (int i = 0; i < rows; i++)
|
||||
for (int j = 0; j < cols; j++)
|
||||
fp16[j * rows + i] = (_Float16)src[i * cols + j];
|
||||
|
||||
*out_len = total;
|
||||
return buf;
|
||||
}
|
||||
// ane_bridge.m — Objective-C implementation of ANE bridge for Python ctypes
|
||||
// Wraps _ANEInMemoryModel private APIs into C-callable functions
|
||||
//
|
||||
// Two modes: BLOBFILE (upstream compatible) and dynamic IOSurface (our approach).
|
||||
// See ane_bridge.h for full API documentation.
|
||||
|
||||
#import <Foundation/Foundation.h>
|
||||
#import <objc/runtime.h>
|
||||
#import <objc/message.h>
|
||||
#import <dlfcn.h>
|
||||
#import <IOSurface/IOSurface.h>
|
||||
#include <arm_neon.h>
|
||||
#include "ane_bridge.h"
|
||||
|
||||
// --- Private class references ---
|
||||
static Class g_ANEDesc = nil;
|
||||
static Class g_ANEInMem = nil;
|
||||
static Class g_ANEReq = nil;
|
||||
static Class g_ANEIO = nil;
|
||||
static bool g_initialized = false;
|
||||
static int g_compile_count = 0;
|
||||
|
||||
// _ANEClient for beginRealTimeTask — retrieved from first loaded model
|
||||
static id g_rt_client = nil;
|
||||
|
||||
// --- Kernel handle ---
|
||||
struct ANEKernelHandle {
|
||||
id model; // _ANEInMemoryModel
|
||||
IOSurfaceRef *ioInputs; // activation input surfaces
|
||||
IOSurfaceRef *ioOutputs; // output surfaces
|
||||
IOSurfaceRef *ioWeights; // dynamic weight surfaces (NULL for BLOBFILE mode)
|
||||
id request; // _ANERequest
|
||||
NSString *tmpDir;
|
||||
int nInputs, nOutputs, nWeights;
|
||||
size_t *inputBytes;
|
||||
size_t *outputBytes;
|
||||
size_t *weightBytes;
|
||||
};
|
||||
|
||||
// --- Helpers ---
|
||||
|
||||
static IOSurfaceRef make_surface(size_t bytes) {
|
||||
return IOSurfaceCreate((__bridge CFDictionaryRef)@{
|
||||
(id)kIOSurfaceWidth: @(bytes),
|
||||
(id)kIOSurfaceHeight: @1,
|
||||
(id)kIOSurfaceBytesPerElement:@1,
|
||||
(id)kIOSurfaceBytesPerRow: @(bytes),
|
||||
(id)kIOSurfaceAllocSize: @(bytes),
|
||||
(id)kIOSurfacePixelFormat: @0
|
||||
});
|
||||
}
|
||||
|
||||
static id wrap_surface(IOSurfaceRef s) {
|
||||
return ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(
|
||||
g_ANEIO, @selector(objectWithIOSurface:), s);
|
||||
}
|
||||
|
||||
// Compile cache: ~/.ane_cache/<hexId>/
|
||||
// Saves ~3100ms on cache hit (700ms vs 3800ms for 74 kernels).
|
||||
static BOOL try_cache_restore(id mdl, NSString *td, NSFileManager *fm) {
|
||||
NSString *hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
|
||||
NSString *cacheDir = [NSHomeDirectory() stringByAppendingPathComponent:
|
||||
[@".ane_cache/" stringByAppendingString:hx]];
|
||||
NSString *cachedPlist = [cacheDir stringByAppendingPathComponent:@"net.plist"];
|
||||
if (![fm fileExistsAtPath:cachedPlist]) return NO;
|
||||
[fm copyItemAtPath:cachedPlist toPath:[td stringByAppendingPathComponent:@"net.plist"] error:nil];
|
||||
// BLOBFILE models also produce a `data` file; dynamic-weight models do not
|
||||
NSString *cachedData = [cacheDir stringByAppendingPathComponent:@"data"];
|
||||
if ([fm fileExistsAtPath:cachedData])
|
||||
[fm copyItemAtPath:cachedData toPath:[td stringByAppendingPathComponent:@"data"] error:nil];
|
||||
return YES;
|
||||
}
|
||||
|
||||
static void save_to_cache(id mdl, NSString *td, NSFileManager *fm) {
|
||||
NSString *hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
|
||||
NSString *cacheDir = [NSHomeDirectory() stringByAppendingPathComponent:
|
||||
[@".ane_cache/" stringByAppendingString:hx]];
|
||||
[fm createDirectoryAtPath:cacheDir withIntermediateDirectories:YES attributes:nil error:nil];
|
||||
[fm copyItemAtPath:[td stringByAppendingPathComponent:@"net.plist"]
|
||||
toPath:[cacheDir stringByAppendingPathComponent:@"net.plist"] error:nil];
|
||||
// Copy data only if present (BLOBFILE models)
|
||||
NSString *tdData = [td stringByAppendingPathComponent:@"data"];
|
||||
if ([fm fileExistsAtPath:tdData])
|
||||
[fm copyItemAtPath:tdData toPath:[cacheDir stringByAppendingPathComponent:@"data"] error:nil];
|
||||
}
|
||||
|
||||
static BOOL compile_and_load(id mdl, NSString *td, NSFileManager *fm) {
|
||||
NSError *e = nil;
|
||||
BOOL fromCache = try_cache_restore(mdl, td, fm);
|
||||
|
||||
if (!fromCache) {
|
||||
if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
|
||||
mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
|
||||
fprintf(stderr, "ane_bridge: compile failed: %s\n",
|
||||
e ? [[e description] UTF8String] : "unknown");
|
||||
return NO;
|
||||
}
|
||||
save_to_cache(mdl, td, fm);
|
||||
g_compile_count++;
|
||||
}
|
||||
|
||||
e = nil;
|
||||
BOOL loaded = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
|
||||
mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
|
||||
if (!loaded) {
|
||||
usleep(100000);
|
||||
e = nil;
|
||||
loaded = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
|
||||
mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
|
||||
}
|
||||
if (!loaded) {
|
||||
fprintf(stderr, "ane_bridge: load failed: %s\n",
|
||||
e ? [[e description] UTF8String] : "unknown");
|
||||
return NO;
|
||||
}
|
||||
|
||||
// Cache _ANEClient for real-time task API
|
||||
if (!g_rt_client) {
|
||||
Ivar iv = class_getInstanceVariable(object_getClass(mdl), "_sharedConnection");
|
||||
if (iv) g_rt_client = object_getIvar(mdl, iv);
|
||||
}
|
||||
|
||||
return YES;
|
||||
}
|
||||
|
||||
// Build _ANERequest from arrays of input and output surfaces
|
||||
static id build_request(IOSurfaceRef *inputs, int nIn,
|
||||
IOSurfaceRef *outputs, int nOut) {
|
||||
NSMutableArray *wIns = [NSMutableArray arrayWithCapacity:nIn];
|
||||
NSMutableArray *iIdx = [NSMutableArray arrayWithCapacity:nIn];
|
||||
for (int i = 0; i < nIn; i++) {
|
||||
[wIns addObject:wrap_surface(inputs[i])];
|
||||
[iIdx addObject:@(i)];
|
||||
}
|
||||
NSMutableArray *wOuts = [NSMutableArray arrayWithCapacity:nOut];
|
||||
NSMutableArray *oIdx = [NSMutableArray arrayWithCapacity:nOut];
|
||||
for (int i = 0; i < nOut; i++) {
|
||||
[wOuts addObject:wrap_surface(outputs[i])];
|
||||
[oIdx addObject:@(i)];
|
||||
}
|
||||
return ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(
|
||||
g_ANEReq,
|
||||
@selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
|
||||
wIns, iIdx, wOuts, oIdx, nil, nil, @0);
|
||||
}
|
||||
|
||||
// NEON fp32 → fp16 conversion
|
||||
static void cvt_f32_f16(const float *src, uint16_t *dst, size_t count) {
|
||||
size_t i = 0;
|
||||
for (; i + 4 <= count; i += 4) {
|
||||
float32x4_t f = vld1q_f32(src + i);
|
||||
vst1_u16(dst + i, vreinterpret_u16_f16(vcvt_f16_f32(f)));
|
||||
}
|
||||
for (; i < count; i++)
|
||||
dst[i] = vreinterpret_u16_f16(vcvt_f16_f32(vdupq_n_f32(src[i])))[0];
|
||||
}
|
||||
|
||||
// --- Public API ---
|
||||
|
||||
int ane_bridge_init(void) {
|
||||
if (g_initialized) return 0;
|
||||
void *handle = dlopen(
|
||||
"/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine",
|
||||
RTLD_NOW);
|
||||
if (!handle) { fprintf(stderr, "ane_bridge: failed to load ANE framework\n"); return -1; }
|
||||
|
||||
g_ANEDesc = NSClassFromString(@"_ANEInMemoryModelDescriptor");
|
||||
g_ANEInMem = NSClassFromString(@"_ANEInMemoryModel");
|
||||
g_ANEReq = NSClassFromString(@"_ANERequest");
|
||||
g_ANEIO = NSClassFromString(@"_ANEIOSurfaceObject");
|
||||
|
||||
if (!g_ANEDesc || !g_ANEInMem || !g_ANEReq || !g_ANEIO) {
|
||||
fprintf(stderr, "ane_bridge: failed to resolve ANE private classes\n");
|
||||
return -1;
|
||||
}
|
||||
g_initialized = true;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// BLOBFILE compile (upstream compatible)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
ANEKernelHandle *ane_bridge_compile_multi_weights(
|
||||
const char *mil_text, size_t mil_len,
|
||||
const char **weight_names, const uint8_t **weight_datas,
|
||||
const size_t *weight_lens, int n_weights,
|
||||
int n_inputs, const size_t *input_sizes,
|
||||
int n_outputs, const size_t *output_sizes)
|
||||
{
|
||||
@autoreleasepool {
|
||||
if (!g_initialized) { fprintf(stderr, "ane_bridge: not initialized\n"); return NULL; }
|
||||
|
||||
NSData *milData = [NSData dataWithBytes:mil_text length:mil_len];
|
||||
NSMutableDictionary *wdict = [NSMutableDictionary dictionary];
|
||||
for (int i = 0; i < n_weights; i++) {
|
||||
NSString *name = [NSString stringWithUTF8String:weight_names[i]];
|
||||
NSData *data = [NSData dataWithBytes:weight_datas[i] length:weight_lens[i]];
|
||||
wdict[name] = @{@"offset": @0, @"data": data};
|
||||
}
|
||||
|
||||
id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(
|
||||
g_ANEDesc, @selector(modelWithMILText:weights:optionsPlist:),
|
||||
milData, wdict.count > 0 ? wdict : nil, nil);
|
||||
if (!desc) { fprintf(stderr, "ane_bridge: modelWithMILText failed\n"); return NULL; }
|
||||
|
||||
id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(
|
||||
g_ANEInMem, @selector(inMemoryModelWithDescriptor:), desc);
|
||||
|
||||
NSString *hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
|
||||
NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
|
||||
NSFileManager *fm = [NSFileManager defaultManager];
|
||||
[fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"]
|
||||
withIntermediateDirectories:YES attributes:nil error:nil];
|
||||
[milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
|
||||
for (int i = 0; i < n_weights; i++) {
|
||||
NSString *rel = [NSString stringWithUTF8String:weight_names[i]];
|
||||
if ([rel hasPrefix:@"@model_path/"]) rel = [rel substringFromIndex:12];
|
||||
NSString *full = [td stringByAppendingPathComponent:rel];
|
||||
[fm createDirectoryAtPath:[full stringByDeletingLastPathComponent]
|
||||
withIntermediateDirectories:YES attributes:nil error:nil];
|
||||
[[NSData dataWithBytes:weight_datas[i] length:weight_lens[i]] writeToFile:full atomically:YES];
|
||||
}
|
||||
|
||||
if (!compile_and_load(mdl, td, fm)) { [fm removeItemAtPath:td error:nil]; return NULL; }
|
||||
|
||||
ANEKernelHandle *k = (ANEKernelHandle *)calloc(1, sizeof(ANEKernelHandle));
|
||||
k->model = mdl;
|
||||
k->tmpDir = td;
|
||||
k->nInputs = n_inputs;
|
||||
k->nOutputs = n_outputs;
|
||||
k->nWeights = 0;
|
||||
k->ioWeights = NULL;
|
||||
k->weightBytes = NULL;
|
||||
k->inputBytes = (size_t *)malloc(n_inputs * sizeof(size_t));
|
||||
k->outputBytes = (size_t *)malloc(n_outputs * sizeof(size_t));
|
||||
memcpy(k->inputBytes, input_sizes, n_inputs * sizeof(size_t));
|
||||
memcpy(k->outputBytes, output_sizes, n_outputs * sizeof(size_t));
|
||||
|
||||
k->ioInputs = (IOSurfaceRef *)malloc(n_inputs * sizeof(IOSurfaceRef));
|
||||
k->ioOutputs = (IOSurfaceRef *)malloc(n_outputs * sizeof(IOSurfaceRef));
|
||||
for (int i = 0; i < n_inputs; i++) k->ioInputs[i] = make_surface(input_sizes[i]);
|
||||
for (int i = 0; i < n_outputs; i++) k->ioOutputs[i] = make_surface(output_sizes[i]);
|
||||
|
||||
k->request = build_request(k->ioInputs, n_inputs, k->ioOutputs, n_outputs);
|
||||
return k;
|
||||
}
|
||||
}
|
||||
|
||||
ANEKernelHandle *ane_bridge_compile(const char *mil_text, size_t mil_len,
|
||||
const uint8_t *weight_data, size_t weight_len,
|
||||
int n_inputs, const size_t *input_sizes,
|
||||
int n_outputs, const size_t *output_sizes) {
|
||||
if (weight_data && weight_len > 0) {
|
||||
const char *name = "@model_path/weights/weight.bin";
|
||||
return ane_bridge_compile_multi_weights(mil_text, mil_len,
|
||||
&name, &weight_data, &weight_len, 1,
|
||||
n_inputs, input_sizes, n_outputs, output_sizes);
|
||||
}
|
||||
return ane_bridge_compile_multi_weights(mil_text, mil_len,
|
||||
NULL, NULL, NULL, 0,
|
||||
n_inputs, input_sizes, n_outputs, output_sizes);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Dynamic IOSurface compile (our approach — compile ONCE, write per Adam step)
|
||||
//
|
||||
// MIL program must declare weights as function parameters after activation inputs:
|
||||
// func main<ios18>(tensor<fp16,...> x0, tensor<fp16,...> w0, tensor<fp16,...> w1) { ... }
|
||||
//
|
||||
// The _ANERequest bundles them as: inputs=[x0, w0, w1, ...], outputs=[out]
|
||||
// Weight IOSurfaces persist between evals — update via ane_bridge_write_weight().
|
||||
// ---------------------------------------------------------------------------
|
||||
ANEKernelHandle *ane_bridge_compile_dyn(
|
||||
const char *mil_text, size_t mil_len,
|
||||
int n_inputs, const size_t *input_sizes,
|
||||
int n_weights, const size_t *weight_sizes,
|
||||
size_t output_size)
|
||||
{
|
||||
@autoreleasepool {
|
||||
if (!g_initialized) { fprintf(stderr, "ane_bridge: not initialized\n"); return NULL; }
|
||||
|
||||
NSData *milData = [NSData dataWithBytes:mil_text length:mil_len];
|
||||
|
||||
// Dynamic weights: pass empty weight dict (no BLOBFILE)
|
||||
id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(
|
||||
g_ANEDesc, @selector(modelWithMILText:weights:optionsPlist:),
|
||||
milData, @{}, nil);
|
||||
if (!desc) { fprintf(stderr, "ane_bridge: modelWithMILText failed\n"); return NULL; }
|
||||
|
||||
id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(
|
||||
g_ANEInMem, @selector(inMemoryModelWithDescriptor:), desc);
|
||||
|
||||
NSString *hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
|
||||
NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
|
||||
NSFileManager *fm = [NSFileManager defaultManager];
|
||||
[fm createDirectoryAtPath:td withIntermediateDirectories:YES attributes:nil error:nil];
|
||||
[milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
|
||||
|
||||
if (!compile_and_load(mdl, td, fm)) { [fm removeItemAtPath:td error:nil]; return NULL; }
|
||||
|
||||
// Allocate kernel handle
|
||||
ANEKernelHandle *k = (ANEKernelHandle *)calloc(1, sizeof(ANEKernelHandle));
|
||||
k->model = mdl;
|
||||
k->tmpDir = td;
|
||||
k->nInputs = n_inputs;
|
||||
k->nOutputs = 1;
|
||||
k->nWeights = n_weights;
|
||||
|
||||
k->inputBytes = (size_t *)malloc(n_inputs * sizeof(size_t));
|
||||
k->outputBytes = (size_t *)malloc(1 * sizeof(size_t));
|
||||
k->weightBytes = (size_t *)malloc(n_weights * sizeof(size_t));
|
||||
memcpy(k->inputBytes, input_sizes, n_inputs * sizeof(size_t));
|
||||
memcpy(k->weightBytes, weight_sizes, n_weights * sizeof(size_t));
|
||||
k->outputBytes[0] = output_size;
|
||||
|
||||
// Create IOSurfaces for activations, weights, and output
|
||||
k->ioInputs = (IOSurfaceRef *)malloc(n_inputs * sizeof(IOSurfaceRef));
|
||||
k->ioOutputs = (IOSurfaceRef *)malloc(1 * sizeof(IOSurfaceRef));
|
||||
k->ioWeights = (IOSurfaceRef *)malloc(n_weights * sizeof(IOSurfaceRef));
|
||||
for (int i = 0; i < n_inputs; i++) k->ioInputs[i] = make_surface(input_sizes[i]);
|
||||
for (int i = 0; i < n_weights; i++) k->ioWeights[i] = make_surface(weight_sizes[i]);
|
||||
k->ioOutputs[0] = make_surface(output_size);
|
||||
|
||||
// Build request: inputs = [x0, x1, ..., w0, w1, ...], outputs = [out]
|
||||
// Weights follow activation inputs at higher indices — matches MIL param order.
|
||||
int total_inputs = n_inputs + n_weights;
|
||||
NSMutableArray *allInputs = [NSMutableArray arrayWithCapacity:total_inputs];
|
||||
NSMutableArray *allIdx = [NSMutableArray arrayWithCapacity:total_inputs];
|
||||
for (int i = 0; i < n_inputs; i++) { [allInputs addObject:wrap_surface(k->ioInputs[i])]; [allIdx addObject:@(i)]; }
|
||||
for (int i = 0; i < n_weights; i++) { [allInputs addObject:wrap_surface(k->ioWeights[i])]; [allIdx addObject:@(n_inputs + i)]; }
|
||||
|
||||
id wOut = wrap_surface(k->ioOutputs[0]);
|
||||
k->request = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(
|
||||
g_ANEReq,
|
||||
@selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
|
||||
allInputs, allIdx, @[wOut], @[@0], nil, nil, @0);
|
||||
|
||||
return k;
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Eval and I/O
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
bool ane_bridge_eval(ANEKernelHandle *kernel) {
|
||||
@autoreleasepool {
|
||||
if (!kernel || !kernel->model) return false;
|
||||
NSError *e = nil;
|
||||
return ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
|
||||
kernel->model, @selector(evaluateWithQoS:options:request:error:),
|
||||
21, @{}, kernel->request, &e);
|
||||
}
|
||||
}
|
||||
|
||||
void ane_bridge_write_input(ANEKernelHandle *kernel, int idx,
|
||||
const void *data, size_t bytes) {
|
||||
if (!kernel || idx < 0 || idx >= kernel->nInputs) return;
|
||||
IOSurfaceLock(kernel->ioInputs[idx], 0, NULL);
|
||||
memcpy(IOSurfaceGetBaseAddress(kernel->ioInputs[idx]), data, bytes);
|
||||
IOSurfaceUnlock(kernel->ioInputs[idx], 0, NULL);
|
||||
}
|
||||
|
||||
void ane_bridge_read_output(ANEKernelHandle *kernel, int idx,
|
||||
void *data, size_t bytes) {
|
||||
if (!kernel || idx < 0 || idx >= kernel->nOutputs) return;
|
||||
IOSurfaceLock(kernel->ioOutputs[idx], kIOSurfaceLockReadOnly, NULL);
|
||||
memcpy(data, IOSurfaceGetBaseAddress(kernel->ioOutputs[idx]), bytes);
|
||||
IOSurfaceUnlock(kernel->ioOutputs[idx], kIOSurfaceLockReadOnly, NULL);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Dynamic weight I/O
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
void ane_bridge_write_weight(ANEKernelHandle *kernel, int idx,
|
||||
const void *fp16_data, size_t bytes) {
|
||||
if (!kernel || !kernel->ioWeights || idx < 0 || idx >= kernel->nWeights) return;
|
||||
IOSurfaceLock(kernel->ioWeights[idx], 0, NULL);
|
||||
memcpy(IOSurfaceGetBaseAddress(kernel->ioWeights[idx]), fp16_data, bytes);
|
||||
IOSurfaceUnlock(kernel->ioWeights[idx], 0, NULL);
|
||||
}
|
||||
|
||||
void ane_bridge_write_weight_f32(ANEKernelHandle *kernel, int idx,
|
||||
const float *fp32_data, size_t count) {
|
||||
if (!kernel || !kernel->ioWeights || idx < 0 || idx >= kernel->nWeights) return;
|
||||
IOSurfaceLock(kernel->ioWeights[idx], 0, NULL);
|
||||
cvt_f32_f16(fp32_data, (uint16_t *)IOSurfaceGetBaseAddress(kernel->ioWeights[idx]), count);
|
||||
IOSurfaceUnlock(kernel->ioWeights[idx], 0, NULL);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Direct IOSurface copy (no CPU round-trip between chained kernels)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
void ane_bridge_copy_io(ANEKernelHandle *src, int src_out_idx,
|
||||
ANEKernelHandle *dst, int dst_in_idx) {
|
||||
if (!src || !dst) return;
|
||||
if (src_out_idx < 0 || src_out_idx >= src->nOutputs) return;
|
||||
if (dst_in_idx < 0 || dst_in_idx >= dst->nInputs) return;
|
||||
|
||||
IOSurfaceRef srf = src->ioOutputs[src_out_idx];
|
||||
IOSurfaceRef drf = dst->ioInputs[dst_in_idx];
|
||||
size_t bytes = IOSurfaceGetAllocSize(srf);
|
||||
|
||||
IOSurfaceLock(srf, kIOSurfaceLockReadOnly, NULL);
|
||||
IOSurfaceLock(drf, 0, NULL);
|
||||
memcpy(IOSurfaceGetBaseAddress(drf), IOSurfaceGetBaseAddress(srf), bytes);
|
||||
IOSurfaceUnlock(drf, 0, NULL);
|
||||
IOSurfaceUnlock(srf, kIOSurfaceLockReadOnly, NULL);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Real-time task — 90.6% p99 jitter reduction
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
void ane_bridge_begin_realtime(void) {
|
||||
if (!g_rt_client) return;
|
||||
((void(*)(id,SEL))objc_msgSend)(g_rt_client, @selector(beginRealTimeTask));
|
||||
}
|
||||
|
||||
void ane_bridge_end_realtime(void) {
|
||||
if (!g_rt_client) return;
|
||||
((void(*)(id,SEL))objc_msgSend)(g_rt_client, @selector(endRealTimeTask));
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Lifecycle
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
void ane_bridge_free(ANEKernelHandle *kernel) {
|
||||
@autoreleasepool {
|
||||
if (!kernel) return;
|
||||
NSError *e = nil;
|
||||
if (kernel->model) {
|
||||
((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(
|
||||
kernel->model, @selector(unloadWithQoS:error:), 21, &e);
|
||||
}
|
||||
for (int i = 0; i < kernel->nInputs; i++) if (kernel->ioInputs[i]) CFRelease(kernel->ioInputs[i]);
|
||||
for (int i = 0; i < kernel->nOutputs; i++) if (kernel->ioOutputs[i]) CFRelease(kernel->ioOutputs[i]);
|
||||
for (int i = 0; i < kernel->nWeights; i++) if (kernel->ioWeights[i]) CFRelease(kernel->ioWeights[i]);
|
||||
if (kernel->tmpDir) [[NSFileManager defaultManager] removeItemAtPath:kernel->tmpDir error:nil];
|
||||
|
||||
free(kernel->ioInputs);
|
||||
free(kernel->ioOutputs);
|
||||
free(kernel->ioWeights);
|
||||
free(kernel->inputBytes);
|
||||
free(kernel->outputBytes);
|
||||
free(kernel->weightBytes);
|
||||
|
||||
kernel->model = nil;
|
||||
kernel->request = nil;
|
||||
kernel->tmpDir = nil;
|
||||
free(kernel);
|
||||
}
|
||||
}
|
||||
|
||||
int ane_bridge_get_compile_count(void) { return g_compile_count; }
|
||||
void ane_bridge_reset_compile_count(void) { g_compile_count = 0; }
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Weight blob helpers (BLOBFILE mode)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
uint8_t *ane_bridge_build_weight_blob(const float *src, int rows, int cols,
|
||||
size_t *out_len) {
|
||||
int wsize = rows * cols * 2;
|
||||
int total = 128 + wsize;
|
||||
uint8_t *buf = (uint8_t *)calloc(total, 1);
|
||||
buf[0] = 0x01; buf[4] = 0x02;
|
||||
buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE;
|
||||
buf[68] = 0x01;
|
||||
*(uint32_t*)(buf + 72) = wsize;
|
||||
*(uint32_t*)(buf + 80) = 128;
|
||||
cvt_f32_f16(src, (uint16_t *)(buf + 128), rows * cols);
|
||||
*out_len = total;
|
||||
return buf;
|
||||
}
|
||||
|
||||
uint8_t *ane_bridge_build_weight_blob_transposed(const float *src, int rows, int cols,
|
||||
size_t *out_len) {
|
||||
int wsize = rows * cols * 2;
|
||||
int total = 128 + wsize;
|
||||
uint8_t *buf = (uint8_t *)calloc(total, 1);
|
||||
buf[0] = 0x01; buf[4] = 0x02;
|
||||
buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE;
|
||||
buf[68] = 0x01;
|
||||
*(uint32_t*)(buf + 72) = wsize;
|
||||
*(uint32_t*)(buf + 80) = 128;
|
||||
uint16_t *fp16 = (uint16_t *)(buf + 128);
|
||||
for (int i = 0; i < rows; i++)
|
||||
for (int j = 0; j < cols; j++) {
|
||||
float32x4_t f = vdupq_n_f32(src[i * cols + j]);
|
||||
fp16[j * rows + i] = vreinterpret_u16_f16(vcvt_f16_f32(f))[0];
|
||||
}
|
||||
*out_len = total;
|
||||
return buf;
|
||||
}
|
||||
|
||||
void ane_bridge_free_blob(void *ptr) { free(ptr); }
|
||||
|
|
|
|||
Binary file not shown.
|
|
@ -0,0 +1,305 @@
|
|||
// test_bridge.m — functional tests for ane_bridge API
|
||||
// Tests both BLOBFILE (upstream compat) and dynamic IOSurface (our approach)
|
||||
|
||||
#import <Foundation/Foundation.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <mach/mach_time.h>
|
||||
#include "ane_bridge.h"
|
||||
|
||||
static mach_timebase_info_data_t g_tb;
|
||||
static double ms(void) {
|
||||
return (double)mach_absolute_time() * g_tb.numer / g_tb.denom / 1e6;
|
||||
}
|
||||
|
||||
static int passed = 0, failed = 0;
|
||||
#define PASS(msg) do { printf(" [PASS] %s\n", msg); passed++; } while(0)
|
||||
#define FAIL(msg) do { printf(" [FAIL] %s\n", msg); failed++; } while(0)
|
||||
#define CHECK(cond, msg) do { if (cond) PASS(msg); else FAIL(msg); } while(0)
|
||||
|
||||
// Correct MIL header — must match exactly what ANE compiler expects
|
||||
#define MIL_HDR \
|
||||
"program(1.3)\n" \
|
||||
"[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, " \
|
||||
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " \
|
||||
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
|
||||
|
||||
// Dynamic weight matmul: y[1,N,M] = W[1,N,K] @ x[1,K,M]
|
||||
// x declared as [1,K,1,M] (4D), W as [1,N,K] (3D dynamic weight)
|
||||
static const char *mil_dyn_matmul(int N, int K, int M) {
|
||||
static char buf[2048];
|
||||
snprintf(buf, sizeof(buf),
|
||||
MIL_HDR
|
||||
" func main<ios18>(\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> x,\n"
|
||||
" tensor<fp16, [1, %d, %d]> W) {\n"
|
||||
" tensor<int32, [3]> sh = const()[name=string(\"sh\"), val=tensor<int32, [3]>([1,%d,%d])];\n"
|
||||
" tensor<fp16, [1,%d,%d]> x3 = reshape(shape=sh,x=x)[name=string(\"rx\")];\n"
|
||||
" bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"
|
||||
" tensor<fp16, [1,%d,%d]> out = matmul(transpose_x=bF,transpose_y=bF,x=W,y=x3)[name=string(\"mm\")];\n"
|
||||
" } -> (out);\n"
|
||||
"}\n",
|
||||
K, M, // x: [1,K,1,M]
|
||||
N, K, // W: [1,N,K]
|
||||
K, M, // reshape sh
|
||||
K, M, // x3
|
||||
N, M // out
|
||||
);
|
||||
return buf;
|
||||
}
|
||||
|
||||
// No-weight MIL: elementwise add x+x = 2x
|
||||
static const char *mil_scale2(int rows, int cols) {
|
||||
static char buf[1024];
|
||||
snprintf(buf, sizeof(buf),
|
||||
MIL_HDR
|
||||
" func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
|
||||
" tensor<fp16, [1,%d,1,%d]> out = add(x=x,y=x)[name=string(\"out\")];\n"
|
||||
" } -> (out);\n"
|
||||
"}\n",
|
||||
rows, cols, rows, cols
|
||||
);
|
||||
return buf;
|
||||
}
|
||||
|
||||
static void fill_fp16(void *buf, float val, int count) {
|
||||
uint16_t *p = (uint16_t *)buf;
|
||||
_Float16 fval = (_Float16)val;
|
||||
uint16_t bits;
|
||||
memcpy(&bits, &fval, 2);
|
||||
for (int i = 0; i < count; i++) p[i] = bits;
|
||||
}
|
||||
|
||||
static float read_fp16_elem(const void *buf, int idx) {
|
||||
const uint16_t *p = (const uint16_t *)buf;
|
||||
_Float16 v;
|
||||
memcpy(&v, &p[idx], 2);
|
||||
return (float)v;
|
||||
}
|
||||
|
||||
// --- Test 1: init ---
|
||||
static void test_init(void) {
|
||||
printf("\n[1] ane_bridge_init\n");
|
||||
int r = ane_bridge_init();
|
||||
CHECK(r == 0, "init returns 0");
|
||||
}
|
||||
|
||||
// --- Test 2: compile_dyn — compile once with dynamic weight ---
|
||||
static ANEKernelHandle *g_dyn_kern = NULL;
|
||||
static void test_compile_dyn(void) {
|
||||
printf("\n[2] ane_bridge_compile_dyn (dynamic weight IOSurface)\n");
|
||||
|
||||
int N=64, K=64, M=64;
|
||||
const char *mil = mil_dyn_matmul(N, K, M);
|
||||
size_t mil_len = strlen(mil);
|
||||
|
||||
// x: [1,K,1,M] fp16, W: [1,N,K] fp16, out: [1,N,M] fp16
|
||||
size_t in_sz = (size_t)K * M * 2;
|
||||
size_t w_sz = (size_t)N * K * 2;
|
||||
size_t out_sz = (size_t)N * M * 2;
|
||||
|
||||
double t0 = ms();
|
||||
g_dyn_kern = ane_bridge_compile_dyn(mil, mil_len, 1, &in_sz, 1, &w_sz, out_sz);
|
||||
double elapsed = ms() - t0;
|
||||
|
||||
CHECK(g_dyn_kern != NULL, "compile_dyn returns non-NULL handle");
|
||||
printf(" Compile time: %.1fms\n", elapsed);
|
||||
}
|
||||
|
||||
// --- Test 3: write_weight + eval — dynamic weights actually update ---
|
||||
static void test_dynamic_weight_update(void) {
|
||||
printf("\n[3] Dynamic weight update (write W, eval, check output changes)\n");
|
||||
if (!g_dyn_kern) { FAIL("no kernel from test 2"); return; }
|
||||
|
||||
int N=64, K=64, M=64;
|
||||
size_t in_sz = (size_t)K * M * 2;
|
||||
size_t w_sz = (size_t)N * K * 2;
|
||||
size_t out_sz = (size_t)N * M * 2;
|
||||
|
||||
uint16_t *xbuf = (uint16_t *)malloc(in_sz);
|
||||
uint16_t *wbuf_A = (uint16_t *)calloc(N*K, 2);
|
||||
uint16_t *wbuf_B = (uint16_t *)calloc(N*K, 2);
|
||||
uint16_t *out = (uint16_t *)malloc(out_sz);
|
||||
|
||||
// x = all 1.0
|
||||
fill_fp16(xbuf, 1.0f, K*M);
|
||||
|
||||
// W_A = identity (scale=1): diagonal 1.0
|
||||
_Float16 one = 1.0f; uint16_t one_bits; memcpy(&one_bits, &one, 2);
|
||||
_Float16 two = 2.0f; uint16_t two_bits; memcpy(&two_bits, &two, 2);
|
||||
for (int i = 0; i < N && i < K; i++) wbuf_A[i*K + i] = one_bits;
|
||||
for (int i = 0; i < N && i < K; i++) wbuf_B[i*K + i] = two_bits;
|
||||
|
||||
// Eval A: W = identity
|
||||
ane_bridge_write_input(g_dyn_kern, 0, xbuf, in_sz);
|
||||
ane_bridge_write_weight(g_dyn_kern, 0, wbuf_A, w_sz);
|
||||
bool ok = ane_bridge_eval(g_dyn_kern);
|
||||
CHECK(ok, "eval A succeeds");
|
||||
ane_bridge_read_output(g_dyn_kern, 0, out, out_sz);
|
||||
float sum_A = 0;
|
||||
for (int i = 0; i < N*M; i++) sum_A += read_fp16_elem(out, i);
|
||||
|
||||
// Eval B: W = 2x identity, NO recompile
|
||||
double t0 = ms();
|
||||
ane_bridge_write_weight(g_dyn_kern, 0, wbuf_B, w_sz);
|
||||
double write_ms = ms() - t0;
|
||||
|
||||
t0 = ms();
|
||||
ok = ane_bridge_eval(g_dyn_kern);
|
||||
double eval_ms = ms() - t0;
|
||||
|
||||
CHECK(ok, "eval B succeeds after weight update");
|
||||
ane_bridge_read_output(g_dyn_kern, 0, out, out_sz);
|
||||
float sum_B = 0;
|
||||
for (int i = 0; i < N*M; i++) sum_B += read_fp16_elem(out, i);
|
||||
|
||||
printf(" sum(out) W=identity: %.1f W=2x: %.1f ratio: %.2f (expect ~2.0)\n",
|
||||
sum_A, sum_B, sum_B / (sum_A + 1e-9f));
|
||||
CHECK(fabsf(sum_B / (sum_A + 1e-9f) - 2.0f) < 0.1f, "output doubled after weight update");
|
||||
printf(" write_weight: %.3fms eval: %.2fms\n", write_ms, eval_ms);
|
||||
|
||||
free(xbuf); free(wbuf_A); free(wbuf_B); free(out);
|
||||
}
|
||||
|
||||
// --- Test 4: write_weight_f32 ---
|
||||
static void test_write_weight_f32(void) {
|
||||
printf("\n[4] ane_bridge_write_weight_f32 (fp32 -> fp16 conversion)\n");
|
||||
if (!g_dyn_kern) { FAIL("no kernel"); return; }
|
||||
|
||||
int N=64, K=64, M=64;
|
||||
float *w_fp32 = (float *)calloc(N*K, 4);
|
||||
uint16_t *out1 = (uint16_t *)malloc((size_t)N*M*2);
|
||||
uint16_t *out2 = (uint16_t *)malloc((size_t)N*M*2);
|
||||
uint16_t *xbuf = (uint16_t *)malloc((size_t)K*M*2);
|
||||
fill_fp16(xbuf, 1.0f, K*M);
|
||||
|
||||
// 3x identity in fp32
|
||||
for (int i = 0; i < N && i < K; i++) w_fp32[i*K + i] = 3.0f;
|
||||
ane_bridge_write_input(g_dyn_kern, 0, xbuf, (size_t)K*M*2);
|
||||
ane_bridge_write_weight_f32(g_dyn_kern, 0, w_fp32, (size_t)N*K);
|
||||
bool ok = ane_bridge_eval(g_dyn_kern);
|
||||
CHECK(ok, "eval with fp32-written weight succeeds");
|
||||
ane_bridge_read_output(g_dyn_kern, 0, out1, (size_t)N*M*2);
|
||||
float sum = 0;
|
||||
for (int i = 0; i < N*M; i++) sum += read_fp16_elem(out1, i);
|
||||
float expected = 3.0f * N; // 3x identity: each row sums x (all 1s) scaled by 3
|
||||
// Actually identity matmul with x=all-1s: row i of output = row i of W dotted with x
|
||||
// row i of identity (scaled 3) = 3 at position i, 0 elsewhere. So out[i][t] = 3*x[i][t] = 3
|
||||
printf(" sum(out) with 3x identity (fp32 write): %.1f\n", sum);
|
||||
CHECK(fabsf(sum - 3.0f * N * M) < N*M*0.1f, "fp32 weight write produces correct output");
|
||||
|
||||
free(w_fp32); free(out1); free(out2); free(xbuf);
|
||||
}
|
||||
|
||||
// --- Test 5: copy_io ---
|
||||
static void test_copy_io(void) {
|
||||
printf("\n[5] ane_bridge_copy_io (direct IOSurface-to-IOSurface)\n");
|
||||
|
||||
// Compile a simple no-weight kernel: scale x by 2
|
||||
int R=32, C=32;
|
||||
const char *mil = mil_scale2(R, C);
|
||||
size_t sz = (size_t)R * C * 2;
|
||||
|
||||
ANEKernelHandle *k1 = ane_bridge_compile_dyn(mil, strlen(mil), 1, &sz, 0, NULL, sz);
|
||||
ANEKernelHandle *k2 = ane_bridge_compile_dyn(mil, strlen(mil), 1, &sz, 0, NULL, sz);
|
||||
CHECK(k1 != NULL && k2 != NULL, "two scale2 kernels compiled");
|
||||
if (!k1 || !k2) { ane_bridge_free(k1); ane_bridge_free(k2); return; }
|
||||
|
||||
// k1 input = 1.0, k1 out = 2.0, copy to k2 in, k2 out = 4.0
|
||||
uint16_t *buf = (uint16_t *)malloc(sz);
|
||||
uint16_t *out = (uint16_t *)malloc(sz);
|
||||
fill_fp16(buf, 1.0f, R*C);
|
||||
|
||||
ane_bridge_write_input(k1, 0, buf, sz);
|
||||
ane_bridge_eval(k1);
|
||||
ane_bridge_copy_io(k1, 0, k2, 0); // k2 input = k1 output (should be 2.0)
|
||||
ane_bridge_eval(k2);
|
||||
ane_bridge_read_output(k2, 0, out, sz);
|
||||
|
||||
float val = read_fp16_elem(out, 0);
|
||||
printf(" input=1.0 -> k1(x2) -> copy -> k2(x2) -> output=%.1f (expect 4.0)\n", val);
|
||||
CHECK(fabsf(val - 4.0f) < 0.2f, "copy_io chains two kernels without CPU round-trip");
|
||||
|
||||
free(buf); free(out);
|
||||
ane_bridge_free(k1);
|
||||
ane_bridge_free(k2);
|
||||
}
|
||||
|
||||
// --- Test 6: begin/end realtime ---
|
||||
static void test_realtime(void) {
|
||||
printf("\n[6] ane_bridge_begin/end_realtime\n");
|
||||
if (!g_dyn_kern) { FAIL("no kernel"); return; }
|
||||
|
||||
// Just verify they don't crash. Jitter improvement only visible over many samples.
|
||||
ane_bridge_begin_realtime();
|
||||
bool ok = ane_bridge_eval(g_dyn_kern);
|
||||
ane_bridge_end_realtime();
|
||||
CHECK(ok, "eval inside realtime task succeeds");
|
||||
printf(" (p99 jitter improvement requires statistical measurement — see test_realtime_task.m)\n");
|
||||
}
|
||||
|
||||
// --- Test 7: compile cache ---
|
||||
static void test_compile_cache(void) {
|
||||
printf("\n[7] Compile cache (cache files written to ~/.ane_cache/)\n");
|
||||
|
||||
int N=32, M=32;
|
||||
const char *mil = mil_scale2(N, M);
|
||||
size_t sz = (size_t)N*M*2;
|
||||
|
||||
ANEKernelHandle *k = ane_bridge_compile_dyn(mil, strlen(mil), 1, &sz, 0, NULL, sz);
|
||||
CHECK(k != NULL, "kernel compiled for cache test");
|
||||
if (!k) return;
|
||||
|
||||
// Verify cache files exist on disk
|
||||
// We need the hex ID — compile a fresh model to get it, then check cache
|
||||
NSString *home = NSHomeDirectory();
|
||||
NSString *cacheRoot = [home stringByAppendingPathComponent:@".ane_cache"];
|
||||
NSFileManager *fm = [NSFileManager defaultManager];
|
||||
// Dynamic-weight models produce net.plist only (no data file)
|
||||
NSArray *entries = [fm contentsOfDirectoryAtPath:cacheRoot error:nil];
|
||||
BOOL found_plist = NO;
|
||||
for (NSString *entry in entries) {
|
||||
NSString *plistPath = [[cacheRoot stringByAppendingPathComponent:entry]
|
||||
stringByAppendingPathComponent:@"net.plist"];
|
||||
if ([fm fileExistsAtPath:plistPath]) { found_plist = YES; break; }
|
||||
}
|
||||
CHECK(found_plist, "compiled kernel cached to ~/.ane_cache/");
|
||||
|
||||
// Free k first, then recompile — should hit cache (load only, no compile)
|
||||
ane_bridge_free(k);
|
||||
int before = ane_bridge_get_compile_count();
|
||||
ANEKernelHandle *k2 = ane_bridge_compile_dyn(mil, strlen(mil), 1, &sz, 0, NULL, sz);
|
||||
int after = ane_bridge_get_compile_count();
|
||||
CHECK(k2 != NULL, "cache hit returns valid kernel");
|
||||
CHECK(after == before, "cache hit does not increment compile count");
|
||||
ane_bridge_free(k2);
|
||||
}
|
||||
|
||||
// --- Test 8: free ---
|
||||
static void test_free(void) {
|
||||
printf("\n[8] ane_bridge_free\n");
|
||||
ane_bridge_free(g_dyn_kern);
|
||||
g_dyn_kern = NULL;
|
||||
PASS("free does not crash");
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
@autoreleasepool {
|
||||
mach_timebase_info(&g_tb);
|
||||
printf("=== ane_bridge test suite ===\n");
|
||||
|
||||
test_init();
|
||||
test_compile_dyn();
|
||||
test_dynamic_weight_update();
|
||||
test_write_weight_f32();
|
||||
test_copy_io();
|
||||
test_realtime();
|
||||
test_compile_cache();
|
||||
test_free();
|
||||
|
||||
printf("\n=== Results: %d passed, %d failed ===\n", passed, failed);
|
||||
return failed > 0 ? 1 : 0;
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue