ANE/bridge/ane_bridge.h

150 lines
6.7 KiB
C

// ane_bridge.h — C-callable bridge to ANE private APIs for Python ctypes
// Wraps _ANEInMemoryModel via private AppleNeuralEngine.framework
//
// Two compilation modes:
//
// BLOBFILE (upstream compatible):
// ane_bridge_compile() / ane_bridge_compile_multi_weights()
// Weights compiled into MIL as constants. Requires recompile when weights
// change — hits ANE compile limit (~119), needs exec() restart per batch.
//
// Dynamic IOSurface (our approach):
// ane_bridge_compile_dyn()
// Weights declared as runtime tensor function parameters. Compile ONCE at
// startup, update weights via ane_bridge_write_weight() (0.002ms per call).
// No exec() restart, no compile limit during training.
//
// Extras (our additions):
// ane_bridge_begin/end_realtime() — 90.6% p99 jitter reduction
// ane_bridge_copy_io() — direct IOSurface-to-IOSurface copy, no CPU round-trip
// Compile cache — ~700ms vs ~3800ms on cache hit
#ifndef ANE_BRIDGE_H
#define ANE_BRIDGE_H
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
#endif
// Opaque kernel handle.
// Only forward-declared in this header, so callers can hold and pass pointers
// to it but cannot inspect its fields. Pointers are returned by the
// ane_bridge_compile*() functions and accepted by every kernel-taking function
// declared here.
typedef struct ANEKernelHandle ANEKernelHandle;
// Initialize the ANE runtime (load the private framework, resolve classes).
// Returns 0 on success, -1 on failure.
// NOTE(review): presumably must be called before any ane_bridge_compile*()
// call; whether repeated calls are safe is not stated here — confirm against
// the implementation.
int ane_bridge_init(void);
// ---------------------------------------------------------------------------
// BLOBFILE compile (upstream compatible)
// Weights compiled into the MIL program as constants.
// ---------------------------------------------------------------------------
// Compile a MIL program with a single weight blob.
//
// mil_text, mil_len:      MIL program text and its length
// weight_data, weight_len: the weight blob and its length; a blob in the
//                          expected format can be produced with
//                          ane_bridge_build_weight_blob()
// n_inputs, input_sizes:  number of input tensors and one size per input
// n_outputs, output_sizes: number of output tensors and one size per output
//
// Returns a kernel handle for use with the eval and I/O functions.
// NOTE(review): sizes are presumably in bytes (as documented for
// ane_bridge_compile_dyn) and the failure return is presumably NULL — neither
// is stated in this header; confirm against the implementation.
ANEKernelHandle *ane_bridge_compile(const char *mil_text, size_t mil_len,
const uint8_t *weight_data, size_t weight_len,
int n_inputs, const size_t *input_sizes,
int n_outputs, const size_t *output_sizes);
// Compile a MIL program with multiple named weight files.
//
// mil_text, mil_len:      MIL program text and its length
// weight_names:           one name per weight file
// weight_datas:           one data pointer per weight file
// weight_lens:            one length per weight file
// n_weights:              number of weight files
// n_inputs, input_sizes:  number of input tensors and one size per input
// n_outputs, output_sizes: number of output tensors and one size per output
//
// Returns a kernel handle for use with the eval and I/O functions.
// NOTE(review): weight_names / weight_datas / weight_lens are presumably
// parallel arrays of n_weights entries, sizes are presumably in bytes, and the
// failure return is presumably NULL — confirm against the implementation.
ANEKernelHandle *ane_bridge_compile_multi_weights(
const char *mil_text, size_t mil_len,
const char **weight_names, const uint8_t **weight_datas,
const size_t *weight_lens, int n_weights,
int n_inputs, const size_t *input_sizes,
int n_outputs, const size_t *output_sizes);
// ---------------------------------------------------------------------------
// Dynamic weight compile (our approach — compile once, update per Adam step)
// Weights declared as runtime tensor function parameters backed by IOSurfaces.
//
// mil_text:     MIL program text
// mil_len:      length of mil_text
// n_inputs:     number of activation input tensors
// input_sizes:  byte sizes (fp16) for each activation input
// n_weights:    number of dynamic weight tensors
// weight_sizes: byte sizes (fp16) for each weight IOSurface
// output_size:  byte size (fp16) of the single output tensor
//
// MIL function signature must match: func main<ios18>(x0, x1, ..., w0, w1, ...)
// where activation inputs come first, weight inputs follow.
//
// After compiling, weights are updated in place with ane_bridge_write_weight()
// or ane_bridge_write_weight_f32() instead of recompiling.
// NOTE(review): the failure return is presumably NULL — not stated in this
// header; confirm against the implementation.
// ---------------------------------------------------------------------------
ANEKernelHandle *ane_bridge_compile_dyn(
const char *mil_text, size_t mil_len,
int n_inputs, const size_t *input_sizes,
int n_weights, const size_t *weight_sizes,
size_t output_size);
// ---------------------------------------------------------------------------
// Eval and I/O
// ---------------------------------------------------------------------------
// Evaluate (run) a compiled kernel on the ANE.
// kernel: handle returned by one of the ane_bridge_compile*() functions.
// NOTE(review): the bool result presumably means true = success,
// false = failure — confirm against the implementation.
bool ane_bridge_eval(ANEKernelHandle *kernel);
// Write data to an activation input tensor (fp16 or raw bytes).
// kernel: kernel whose input is written
// idx:    index of the activation input
// data:   source buffer
// bytes:  size of the data to write, in bytes
// NOTE(review): idx is presumably in 0..n_inputs-1 and bytes presumably must
// not exceed the size given at compile time; this function returns void, so
// any error is not reported to the caller — confirm how bad arguments are
// handled.
void ane_bridge_write_input(ANEKernelHandle *kernel, int idx,
const void *data, size_t bytes);
// Read data from an output tensor (fp16 or raw bytes).
// kernel: kernel whose output is read
// idx:    index of the output tensor
// data:   destination buffer supplied by the caller
// bytes:  size of the data to read, in bytes
// NOTE(review): data presumably must have room for at least `bytes` bytes and
// bytes presumably must not exceed the output size given at compile time;
// this function returns void, so any error is not reported to the caller —
// confirm how bad arguments are handled.
void ane_bridge_read_output(ANEKernelHandle *kernel, int idx,
void *data, size_t bytes);
// ---------------------------------------------------------------------------
// Dynamic weight I/O (our approach)
// ---------------------------------------------------------------------------
// Write fp16 data directly to a weight IOSurface (~0.002ms per call).
// kernel:    kernel whose weight is updated
// idx:       weight index (0..n_weights-1)
// fp16_data: source buffer of fp16 values
// bytes:     size of fp16_data to write, in bytes
// NOTE(review): presumably only valid for kernels created with
// ane_bridge_compile_dyn(), which is the only compile call that takes
// weight_sizes — confirm behavior for BLOBFILE kernels.
void ane_bridge_write_weight(ANEKernelHandle *kernel, int idx,
const void *fp16_data, size_t bytes);
// Write fp32 data to a weight IOSurface with automatic fp32→fp16 conversion.
// kernel:    kernel whose weight is updated
// idx:       weight index
// fp32_data: source buffer of float values
// count:     number of float elements in fp32_data; the converted data
//            occupies count * 2 bytes of fp16 in the IOSurface
void ane_bridge_write_weight_f32(ANEKernelHandle *kernel, int idx,
const float *fp32_data, size_t count);
// ---------------------------------------------------------------------------
// Direct IOSurface copy — no CPU round-trip between chained kernels
// Copies src kernel's output[src_out_idx] → dst kernel's input[dst_in_idx]
// The transfer is a memcpy between IOSurface base addresses, so it is a single
// direct copy rather than literally zero-copy.
//
// src, src_out_idx: source kernel and the index of its output tensor
// dst, dst_in_idx:  destination kernel and the index of its input tensor
// NOTE(review): the byte count is not a parameter; it is presumably derived
// from the surfaces' sizes — confirm what happens when the two sizes differ.
// ---------------------------------------------------------------------------
void ane_bridge_copy_io(ANEKernelHandle *src, int src_out_idx,
ANEKernelHandle *dst, int dst_in_idx);
// ---------------------------------------------------------------------------
// Real-time task — 90.6% p99 jitter reduction
// Wrap a sequence of evals with begin/end to prevent ANE scheduler preemption:
// call ane_bridge_begin_realtime() before the evals and
// ane_bridge_end_realtime() after them.
// Measured: plain p99=35.2ms → with RT task p99=3.3ms
// Requires at least one kernel to have been compiled and loaded.
// NOTE(review): both calls return void, so a failure to enter or leave the
// real-time task is not reported to the caller; nesting behavior is not
// stated here — confirm against the implementation.
// ---------------------------------------------------------------------------
void ane_bridge_begin_realtime(void);
void ane_bridge_end_realtime(void);
// ---------------------------------------------------------------------------
// Lifecycle
// ---------------------------------------------------------------------------
// Free a kernel handle obtained from one of the ane_bridge_compile*() calls.
// NOTE(review): presumably releases the compiled model and its IOSurfaces, and
// the handle must not be used afterwards; whether NULL is accepted is not
// stated here — confirm against the implementation.
void ane_bridge_free(ANEKernelHandle *kernel);
// Compile count (useful for tracking the exec() restart budget in BLOBFILE
// mode, where recompiling on weight changes hits the ANE compile limit).
// ane_bridge_get_compile_count() returns the current count;
// ane_bridge_reset_compile_count() resets it.
// NOTE(review): which compile calls increment the counter, and whether reset
// sets it to 0, are not stated in this header — confirm against the
// implementation.
int ane_bridge_get_compile_count(void);
void ane_bridge_reset_compile_count(void);
// ---------------------------------------------------------------------------
// Weight blob helpers (BLOBFILE mode)
// Builds the 128-byte ANE blob header + fp16 weights for use with
// ane_bridge_compile / ane_bridge_compile_multi_weights.
//
// src:        fp32 source weights
// rows, cols: dimensions of the source matrix
// out_len:    output parameter for the blob length
//
// Returns a pointer to the blob.
// NOTE(review): src is presumably a row-major rows x cols matrix; the
// _transposed variant presumably stores the transpose of src; *out_len is
// presumably the total blob size in bytes (header + fp16 payload); the
// returned buffer is presumably owned by the caller and released with
// ane_bridge_free_blob(); the failure return is presumably NULL — none of this
// is stated in this header, confirm against the implementation.
// ---------------------------------------------------------------------------
uint8_t *ane_bridge_build_weight_blob(const float *src, int rows, int cols,
size_t *out_len);
uint8_t *ane_bridge_build_weight_blob_transposed(const float *src, int rows, int cols,
size_t *out_len);
void ane_bridge_free_blob(void *ptr);
#ifdef __cplusplus
}
#endif
#endif // ANE_BRIDGE_H