From 98ddd2d1901f451aa00850fc52b1622975a09684 Mon Sep 17 00:00:00 2001
From: fspecii <fspecii@users.noreply.github.com>
Date: Tue, 3 Mar 2026 15:00:51 +0200
Subject: [PATCH] =?UTF-8?q?bridge:=20add=20compile=5Fdyn=20+=20write=5Fwei?=
 =?UTF-8?q?ght=20=E2=80=94=20function=20parameter=20IOSurfaces?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a second dynamic weight approach to the bridge alongside the existing
BLOBFILE compile path. Instead of packing weights into the spatial dimension
of a single large input tensor and slicing them inside MIL (the training_dynamic/
approach), weights are declared as native MIL function parameters backed by
persistent IOSurfaces:

  // training_dynamic/ approach: spatial packing
  func main<ios18>(tensor<fp32, [1, DIM, 1, SEQ + 4*DIM]> x) {
      Wq = slice_by_size(x=x, begin=..., size=...);  // overhead
      ...

  // this PR: native function parameters
  func main<ios18>(tensor<fp16,[1,K,1,M]> x, tensor<fp16,[1,N,K]> W) { ... }

New API:
  ane_bridge_compile_dyn()      — compile with n_weights IOSurface parameters
  ane_bridge_write_weight()     — write fp16 to weight IOSurface (~0.001ms)
  ane_bridge_write_weight_f32() — write fp32 with NEON conversion
  ane_bridge_copy_io()          — direct output→input copy, no CPU round-trip
  ane_bridge_begin/end_realtime() — 90.6% p99 jitter reduction

Compile cache fix: ANE only writes net.plist for parameter-based models (no
data file). try_cache_restore now checks net.plist only; data is saved/restored
conditionally for BLOBFILE models that do produce it.

Also removes the pre-built libane_bridge.dylib binary from version control.

Performance vs spatial packing (Stories110M, 12 layers, M-series):
  training_dynamic/ (slice approach): 110ms/step
  function parameter approach:         76.9ms/step  (-30%)

The slice/reshape/transpose overhead per weight matrix explains the gap.
Both compile once at startup; weight updates are IOSurface writes in both cases.

Tested: test_bridge.m — 15/15 assertions across all new API functions.
---
 bridge/ane_bridge.h        | 236 +++++++----
 bridge/ane_bridge.m        | 829 ++++++++++++++++++++++---------------
 bridge/libane_bridge.dylib | Bin 54480 -> 0 bytes
 bridge/test_bridge.m       | 305 ++++++++++++++
 4 files changed, 955 insertions(+), 415 deletions(-)
 delete mode 100755 bridge/libane_bridge.dylib
 create mode 100644 bridge/test_bridge.m
diff --git a/bridge/ane_bridge.h b/bridge/ane_bridge.h
index 3e8ff47..04b7196 100644
--- a/bridge/ane_bridge.h
+++ b/bridge/ane_bridge.h
@@ -1,87 +1,149 @@
-// ane_bridge.h — C-callable bridge to ANE private APIs for Python ctypes
-// Wraps _ANEInMemoryModel via private AppleNeuralEngine.framework
-
-#ifndef ANE_BRIDGE_H
-#define ANE_BRIDGE_H
-
-#include <stddef.h>
-#include <stdint.h>
-#include <stdbool.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Opaque kernel handle
-typedef struct ANEKernelHandle ANEKernelHandle;
-
-// Initialize ANE runtime (load private framework, resolve classes)
-// Returns 0 on success, -1 on failure
-int ane_bridge_init(void);
-
-// Compile a MIL program with weight blobs into an ANE kernel
-// mil_text: UTF-8 MIL program text
-// mil_len: length of MIL text
-// weight_data: raw weight blob (can be NULL)
-// weight_len: length of weight blob
-// n_inputs: number of input tensors
-// input_sizes: array of byte sizes for each input
-// n_outputs: number of output tensors
-// output_sizes: array of byte sizes for each output
-// Returns kernel handle or NULL on failure
-ANEKernelHandle *ane_bridge_compile(const char *mil_text, size_t mil_len,
-                                     const uint8_t *weight_data, size_t weight_len,
-                                     int n_inputs, const size_t *input_sizes,
-                                     int n_outputs, const size_t *output_sizes);
-
-// Compile with multiple named weight files (for transformer kernels)
-// weight_names: array of weight file paths (e.g. "@model_path/weights/wq.bin")
-// weight_datas: array of weight data pointers
-// weight_lens: array of weight data lengths
-// n_weights: number of weight files
-ANEKernelHandle *ane_bridge_compile_multi_weights(
-    const char *mil_text, size_t mil_len,
-    const char **weight_names, const uint8_t **weight_datas,
-    const size_t *weight_lens, int n_weights,
-    int n_inputs, const size_t *input_sizes,
-    int n_outputs, const size_t *output_sizes);
-
-// Evaluate (run) a compiled kernel on ANE
-// Returns true on success
-bool ane_bridge_eval(ANEKernelHandle *kernel);
-
-// Write data to kernel input tensor
-void ane_bridge_write_input(ANEKernelHandle *kernel, int idx,
-                             const void *data, size_t bytes);
-
-// Read data from kernel output tensor
-void ane_bridge_read_output(ANEKernelHandle *kernel, int idx,
-                              void *data, size_t bytes);
-
-// Free a compiled kernel and all associated resources
-void ane_bridge_free(ANEKernelHandle *kernel);
-
-// Get compile count (for exec() restart budgeting)
-int ane_bridge_get_compile_count(void);
-
-// Reset compile count
-void ane_bridge_reset_compile_count(void);
-
-// Build a weight blob in ANE format (128-byte header + fp16 data)
-// src: float32 weights [rows x cols]
-// Returns allocated buffer and sets out_len. Caller must free().
-uint8_t *ane_bridge_build_weight_blob(const float *src, int rows, int cols,
-                                       size_t *out_len);
-
-// Build a transposed weight blob in ANE format
-uint8_t *ane_bridge_build_weight_blob_transposed(const float *src, int rows, int cols,
-                                                   size_t *out_len);
-
-// Free a blob allocated by ane_bridge_build_weight_blob*
-void ane_bridge_free_blob(void *ptr);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // ANE_BRIDGE_H
+// ane_bridge.h — C-callable bridge to ANE private APIs for Python ctypes
+// Wraps _ANEInMemoryModel via private AppleNeuralEngine.framework
+//
+// Two compilation modes:
+//
+//   BLOBFILE (upstream compatible):
+//     ane_bridge_compile() / ane_bridge_compile_multi_weights()
+//     Weights compiled into MIL as constants. Requires recompile when weights
+//     change — hits ANE compile limit (~119), needs exec() restart per batch.
+//
+//   Dynamic IOSurface (our approach):
+//     ane_bridge_compile_dyn()
+//     Weights declared as runtime tensor function parameters. Compile ONCE at
+//     startup, update weights via ane_bridge_write_weight() (0.002ms per call).
+//     No exec() restart, no compile limit during training.
+//
+// Extras (our additions):
+//   ane_bridge_begin/end_realtime() — 90.6% p99 jitter reduction
+//   ane_bridge_copy_io()            — direct IOSurface-to-IOSurface, no CPU
+//   Compile cache                   — ~700ms vs ~3800ms on cache hit
+
+#ifndef ANE_BRIDGE_H
+#define ANE_BRIDGE_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Opaque kernel handle
+typedef struct ANEKernelHandle ANEKernelHandle;
+
+// Initialize ANE runtime (load private framework, resolve classes)
+// Returns 0 on success, -1 on failure
+int ane_bridge_init(void);
+
+// ---------------------------------------------------------------------------
+// BLOBFILE compile (upstream compatible)
+// Weights compiled into the MIL program as constants.
+// ---------------------------------------------------------------------------
+
+// Compile a MIL program with a single weight blob
+ANEKernelHandle *ane_bridge_compile(const char *mil_text, size_t mil_len,
+                                     const uint8_t *weight_data, size_t weight_len,
+                                     int n_inputs, const size_t *input_sizes,
+                                     int n_outputs, const size_t *output_sizes);
+
+// Compile with multiple named weight files
+ANEKernelHandle *ane_bridge_compile_multi_weights(
+    const char *mil_text, size_t mil_len,
+    const char **weight_names, const uint8_t **weight_datas,
+    const size_t *weight_lens, int n_weights,
+    int n_inputs, const size_t *input_sizes,
+    int n_outputs, const size_t *output_sizes);
+
+// ---------------------------------------------------------------------------
+// Dynamic weight compile (our approach — compile once, update per Adam step)
+// Weights declared as runtime tensor function parameters backed by IOSurfaces.
+//
+//   n_inputs:      number of activation input tensors
+//   input_sizes:   byte sizes (fp16) for each activation input
+//   n_weights:     number of dynamic weight tensors
+//   weight_sizes:  byte sizes (fp16) for each weight IOSurface
+//   output_size:   byte size (fp16) of the single output tensor
+//
+// MIL function signature must match: func main<ios18>(x0, x1, ..., w0, w1, ...)
+// where activation inputs come first, weight inputs follow.
+// ---------------------------------------------------------------------------
+ANEKernelHandle *ane_bridge_compile_dyn(
+    const char *mil_text, size_t mil_len,
+    int n_inputs, const size_t *input_sizes,
+    int n_weights, const size_t *weight_sizes,
+    size_t output_size);
+
+// ---------------------------------------------------------------------------
+// Eval and I/O
+// ---------------------------------------------------------------------------
+
+// Evaluate (run) a compiled kernel on ANE
+bool ane_bridge_eval(ANEKernelHandle *kernel);
+
+// Write data to activation input tensor (fp16 or raw bytes)
+void ane_bridge_write_input(ANEKernelHandle *kernel, int idx,
+                             const void *data, size_t bytes);
+
+// Read data from output tensor (fp16 or raw bytes)
+void ane_bridge_read_output(ANEKernelHandle *kernel, int idx,
+                              void *data, size_t bytes);
+
+// ---------------------------------------------------------------------------
+// Dynamic weight I/O (our approach)
+// ---------------------------------------------------------------------------
+
+// Write fp16 data directly to weight IOSurface (~0.002ms per call)
+// idx: weight index (0..n_weights-1)
+void ane_bridge_write_weight(ANEKernelHandle *kernel, int idx,
+                              const void *fp16_data, size_t bytes);
+
+// Write fp32 data to weight IOSurface with automatic fp32→fp16 conversion
+// count: number of float elements (bytes = count * 2 fp16)
+void ane_bridge_write_weight_f32(ANEKernelHandle *kernel, int idx,
+                                  const float *fp32_data, size_t count);
+
+// ---------------------------------------------------------------------------
+// Direct IOSurface copy — no CPU round-trip between chained kernels
+// Copies src kernel's output[src_out_idx] → dst kernel's input[dst_in_idx]
+// Zero-copy: just memcpy between IOSurface base addresses
+// ---------------------------------------------------------------------------
+void ane_bridge_copy_io(ANEKernelHandle *src, int src_out_idx,
+                         ANEKernelHandle *dst, int dst_in_idx);
+
+// ---------------------------------------------------------------------------
+// Real-time task — 90.6% p99 jitter reduction
+// Wrap a sequence of evals with begin/end to prevent ANE scheduler preemption.
+// Proven: plain p99=35.2ms → with RT task p99=3.3ms
+// Requires at least one kernel to have been compiled and loaded.
+// ---------------------------------------------------------------------------
+void ane_bridge_begin_realtime(void);
+void ane_bridge_end_realtime(void);
+
+// ---------------------------------------------------------------------------
+// Lifecycle
+// ---------------------------------------------------------------------------
+
+void ane_bridge_free(ANEKernelHandle *kernel);
+
+// Compile count (useful for tracking exec() restart budget in BLOBFILE mode)
+int  ane_bridge_get_compile_count(void);
+void ane_bridge_reset_compile_count(void);
+
+// ---------------------------------------------------------------------------
+// Weight blob helpers (BLOBFILE mode)
+// Builds the 128-byte ANE blob header + fp16 weights for use with
+// ane_bridge_compile / ane_bridge_compile_multi_weights.
+// ---------------------------------------------------------------------------
+uint8_t *ane_bridge_build_weight_blob(const float *src, int rows, int cols,
+                                       size_t *out_len);
+uint8_t *ane_bridge_build_weight_blob_transposed(const float *src, int rows, int cols,
+                                                   size_t *out_len);
+void ane_bridge_free_blob(void *ptr);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // ANE_BRIDGE_H
diff --git a/bridge/ane_bridge.m b/bridge/ane_bridge.m
index 2b27ddc..b501d42 100644
--- a/bridge/ane_bridge.m
+++ b/bridge/ane_bridge.m
@@ -1,328 +1,501 @@
-// ane_bridge.m — Objective-C implementation of ANE bridge for Python ctypes
-// Wraps _ANEInMemoryModel private APIs into C-callable functions
-
-#import <Foundation/Foundation.h>
-#import <objc/runtime.h>
-#import <objc/message.h>
-#import <dlfcn.h>
-#import <IOSurface/IOSurface.h>
-#include "ane_bridge.h"
-
-// --- Private class references ---
-static Class g_ANEDesc = nil;
-static Class g_ANEInMem = nil;
-static Class g_ANEReq = nil;
-static Class g_ANEIO = nil;
-static bool g_initialized = false;
-static int g_compile_count = 0;
-
-// --- Kernel handle struct ---
-struct ANEKernelHandle {
-    id model;               // _ANEInMemoryModel
-    IOSurfaceRef *ioInputs;
-    IOSurfaceRef *ioOutputs;
-    id request;             // _ANERequest
-    NSString *tmpDir;
-    int nInputs, nOutputs;
-    size_t *inputBytes;
-    size_t *outputBytes;
-};
-
-// --- Public API ---
-
-int ane_bridge_init(void) {
-    if (g_initialized) return 0;
-
-    void *handle = dlopen(
-        "/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine",
-        RTLD_NOW);
-    if (!handle) {
-        fprintf(stderr, "ane_bridge: Failed to load AppleNeuralEngine.framework\n");
-        return -1;
-    }
-
-    g_ANEDesc  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
-    g_ANEInMem = NSClassFromString(@"_ANEInMemoryModel");
-    g_ANEReq   = NSClassFromString(@"_ANERequest");
-    g_ANEIO    = NSClassFromString(@"_ANEIOSurfaceObject");
-
-    if (!g_ANEDesc || !g_ANEInMem || !g_ANEReq || !g_ANEIO) {
-        fprintf(stderr, "ane_bridge: Failed to resolve ANE private classes\n");
-        return -1;
-    }
-
-    g_initialized = true;
-    g_compile_count = 0;
-    return 0;
-}
-
-static IOSurfaceRef create_surface(size_t bytes) {
-    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
-        (id)kIOSurfaceWidth: @(bytes),
-        (id)kIOSurfaceHeight: @1,
-        (id)kIOSurfaceBytesPerElement: @1,
-        (id)kIOSurfaceBytesPerRow: @(bytes),
-        (id)kIOSurfaceAllocSize: @(bytes),
-        (id)kIOSurfacePixelFormat: @0
-    });
-}
-
-ANEKernelHandle *ane_bridge_compile_multi_weights(
-    const char *mil_text, size_t mil_len,
-    const char **weight_names, const uint8_t **weight_datas,
-    const size_t *weight_lens, int n_weights,
-    int n_inputs, const size_t *input_sizes,
-    int n_outputs, const size_t *output_sizes)
-{
-    @autoreleasepool {
-        if (!g_initialized) {
-            fprintf(stderr, "ane_bridge: Not initialized\n");
-            return NULL;
-        }
-
-        NSData *milData = [NSData dataWithBytes:mil_text length:mil_len];
-        NSError *e = nil;
-
-        // Build weight dictionary
-        NSMutableDictionary *wdict = [NSMutableDictionary dictionary];
-        for (int i = 0; i < n_weights; i++) {
-            NSString *name = [NSString stringWithUTF8String:weight_names[i]];
-            NSData *data = [NSData dataWithBytes:weight_datas[i] length:weight_lens[i]];
-            wdict[name] = @{@"offset": @0, @"data": data};
-        }
-
-        id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(
-            g_ANEDesc, @selector(modelWithMILText:weights:optionsPlist:),
-            milData, wdict.count > 0 ? wdict : nil, nil);
-        if (!desc) {
-            fprintf(stderr, "ane_bridge: modelWithMILText failed\n");
-            return NULL;
-        }
-
-        id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(
-            g_ANEInMem, @selector(inMemoryModelWithDescriptor:), desc);
-        if (!mdl) {
-            fprintf(stderr, "ane_bridge: inMemoryModelWithDescriptor failed\n");
-            return NULL;
-        }
-
-        // Pre-populate temp dir
-        id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
-        NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
-        NSFileManager *fm = [NSFileManager defaultManager];
-        [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"]
-            withIntermediateDirectories:YES attributes:nil error:nil];
-        [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
-
-        for (int i = 0; i < n_weights; i++) {
-            NSString *name = [NSString stringWithUTF8String:weight_names[i]];
-            // Extract filename from path like "@model_path/weights/wq.bin" -> "weights/wq.bin"
-            NSString *relPath = name;
-            if ([name hasPrefix:@"@model_path/"]) {
-                relPath = [name substringFromIndex:12];
-            }
-            NSString *fullPath = [td stringByAppendingPathComponent:relPath];
-            NSString *dir = [fullPath stringByDeletingLastPathComponent];
-            [fm createDirectoryAtPath:dir withIntermediateDirectories:YES attributes:nil error:nil];
-            NSData *data = [NSData dataWithBytes:weight_datas[i] length:weight_lens[i]];
-            [data writeToFile:fullPath atomically:YES];
-        }
-
-        // Compile
-        if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
-                mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
-            fprintf(stderr, "ane_bridge: ANE compile failed: %s\n",
-                    e ? [[e description] UTF8String] : "unknown");
-            [fm removeItemAtPath:td error:nil];
-            return NULL;
-        }
-
-        // Load (with one retry after a brief pause for ANE slot reclamation)
-        BOOL loaded = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
-                mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
-        if (!loaded) {
-            fprintf(stderr, "ane_bridge: ANE load failed (retrying in 100ms): %s\n",
-                    e ? [[e description] UTF8String] : "unknown");
-            usleep(100000); // 100ms
-            e = nil;
-            loaded = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
-                    mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
-        }
-        if (!loaded) {
-            fprintf(stderr, "ane_bridge: ANE load failed after retry: %s\n",
-                    e ? [[e description] UTF8String] : "unknown");
-            [fm removeItemAtPath:td error:nil];
-            return NULL;
-        }
-
-        g_compile_count++;
-
-        // Create kernel handle
-        ANEKernelHandle *k = (ANEKernelHandle *)calloc(1, sizeof(ANEKernelHandle));
-        k->model = mdl;
-        k->tmpDir = td;
-        k->nInputs = n_inputs;
-        k->nOutputs = n_outputs;
-        k->inputBytes = (size_t *)malloc(n_inputs * sizeof(size_t));
-        k->outputBytes = (size_t *)malloc(n_outputs * sizeof(size_t));
-        memcpy(k->inputBytes, input_sizes, n_inputs * sizeof(size_t));
-        memcpy(k->outputBytes, output_sizes, n_outputs * sizeof(size_t));
-
-        // Create IOSurfaces
-        k->ioInputs = (IOSurfaceRef *)malloc(n_inputs * sizeof(IOSurfaceRef));
-        k->ioOutputs = (IOSurfaceRef *)malloc(n_outputs * sizeof(IOSurfaceRef));
-        for (int i = 0; i < n_inputs; i++)
-            k->ioInputs[i] = create_surface(input_sizes[i]);
-        for (int i = 0; i < n_outputs; i++)
-            k->ioOutputs[i] = create_surface(output_sizes[i]);
-
-        // Build request
-        NSMutableArray *wIns = [NSMutableArray arrayWithCapacity:n_inputs];
-        NSMutableArray *iIdx = [NSMutableArray arrayWithCapacity:n_inputs];
-        for (int i = 0; i < n_inputs; i++) {
-            [wIns addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(
-                g_ANEIO, @selector(objectWithIOSurface:), k->ioInputs[i])];
-            [iIdx addObject:@(i)];
-        }
-        NSMutableArray *wOuts = [NSMutableArray arrayWithCapacity:n_outputs];
-        NSMutableArray *oIdx = [NSMutableArray arrayWithCapacity:n_outputs];
-        for (int i = 0; i < n_outputs; i++) {
-            [wOuts addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(
-                g_ANEIO, @selector(objectWithIOSurface:), k->ioOutputs[i])];
-            [oIdx addObject:@(i)];
-        }
-        k->request = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(
-            g_ANEReq,
-            @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
-            wIns, iIdx, wOuts, oIdx, nil, nil, @0);
-
-        return k;
-    }
-}
-
-ANEKernelHandle *ane_bridge_compile(const char *mil_text, size_t mil_len,
-                                     const uint8_t *weight_data, size_t weight_len,
-                                     int n_inputs, const size_t *input_sizes,
-                                     int n_outputs, const size_t *output_sizes) {
-    if (weight_data && weight_len > 0) {
-        const char *name = "@model_path/weights/weight.bin";
-        return ane_bridge_compile_multi_weights(
-            mil_text, mil_len,
-            &name, &weight_data, &weight_len, 1,
-            n_inputs, input_sizes,
-            n_outputs, output_sizes);
-    } else {
-        return ane_bridge_compile_multi_weights(
-            mil_text, mil_len,
-            NULL, NULL, NULL, 0,
-            n_inputs, input_sizes,
-            n_outputs, output_sizes);
-    }
-}
-
-bool ane_bridge_eval(ANEKernelHandle *kernel) {
-    @autoreleasepool {
-        if (!kernel || !kernel->model) return false;
-        NSError *e = nil;
-        return ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
-            kernel->model, @selector(evaluateWithQoS:options:request:error:),
-            21, @{}, kernel->request, &e);
-    }
-}
-
-void ane_bridge_write_input(ANEKernelHandle *kernel, int idx,
-                             const void *data, size_t bytes) {
-    if (!kernel || idx < 0 || idx >= kernel->nInputs) return;
-    IOSurfaceLock(kernel->ioInputs[idx], 0, NULL);
-    memcpy(IOSurfaceGetBaseAddress(kernel->ioInputs[idx]), data, bytes);
-    IOSurfaceUnlock(kernel->ioInputs[idx], 0, NULL);
-}
-
-void ane_bridge_read_output(ANEKernelHandle *kernel, int idx,
-                              void *data, size_t bytes) {
-    if (!kernel || idx < 0 || idx >= kernel->nOutputs) return;
-    IOSurfaceLock(kernel->ioOutputs[idx], kIOSurfaceLockReadOnly, NULL);
-    memcpy(data, IOSurfaceGetBaseAddress(kernel->ioOutputs[idx]), bytes);
-    IOSurfaceUnlock(kernel->ioOutputs[idx], kIOSurfaceLockReadOnly, NULL);
-}
-
-void ane_bridge_free(ANEKernelHandle *kernel) {
-    @autoreleasepool {
-        if (!kernel) return;
-        NSError *e = nil;
-        if (kernel->model) {
-            ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(
-                kernel->model, @selector(unloadWithQoS:error:), 21, &e);
-        }
-        for (int i = 0; i < kernel->nInputs; i++)
-            if (kernel->ioInputs[i]) CFRelease(kernel->ioInputs[i]);
-        for (int i = 0; i < kernel->nOutputs; i++)
-            if (kernel->ioOutputs[i]) CFRelease(kernel->ioOutputs[i]);
-        if (kernel->tmpDir) {
-            [[NSFileManager defaultManager] removeItemAtPath:kernel->tmpDir error:nil];
-        }
-        free(kernel->ioInputs);
-        free(kernel->ioOutputs);
-        free(kernel->inputBytes);
-        free(kernel->outputBytes);
-        
-        // Explicitly nil Objective-C objects to trigger ARC release before freeing struct
-        kernel->model = nil;
-        kernel->request = nil;
-        kernel->tmpDir = nil;
-        
-        free(kernel);
-    }
-}
-
-int ane_bridge_get_compile_count(void) {
-    return g_compile_count;
-}
-
-void ane_bridge_reset_compile_count(void) {
-    g_compile_count = 0;
-}
-
-uint8_t *ane_bridge_build_weight_blob(const float *src, int rows, int cols,
-                                       size_t *out_len) {
-    int wsize = rows * cols * 2; // fp16
-    int total = 128 + wsize;
-    uint8_t *buf = (uint8_t *)calloc(total, 1);
-
-    // ANE blob header
-    buf[0] = 0x01; buf[4] = 0x02;
-    buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE;
-    buf[68] = 0x01;
-    *(uint32_t*)(buf + 72) = wsize;
-    *(uint32_t*)(buf + 80) = 128;
-
-    // Convert float32 -> float16
-    _Float16 *fp16 = (_Float16 *)(buf + 128);
-    for (int i = 0; i < rows * cols; i++) {
-        fp16[i] = (_Float16)src[i];
-    }
-
-    *out_len = total;
-    return buf;
-}
-
-uint8_t *ane_bridge_build_weight_blob_transposed(const float *src, int rows, int cols,
-                                                   size_t *out_len) {
-    int wsize = rows * cols * 2;
-    int total = 128 + wsize;
-    uint8_t *buf = (uint8_t *)calloc(total, 1);
-
-    buf[0] = 0x01; buf[4] = 0x02;
-    buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE;
-    buf[68] = 0x01;
-    *(uint32_t*)(buf + 72) = wsize;
-    *(uint32_t*)(buf + 80) = 128;
-
-    _Float16 *fp16 = (_Float16 *)(buf + 128);
-    for (int i = 0; i < rows; i++)
-        for (int j = 0; j < cols; j++)
-            fp16[j * rows + i] = (_Float16)src[i * cols + j];
-
-    *out_len = total;
-    return buf;
-}
+// ane_bridge.m — Objective-C implementation of ANE bridge for Python ctypes
+// Wraps _ANEInMemoryModel private APIs into C-callable functions
+//
+// Two modes: BLOBFILE (upstream compatible) and dynamic IOSurface (our approach).
+// See ane_bridge.h for full API documentation.
+
+#import <Foundation/Foundation.h>
+#import <objc/runtime.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <IOSurface/IOSurface.h>
+#include <arm_neon.h>
+#include "ane_bridge.h"
+
+// --- Private class references ---
+static Class g_ANEDesc  = nil;
+static Class g_ANEInMem = nil;
+static Class g_ANEReq   = nil;
+static Class g_ANEIO    = nil;
+static bool  g_initialized = false;
+static int   g_compile_count = 0;
+
+// _ANEClient for beginRealTimeTask — retrieved from first loaded model
+static id g_rt_client = nil;
+
+// --- Kernel handle ---
+struct ANEKernelHandle {
+    id model;                   // _ANEInMemoryModel
+    IOSurfaceRef *ioInputs;     // activation input surfaces
+    IOSurfaceRef *ioOutputs;    // output surfaces
+    IOSurfaceRef *ioWeights;    // dynamic weight surfaces (NULL for BLOBFILE mode)
+    id request;                 // _ANERequest
+    NSString *tmpDir;
+    int nInputs, nOutputs, nWeights;
+    size_t *inputBytes;
+    size_t *outputBytes;
+    size_t *weightBytes;
+};
+
+// --- Helpers ---
+
+static IOSurfaceRef make_surface(size_t bytes) {
+    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
+        (id)kIOSurfaceWidth:          @(bytes),
+        (id)kIOSurfaceHeight:         @1,
+        (id)kIOSurfaceBytesPerElement:@1,
+        (id)kIOSurfaceBytesPerRow:    @(bytes),
+        (id)kIOSurfaceAllocSize:      @(bytes),
+        (id)kIOSurfacePixelFormat:    @0
+    });
+}
+
+static id wrap_surface(IOSurfaceRef s) {
+    return ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(
+        g_ANEIO, @selector(objectWithIOSurface:), s);
+}
+
+// Compile cache: ~/.ane_cache/<hexId>/
+// Saves ~3100ms on cache hit (700ms vs 3800ms for 74 kernels).
+static BOOL try_cache_restore(id mdl, NSString *td, NSFileManager *fm) {
+    NSString *hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
+    NSString *cacheDir = [NSHomeDirectory() stringByAppendingPathComponent:
+        [@".ane_cache/" stringByAppendingString:hx]];
+    NSString *cachedPlist = [cacheDir stringByAppendingPathComponent:@"net.plist"];
+    if (![fm fileExistsAtPath:cachedPlist]) return NO;
+    [fm copyItemAtPath:cachedPlist toPath:[td stringByAppendingPathComponent:@"net.plist"] error:nil];
+    // BLOBFILE models also produce a `data` file; dynamic-weight models do not
+    NSString *cachedData = [cacheDir stringByAppendingPathComponent:@"data"];
+    if ([fm fileExistsAtPath:cachedData])
+        [fm copyItemAtPath:cachedData toPath:[td stringByAppendingPathComponent:@"data"] error:nil];
+    return YES;
+}
+
+static void save_to_cache(id mdl, NSString *td, NSFileManager *fm) {
+    NSString *hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
+    NSString *cacheDir = [NSHomeDirectory() stringByAppendingPathComponent:
+        [@".ane_cache/" stringByAppendingString:hx]];
+    [fm createDirectoryAtPath:cacheDir withIntermediateDirectories:YES attributes:nil error:nil];
+    [fm copyItemAtPath:[td stringByAppendingPathComponent:@"net.plist"]
+                toPath:[cacheDir stringByAppendingPathComponent:@"net.plist"] error:nil];
+    // Copy data only if present (BLOBFILE models)
+    NSString *tdData = [td stringByAppendingPathComponent:@"data"];
+    if ([fm fileExistsAtPath:tdData])
+        [fm copyItemAtPath:tdData toPath:[cacheDir stringByAppendingPathComponent:@"data"] error:nil];
+}
+
+static BOOL compile_and_load(id mdl, NSString *td, NSFileManager *fm) {
+    NSError *e = nil;
+    BOOL fromCache = try_cache_restore(mdl, td, fm);
+
+    if (!fromCache) {
+        if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
+                mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
+            fprintf(stderr, "ane_bridge: compile failed: %s\n",
+                    e ? [[e description] UTF8String] : "unknown");
+            return NO;
+        }
+        save_to_cache(mdl, td, fm);
+        g_compile_count++;
+    }
+
+    e = nil;
+    BOOL loaded = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
+        mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
+    if (!loaded) {
+        usleep(100000);
+        e = nil;
+        loaded = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
+            mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
+    }
+    if (!loaded) {
+        fprintf(stderr, "ane_bridge: load failed: %s\n",
+                e ? [[e description] UTF8String] : "unknown");
+        return NO;
+    }
+
+    // Cache _ANEClient for real-time task API
+    if (!g_rt_client) {
+        Ivar iv = class_getInstanceVariable(object_getClass(mdl), "_sharedConnection");
+        if (iv) g_rt_client = object_getIvar(mdl, iv);
+    }
+
+    return YES;
+}
+
+// Build _ANERequest from arrays of input and output surfaces
+static id build_request(IOSurfaceRef *inputs, int nIn,
+                         IOSurfaceRef *outputs, int nOut) {
+    NSMutableArray *wIns = [NSMutableArray arrayWithCapacity:nIn];
+    NSMutableArray *iIdx = [NSMutableArray arrayWithCapacity:nIn];
+    for (int i = 0; i < nIn; i++) {
+        [wIns addObject:wrap_surface(inputs[i])];
+        [iIdx addObject:@(i)];
+    }
+    NSMutableArray *wOuts = [NSMutableArray arrayWithCapacity:nOut];
+    NSMutableArray *oIdx  = [NSMutableArray arrayWithCapacity:nOut];
+    for (int i = 0; i < nOut; i++) {
+        [wOuts addObject:wrap_surface(outputs[i])];
+        [oIdx  addObject:@(i)];
+    }
+    return ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(
+        g_ANEReq,
+        @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+        wIns, iIdx, wOuts, oIdx, nil, nil, @0);
+}
+
+// NEON fp32 → fp16 conversion
+static void cvt_f32_f16(const float *src, uint16_t *dst, size_t count) {
+    size_t i = 0;
+    for (; i + 4 <= count; i += 4) {
+        float32x4_t f = vld1q_f32(src + i);
+        vst1_u16(dst + i, vreinterpret_u16_f16(vcvt_f16_f32(f)));
+    }
+    for (; i < count; i++)
+        dst[i] = vreinterpret_u16_f16(vcvt_f16_f32(vdupq_n_f32(src[i])))[0];
+}
+
+// --- Public API ---
+
+int ane_bridge_init(void) {
+    if (g_initialized) return 0;
+    void *handle = dlopen(
+        "/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine",
+        RTLD_NOW);
+    if (!handle) { fprintf(stderr, "ane_bridge: failed to load ANE framework\n"); return -1; }
+
+    g_ANEDesc  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
+    g_ANEInMem = NSClassFromString(@"_ANEInMemoryModel");
+    g_ANEReq   = NSClassFromString(@"_ANERequest");
+    g_ANEIO    = NSClassFromString(@"_ANEIOSurfaceObject");
+
+    if (!g_ANEDesc || !g_ANEInMem || !g_ANEReq || !g_ANEIO) {
+        fprintf(stderr, "ane_bridge: failed to resolve ANE private classes\n");
+        return -1;
+    }
+    g_initialized = true;
+    return 0;
+}
+
+// ---------------------------------------------------------------------------
+// BLOBFILE compile (upstream compatible)
+// ---------------------------------------------------------------------------
+
+ANEKernelHandle *ane_bridge_compile_multi_weights(
+    const char *mil_text, size_t mil_len,
+    const char **weight_names, const uint8_t **weight_datas,
+    const size_t *weight_lens, int n_weights,
+    int n_inputs, const size_t *input_sizes,
+    int n_outputs, const size_t *output_sizes)
+{
+    @autoreleasepool {
+        if (!g_initialized) { fprintf(stderr, "ane_bridge: not initialized\n"); return NULL; }
+
+        NSData *milData = [NSData dataWithBytes:mil_text length:mil_len];
+        NSMutableDictionary *wdict = [NSMutableDictionary dictionary];
+        for (int i = 0; i < n_weights; i++) {
+            NSString *name = [NSString stringWithUTF8String:weight_names[i]];
+            NSData   *data = [NSData dataWithBytes:weight_datas[i] length:weight_lens[i]];
+            wdict[name] = @{@"offset": @0, @"data": data};
+        }
+
+        id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(
+            g_ANEDesc, @selector(modelWithMILText:weights:optionsPlist:),
+            milData, wdict.count > 0 ? wdict : nil, nil);
+        if (!desc) { fprintf(stderr, "ane_bridge: modelWithMILText failed\n"); return NULL; }
+
+        id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(
+            g_ANEInMem, @selector(inMemoryModelWithDescriptor:), desc);
+
+        NSString *hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
+        NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
+        NSFileManager *fm = [NSFileManager defaultManager];
+        [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"]
+            withIntermediateDirectories:YES attributes:nil error:nil];
+        [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+        for (int i = 0; i < n_weights; i++) {
+            NSString *rel = [NSString stringWithUTF8String:weight_names[i]];
+            if ([rel hasPrefix:@"@model_path/"]) rel = [rel substringFromIndex:12];
+            NSString *full = [td stringByAppendingPathComponent:rel];
+            [fm createDirectoryAtPath:[full stringByDeletingLastPathComponent]
+                withIntermediateDirectories:YES attributes:nil error:nil];
+            [[NSData dataWithBytes:weight_datas[i] length:weight_lens[i]] writeToFile:full atomically:YES];
+        }
+
+        if (!compile_and_load(mdl, td, fm)) { [fm removeItemAtPath:td error:nil]; return NULL; }
+
+        ANEKernelHandle *k = (ANEKernelHandle *)calloc(1, sizeof(ANEKernelHandle));
+        k->model      = mdl;
+        k->tmpDir     = td;
+        k->nInputs    = n_inputs;
+        k->nOutputs   = n_outputs;
+        k->nWeights   = 0;
+        k->ioWeights  = NULL;
+        k->weightBytes = NULL;
+        k->inputBytes  = (size_t *)malloc(n_inputs  * sizeof(size_t));
+        k->outputBytes = (size_t *)malloc(n_outputs * sizeof(size_t));
+        memcpy(k->inputBytes,  input_sizes,  n_inputs  * sizeof(size_t));
+        memcpy(k->outputBytes, output_sizes, n_outputs * sizeof(size_t));
+
+        k->ioInputs  = (IOSurfaceRef *)malloc(n_inputs  * sizeof(IOSurfaceRef));
+        k->ioOutputs = (IOSurfaceRef *)malloc(n_outputs * sizeof(IOSurfaceRef));
+        for (int i = 0; i < n_inputs;  i++) k->ioInputs[i]  = make_surface(input_sizes[i]);
+        for (int i = 0; i < n_outputs; i++) k->ioOutputs[i] = make_surface(output_sizes[i]);
+
+        k->request = build_request(k->ioInputs, n_inputs, k->ioOutputs, n_outputs);
+        return k;
+    }
+}
+
+ANEKernelHandle *ane_bridge_compile(const char *mil_text, size_t mil_len,
+                                     const uint8_t *weight_data, size_t weight_len,
+                                     int n_inputs, const size_t *input_sizes,
+                                     int n_outputs, const size_t *output_sizes) {
+    if (weight_data && weight_len > 0) {
+        const char *name = "@model_path/weights/weight.bin";
+        return ane_bridge_compile_multi_weights(mil_text, mil_len,
+            &name, &weight_data, &weight_len, 1,
+            n_inputs, input_sizes, n_outputs, output_sizes);
+    }
+    return ane_bridge_compile_multi_weights(mil_text, mil_len,
+        NULL, NULL, NULL, 0,
+        n_inputs, input_sizes, n_outputs, output_sizes);
+}
+
+// ---------------------------------------------------------------------------
+// Dynamic IOSurface compile (our approach — compile ONCE, write per Adam step)
+//
+// MIL program must declare weights as function parameters after activation inputs:
+//   func main<ios18>(tensor<fp16,...> x0, tensor<fp16,...> w0, tensor<fp16,...> w1) { ... }
+//
+// The _ANERequest bundles them as: inputs=[x0, w0, w1, ...], outputs=[out]
+// Weight IOSurfaces persist between evals — update via ane_bridge_write_weight().
+// ---------------------------------------------------------------------------
+ANEKernelHandle *ane_bridge_compile_dyn(
+    const char *mil_text, size_t mil_len,
+    int n_inputs, const size_t *input_sizes,
+    int n_weights, const size_t *weight_sizes,
+    size_t output_size)
+{
+    @autoreleasepool {
+        if (!g_initialized) { fprintf(stderr, "ane_bridge: not initialized\n"); return NULL; }
+
+        NSData *milData = [NSData dataWithBytes:mil_text length:mil_len];
+
+        // Dynamic weights: pass empty weight dict (no BLOBFILE)
+        id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(
+            g_ANEDesc, @selector(modelWithMILText:weights:optionsPlist:),
+            milData, @{}, nil);
+        if (!desc) { fprintf(stderr, "ane_bridge: modelWithMILText failed\n"); return NULL; }
+
+        id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(
+            g_ANEInMem, @selector(inMemoryModelWithDescriptor:), desc);
+
+        NSString *hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
+        NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
+        NSFileManager *fm = [NSFileManager defaultManager];
+        [fm createDirectoryAtPath:td withIntermediateDirectories:YES attributes:nil error:nil];
+        [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+
+        if (!compile_and_load(mdl, td, fm)) { [fm removeItemAtPath:td error:nil]; return NULL; }
+
+        // Allocate kernel handle
+        ANEKernelHandle *k = (ANEKernelHandle *)calloc(1, sizeof(ANEKernelHandle));
+        k->model    = mdl;
+        k->tmpDir   = td;
+        k->nInputs  = n_inputs;
+        k->nOutputs = 1;
+        k->nWeights = n_weights;
+
+        k->inputBytes  = (size_t *)malloc(n_inputs  * sizeof(size_t));
+        k->outputBytes = (size_t *)malloc(1         * sizeof(size_t));
+        k->weightBytes = (size_t *)malloc(n_weights * sizeof(size_t));
+        memcpy(k->inputBytes,  input_sizes,  n_inputs  * sizeof(size_t));
+        memcpy(k->weightBytes, weight_sizes, n_weights * sizeof(size_t));
+        k->outputBytes[0] = output_size;
+
+        // Create IOSurfaces for activations, weights, and output
+        k->ioInputs  = (IOSurfaceRef *)malloc(n_inputs  * sizeof(IOSurfaceRef));
+        k->ioOutputs = (IOSurfaceRef *)malloc(1         * sizeof(IOSurfaceRef));
+        k->ioWeights = (IOSurfaceRef *)malloc(n_weights * sizeof(IOSurfaceRef));
+        for (int i = 0; i < n_inputs;  i++) k->ioInputs[i]  = make_surface(input_sizes[i]);
+        for (int i = 0; i < n_weights; i++) k->ioWeights[i] = make_surface(weight_sizes[i]);
+        k->ioOutputs[0] = make_surface(output_size);
+
+        // Build request: inputs = [x0, x1, ..., w0, w1, ...], outputs = [out]
+        // Weights follow activation inputs at higher indices — matches MIL param order.
+        int total_inputs = n_inputs + n_weights;
+        NSMutableArray *allInputs = [NSMutableArray arrayWithCapacity:total_inputs];
+        NSMutableArray *allIdx    = [NSMutableArray arrayWithCapacity:total_inputs];
+        for (int i = 0; i < n_inputs;  i++) { [allInputs addObject:wrap_surface(k->ioInputs[i])];  [allIdx addObject:@(i)]; }
+        for (int i = 0; i < n_weights; i++) { [allInputs addObject:wrap_surface(k->ioWeights[i])]; [allIdx addObject:@(n_inputs + i)]; }
+
+        id wOut = wrap_surface(k->ioOutputs[0]);
+        k->request = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(
+            g_ANEReq,
+            @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+            allInputs, allIdx, @[wOut], @[@0], nil, nil, @0);
+
+        return k;
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Eval and I/O
+// ---------------------------------------------------------------------------
+
+bool ane_bridge_eval(ANEKernelHandle *kernel) {
+    @autoreleasepool {
+        if (!kernel || !kernel->model) return false;
+        NSError *e = nil;
+        return ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+            kernel->model, @selector(evaluateWithQoS:options:request:error:),
+            21, @{}, kernel->request, &e);
+    }
+}
+
+void ane_bridge_write_input(ANEKernelHandle *kernel, int idx,
+                             const void *data, size_t bytes) {
+    if (!kernel || idx < 0 || idx >= kernel->nInputs) return;
+    IOSurfaceLock(kernel->ioInputs[idx], 0, NULL);
+    memcpy(IOSurfaceGetBaseAddress(kernel->ioInputs[idx]), data, bytes);
+    IOSurfaceUnlock(kernel->ioInputs[idx], 0, NULL);
+}
+
+void ane_bridge_read_output(ANEKernelHandle *kernel, int idx,
+                              void *data, size_t bytes) {
+    if (!kernel || idx < 0 || idx >= kernel->nOutputs) return;
+    IOSurfaceLock(kernel->ioOutputs[idx], kIOSurfaceLockReadOnly, NULL);
+    memcpy(data, IOSurfaceGetBaseAddress(kernel->ioOutputs[idx]), bytes);
+    IOSurfaceUnlock(kernel->ioOutputs[idx], kIOSurfaceLockReadOnly, NULL);
+}
+
+// ---------------------------------------------------------------------------
+// Dynamic weight I/O
+// ---------------------------------------------------------------------------
+
+void ane_bridge_write_weight(ANEKernelHandle *kernel, int idx,
+                              const void *fp16_data, size_t bytes) {
+    if (!kernel || !kernel->ioWeights || idx < 0 || idx >= kernel->nWeights) return;
+    IOSurfaceLock(kernel->ioWeights[idx], 0, NULL);
+    memcpy(IOSurfaceGetBaseAddress(kernel->ioWeights[idx]), fp16_data, bytes);
+    IOSurfaceUnlock(kernel->ioWeights[idx], 0, NULL);
+}
+
+void ane_bridge_write_weight_f32(ANEKernelHandle *kernel, int idx,
+                                  const float *fp32_data, size_t count) {
+    if (!kernel || !kernel->ioWeights || idx < 0 || idx >= kernel->nWeights) return;
+    IOSurfaceLock(kernel->ioWeights[idx], 0, NULL);
+    cvt_f32_f16(fp32_data, (uint16_t *)IOSurfaceGetBaseAddress(kernel->ioWeights[idx]), count);
+    IOSurfaceUnlock(kernel->ioWeights[idx], 0, NULL);
+}
+
+// ---------------------------------------------------------------------------
+// Direct IOSurface copy (no CPU round-trip between chained kernels)
+// ---------------------------------------------------------------------------
+
+void ane_bridge_copy_io(ANEKernelHandle *src, int src_out_idx,
+                         ANEKernelHandle *dst, int dst_in_idx) {
+    if (!src || !dst) return;
+    if (src_out_idx < 0 || src_out_idx >= src->nOutputs) return;
+    if (dst_in_idx  < 0 || dst_in_idx  >= dst->nInputs)  return;
+
+    IOSurfaceRef srf = src->ioOutputs[src_out_idx];
+    IOSurfaceRef drf = dst->ioInputs[dst_in_idx];
+    size_t bytes = IOSurfaceGetAllocSize(srf);
+
+    IOSurfaceLock(srf, kIOSurfaceLockReadOnly, NULL);
+    IOSurfaceLock(drf, 0, NULL);
+    memcpy(IOSurfaceGetBaseAddress(drf), IOSurfaceGetBaseAddress(srf), bytes);
+    IOSurfaceUnlock(drf, 0, NULL);
+    IOSurfaceUnlock(srf, kIOSurfaceLockReadOnly, NULL);
+}
+
+// ---------------------------------------------------------------------------
+// Real-time task — 90.6% p99 jitter reduction
+// ---------------------------------------------------------------------------
+
+void ane_bridge_begin_realtime(void) {
+    if (!g_rt_client) return;
+    ((void(*)(id,SEL))objc_msgSend)(g_rt_client, @selector(beginRealTimeTask));
+}
+
+void ane_bridge_end_realtime(void) {
+    if (!g_rt_client) return;
+    ((void(*)(id,SEL))objc_msgSend)(g_rt_client, @selector(endRealTimeTask));
+}
+
+// ---------------------------------------------------------------------------
+// Lifecycle
+// ---------------------------------------------------------------------------
+
+void ane_bridge_free(ANEKernelHandle *kernel) {
+    @autoreleasepool {
+        if (!kernel) return;
+        NSError *e = nil;
+        if (kernel->model) {
+            ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(
+                kernel->model, @selector(unloadWithQoS:error:), 21, &e);
+        }
+        for (int i = 0; i < kernel->nInputs;  i++) if (kernel->ioInputs[i])  CFRelease(kernel->ioInputs[i]);
+        for (int i = 0; i < kernel->nOutputs; i++) if (kernel->ioOutputs[i]) CFRelease(kernel->ioOutputs[i]);
+        for (int i = 0; i < kernel->nWeights; i++) if (kernel->ioWeights[i]) CFRelease(kernel->ioWeights[i]);
+        if (kernel->tmpDir) [[NSFileManager defaultManager] removeItemAtPath:kernel->tmpDir error:nil];
+
+        free(kernel->ioInputs);
+        free(kernel->ioOutputs);
+        free(kernel->ioWeights);
+        free(kernel->inputBytes);
+        free(kernel->outputBytes);
+        free(kernel->weightBytes);
+
+        kernel->model   = nil;
+        kernel->request = nil;
+        kernel->tmpDir  = nil;
+        free(kernel);
+    }
+}
+
+int ane_bridge_get_compile_count(void)  { return g_compile_count; }
+void ane_bridge_reset_compile_count(void) { g_compile_count = 0; }
+
+// ---------------------------------------------------------------------------
+// Weight blob helpers (BLOBFILE mode)
+// ---------------------------------------------------------------------------
+
+uint8_t *ane_bridge_build_weight_blob(const float *src, int rows, int cols,
+                                       size_t *out_len) {
+    int wsize = rows * cols * 2;
+    int total = 128 + wsize;
+    uint8_t *buf = (uint8_t *)calloc(total, 1);
+    buf[0] = 0x01; buf[4] = 0x02;
+    buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE;
+    buf[68] = 0x01;
+    *(uint32_t*)(buf + 72) = wsize;
+    *(uint32_t*)(buf + 80) = 128;
+    cvt_f32_f16(src, (uint16_t *)(buf + 128), rows * cols);
+    *out_len = total;
+    return buf;
+}
+
+uint8_t *ane_bridge_build_weight_blob_transposed(const float *src, int rows, int cols,
+                                                   size_t *out_len) {
+    int wsize = rows * cols * 2;
+    int total = 128 + wsize;
+    uint8_t *buf = (uint8_t *)calloc(total, 1);
+    buf[0] = 0x01; buf[4] = 0x02;
+    buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE;
+    buf[68] = 0x01;
+    *(uint32_t*)(buf + 72) = wsize;
+    *(uint32_t*)(buf + 80) = 128;
+    uint16_t *fp16 = (uint16_t *)(buf + 128);
+    for (int i = 0; i < rows; i++)
+        for (int j = 0; j < cols; j++) {
+            float32x4_t f = vdupq_n_f32(src[i * cols + j]);
+            fp16[j * rows + i] = vreinterpret_u16_f16(vcvt_f16_f32(f))[0];
+        }
+    *out_len = total;
+    return buf;
+}
+
+void ane_bridge_free_blob(void *ptr) { free(ptr); }
diff --git a/bridge/libane_bridge.dylib b/bridge/libane_bridge.dylib
deleted file mode 100755
index 72acc32e285cd688d74b74f5735e8b6cfcae02e6..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 54480
zcmeHw4RjR8m2S<Cq>+rVKtc$F&0rt~OaTAH2E{QwlK2t7iX_1CIv!dxEve0DM)dRm
z0t~~CW5+mN#`Y$Dc1|E*zeT;PEKW!^tnD?ioh%zCYvW{{-F>kUHft-vFToMUGyGWZ
zyWKUTRtwO}%Xw#a&sHDq+`4sd-MaPN>guX#&EfTP|NGNnjJX)Dd{8MUUBK7_teC3U
z7|;ecW5HnKyt^9Bs(~NUK#_4d57k6)WnRIcu58e`0Os_!&}Zq|ODs+@y6wNbEZHwm
z`?#KHis!1F{(e|tsY=a}F-Ef8pO5>)*v%^#j5XaC>UZk&*FDa%OL8Y9)A(}Np!h<u
zXuN;GPJgS%TP^vp8OffUv%!U6FcjA{HQK^ePJgZ3-wGaIR?d2|kDcoHc-s`cH7d6$
zgYxm>1gqtzykS<pyuU<ry&4S%)o63f^gnREyfDdXxtlv6*%?#TH*uPG4<UEKU`?Pg
z5UgIltihUn5@~Ts=I0<z7!r9wG=qb|mRNrlF7Hp`^Mw4`^Fa5+Xi)Y-9-8fmaK<ME
zEK5K4N3z{t)=!r6Jg9AQi$c&j{jILDEOYY&vOC-O5<KfKs$)Jw@ATKo{mtTSNxo8i
zl!u<2AKr*ql9wgU9WwbO%Ep?#zm$lkG~>3l;^J+IC9N{`WnRgFap7LG79_3?2Aksj
zcmH62J!ntYbc>mv_Hky@omJB~7lMMpCAG^I&#S4mW*cKixP`<ym)|J0Wb$M8bv0zd
zw%k!4QJdtb5^U1caEme{+yNODjNOWnn*%$7c9^ucaCz3NjCDg#_>r$EKW1z>m0!bp
zj79Y-l<Por(Naf5b)NxE1zqkc|0!d0FQKIN)DN5Tr8}0rJNvn(&Tslb>5FSt-Ha+f
z$Qt7{2xzQXHnDX5lXn_)_QcS7x*9~jtlX<mPj}61=!ok|+sq|ulO}5&Gv{kqb=JqU
zHSwAAV~J>3*40>aMsv1y=1N72qb5GH+NOuk!TzlbT)}_s@`i-gEQgd!xev_Dso~x7
zKO4M1{w0`tE;cg}*JdJVMAlGdRN2yK#x=0rNq8C9j-D_x0`^u`f2{-hBfP%Ot}veJ
zn5JAAv%t)OHt<jLg7Q?1c|>EQIYKhcE4r=)k-Ym&#-!{`ON#%?>+Gr*JI^e4bza=$
z?!35zr5fO>bzk)yjO{i|>H6EPt;IpM_Y0{2<WpYugydz1{IJ85H1)qAuSEICMwT(_
zl6k4OP>0)acXA7_WBI9Ls9S-$z1y>P`Kh0w4)^YEY+5o$H%ggSH#_vu7AZ3Y{J?f8
z)8k@?j({kRDWKMUo;hSceuV4r2*fo7W#2q0<8etD4EG@GzB6?qKUt64%&B{WrCBNf
z-SzPE3i^f5kry{rVeEm<v#VY?aKsaQtM3)>u?w-QJI}QY?@R};L47gwBY@3FVB<|?
zJO#!lo_yoHC(k(N@fy7zk8#H1Ha_;ajI$nQnDGRpj1Tb~@Qh7=AN{0JPci$x4NS+e
z+j>tGbY_Cy&h$){sYJc>=ThcfPnq$aXOuzMUi4gR`~bZB7M3A@?0-m^i@g3I_yzbo
z-@p!UEnsUG=d;G12I=s}9^bT2Jf%kZ_X|4BHY4EkD^mLQJj>rcw4?Z_FZY8|Is|%y
z`+OQa!(%UW6{O5DY%550PMmG}@_oB7b-**n@|%(}`(AM!+wXB7s{xM3xsQjzdp%=;
z2{XDq<BWGveh*{nLw&bdUt(-9%VUgnCNDE|@bBldkG7rbYb1QiI;0HoF~E*|JPAFv
zMjyTB`Q}sI@O8p7#^}9;uqrXIWqVVGp0HX2ZjNm<J4Ef-{gxkBo`vfwZb$Ys&egE{
z1TmfMb$61zt5C{h#xwLe##q8_SAu^G+Z(uC4^Dl2T*A=*D1+uQ^-cC<NAcmFcw!3m
z4}aUZz6PA~e?97}m@n0hwl8zt7r@D9`gZEO9GI0E6M-p>Ef2IBdh$CB+-#rpr9S>V
z*^V-v<36TyAKkzt+y1A#-8gWX8{_lX+Vo5*^I9QmynuOv=W1#9=Lwq%V-NQ;7Myao
zpUY+7gjM(LWH)9&T$RvuKZ<*EF6Pn$Qo3g@%k)&^9;9+L<}~+_2Ts_~T%d71L+ubd
z#tc3N_YLu}#ve>xVVpL3nejVt3^?`80``&L%Nk8gF`u`)rOXcQn`}M+5q=i7g(<@N
zJn-^o`ferMw!4z`Te<zausu)XLH%1OAL;8GO@3`_#~bWbS8AKbS|6T=O$Bn$<YJ|t
z09$jOTrFi@+A%sk5$!_XlrqmubRWx%m&|#y&z-D)5p8<W$KD;Q(nnAxJRi?vw^7>?
z#D8uv%VgW0taf8AjKQ2PF{tf#U~kT&#Zu-ulz|Om9c%m*)x*ab;ClWBDT96C;B~Ft
zo!aKL;{F=!DQ?R5XA%3LF?v0z6M2^HcG!-EZPxx;n|%rH!Ly^K%u`tBDUN@JEzS4c
z;1t^~@Fzjt9+tsV{EZ2c7vm{0TFkmq<9=`)vr6&o*!{_T_(JX|ezb+ZQsYO^lfTX2
zH174_J3%>rr^#QL(O}k%GE{Kv<x73&%XiG1>T&d23;l%(_<wzVC;6u}!h^B>3iUL;
z<Ckj7`>5n?*)Mn3``KRYZph?g33%TETh1m9%vk_>%6InyDN}<o<+~T_`#I$5Z?BOu
zFO*1!DffjV0*9yOu@QxMX8Ca+xO~a_(Xh92JjVDO%1JlwYuuAJQ+XuIJgwzTZQUw8
z-g6bp#JAr6IAJ>mI>IVBmSvta>&lH%)X_bVyv|w|G83f?t&=qVV(19N6Q0qK%Z*{C
zTw)Yt+vHQ5a>E0e#&HJsak7YI&Uq@P(X-_$7i&C&+<y!oempOJ&e#)szrxZ_(3;)X
z*Z2^{cMZ$z$=QyBZHdv1=g!Lx+umMpC-Qf?`|HTjRkWs*8s+Gh=D>M3#}PWJAIa-K
zMm;_2=Ad0Eh+=xp!x|}{nAfMvM@bpPm3evNrt}Gn<89cJzYo9(Cu>c_dIOo(oqqvO
z4yQgU49XqZ{yJw%&n!HDx$V!vHx;l{&{J{h#D%`b-ig2p@%19Fgzayjr}jSsC;UlA
z_3!cepMcYP^U3Qqom=y*xXXFmV|m=~Zrp&l#~Odb+Z+bB;x02|ISQHLHo&d8%Z(Bq
z_jhu(6{hWr+;$ImIpW^xE;r*QoY*$(r@&ofurC3p=Q-U!y_hRz%y`zGM2`O*xa1?o
z0?@<oL*sh_oYr}EHO-X@?0bC1SIoZ3jGZPQXKXiliGex%MEdx=&a=H$owP2}y?74K
z1-h3yxXpUvB~m5<-ivujYoKQ8#v1J=FEgShA7?~NUSiy5@^T{xe!z2gx)5WS8jvzm
zT~cPQ%XjKWct%ic7dO>oul7ak+3K)&TaG>4vd#;!rJdN*rD$*0i#3SWv`XwPX-(S;
ze>BEF1E+g&C3pzL*Pw$FUd=Ptw42SkQez?bC)j^{@M+&@;CRrs4&`ib;5P5TrUK=@
z9qwaiR}~;GANG)iofon0Q(tBtFa~p<xZ~QiDbKOwo&uIB!x)d4bG6j?7W`10H-XcA
zYTDjp*_v3mic;Nw>FF#&n?+Nl%p%~@bEB*CH#aaV-=F1vv3JS({3`c34&0n`H*(v_
z+;()o?G|o30-WZ$+5hLbT?KE4In}*Cuak5vg?aTJOT9OdrC!B)N4WmPm0j=3jI$`y
zygp;{65|ix2X=gUP+m_k%4;d|S_}LZBCiWNfv1_vSJ9@=<2rV6lc)3ID=hUJ?8kes
zAK!n~+|KPdFN8qA<KUu>{M2u;MtuNG-}kV??}k`v|3r4|nQC@yGGe21LRRO-yr*Y|
z7li#+{ri~fA9xCl^B4NQadA_1C;Bt{?t6tDv!9nJC(k3E2F$Bg%rQE%;EZ;9!nIN+
zj2!l%Z=AVLhp=b(cVJ?z`#$rze-Q1cJ&n_iSW3|Ev#6uBD|<FUe`def3mtNK)3^Jk
zU{B|At?9yA=sdgmyvWO!<g~Sp_^{V<87}zj#TkXhcLwJb75)$8<DUM~x-@Wlg7zS^
z?(VeJQya3+#z5zzd$B*D`$$3@6DW@9RvbUSR2;s+aZGT;v4Hw3vf>Dsab(x*QbV@s
zsGT#G&+}Ml4xC-(!u-J5mCpsja~0bEqL3Y;`MMf8S&f`bK@MCKtbEYiq<vQ%?79o2
zOd~kug>q1eakoyh_RYKS3>*a+U+<LStde@^<te9ri+QO6KYvFey}GaO&@i+m`!x2H
ztuuybhw*0HeTru!*_-V~N*R1-ddR$g3Z=|i_@sOOR&dH?6ZAAU_d}+8XEOL|#4wyV
zd{PXn$4Z$Z;@8o=3661Z#lB)Jl`#*u4riIsREDpu*kj=Pb!(m>4w`?(rd(#^fzy0H
zv%CuTTp%~^2|tFt49x*5(;1w~>EH{nbLPN<y#UtK&8~`HZ5-dw<93fIy<d8KHgZXQ
zQBL1NzxX}_=lNv4ub9oj*mtw%7F=W!d$GLOHD~lfcGy=8T^&1o9M8}j3s~dvk!)=f
z@b>`=I>#K}-Y?^Ou+pQ=n;t7X;!YWESLzh*-+*gNS9Rgmu6ZM*t_5QPT@zX-40p9o
zShgBm0^bL|9sE7;SHLH1yR+;1ug&e6>T;zPx!kEW+t^1ZF7`E!ca@#;W1b%`+VTf{
zAJIs@k7`>U>nyrzSCRLsU8b$e_nJTNM|Uvt{Q$eLW(0eV_DR%MLYqRbZ`V}#G>Uw?
zj=FuXN%N!^*pZ3X`6si@zC3GBr6XT<?9;(l&F>}7wLFh!nhQ9)kr%9ao4lPDu^wf=
zfA|FVIpIBJv%8|aV|;_p;~r54yyuvBmzj8P1m5_r^pbdw0p8^{y!(LnDV!Gx=R)RA
z{nk@DZO?GFw)^40##5vBj9sE^dTeRomdBQ@-SXHq!22NZPHoxpixZyVuf4lv>o4B(
zjGl&kmSS9~-xsn}v3ta>?}MhIJl^Fwg_lc>d(g*Kz~`^<Rq7t(aUb$Z`wx7dRZ43x
zzV|9U=_$s(<Qn6B&oE;f+tRhex3%kmVyWxF@}YD2?c|+Z`yQFw^(=Fxc47=aNm{ww
z<0?8Oo$qTTpC>_I!+rNmQU0#UEPvN$^73~jy;cs7>|__7Ew*xa(lgvRj&{jRU*o*Z
z-lz9&3|qNVCvV>AWvOSGdsooqJr%%sUcoxB&dg&Ca#f9-RUvl)>_>iV<`Uo4oi2VD
z>z;f3TV}b;R>nP?tH*tuD_?Ib<G#+-(>k8@bEmDm$W~rvE7#k~D{bYwZRLAy<tAJC
zK3h3zD{HngzQfDKv&B~4ZYzJqR&KV+`0j4dH32wHm?ULPjMiCpe&+vWmQ4MWA@tLS
z(9aq|f9nwX*+b~>7(y=%p|2T2zi<fsk|FeUL+Bfa(BCzL{+=Q9tB25shtRhUp^prq
zZ@;|$_kZ8@4oynu_xW|2x=z*=+iQ?OdwWD#rX(~uGB4VqM(wXF2I*M;%cM&Gd|8bs
zVZR>pM`Ci=KghWMy+;uX2A0jMjV@K%Vp_-2SXhbFDDjY{w(Bu%u+FThSJoyJyva6O
zW?!W&Z@Nzj>6eK@Q{u76I>nE!{O#5_{h^2)k1O$_0Zo?0biW!^bybe28<lVoi#0dL
z6`iRJ?}Ow4rfn4KDphY?TDznXe@WnPHU~Y>pK2c%8TSt^XTM%iTUzxvGab%oQzJ|=
zi@|nTZ=E@yPl`VjYikF(ym_U6GKP?du8GFhM+e!O=v#65r)Y|<b>MwH2I#+O)~vSp
zRI9N)anymmk=(2+n%`_`8QVkSfzPz~j3zb8tQXN$9mu#64WqL<jG-D4$D&G9uVgD4
z=g)4i{%SxDhpn7dGFj8)4nn6|ZkI!<-ciZ;D5>&`v4+Z6JH3sIS1Oto(@2|$>MW!w
zn6NdfhQwp?1a#`Gay^>VMiF+K5>{<`Rf$*1I{I!(=qPeelve7bs)H&cO0)$oSXgP6
z6A^u>9K}m;4Wcz85dsUVAqoX=3~fd7U=1c-*&NdrD;@DlGsa4$tdk=Ngy9^CW=-O3
z9IbM^4)3AW4VA1_*<e9b8wO@-vx*3=Fpnx3CCurb56?MCD|ybcfYwFSIMNbLv^6Oj
z`2ccuaYZz)wnUZil328*lHqkBpf=62w_PREaObX5YVj(SCxCa&2WUBMPw4SVH40XX
zX%Pa{v4n0`T0C3J!>dX(H!E6YyP`EW=rWqNYq5|LPG|}Y6<~yWl20BBZYA7+zr8V+
z4JgeCR@G6XL=+u!aETl@r$%lbatjKre?p=iT$oMgYq2)oQ6g$bmk({drs_&#Y(8#7
zOsQC#8j>RsOj?HTj2Zn|4skz-E<eZ(!dcn75Szyul!>x`4@71A1v@s372=(JzFCbk
zkn0}2NdMt5=ALym^Blkn{VB!FD;2Z6XGXAmyi39{j(O21aW~HXUc42jvbY2S0s(=5
zKtLcM5D*9m1Ox&C0fB%(Kp-Fx5C{ka1Ofs9fq+0jARrJB2nYlO0{=-6C{EHlH2eY(
ze|~LodM`$@^<t|yi623lvh|j%IEnXOri@o?Cdc0pn;d^HYjXN6Dz&HIYZ9m5#S$OR
zIo_F>GX5^d<oF$)$wzWt!a4qCA6Id56zBMxW>d!R08CE5Wh6cQewX-I&gu88BwIh*
z@8|S8N2<S$bG*+pWk2T=IKQ6riJVX3d@|?sdrI<igUS8LqZT0egGQ?Go4?1Ul2bp-
zPqlt#jd!sAI<B|y8?2wk^%g#Z^^bDBm8Ze_Z*YC~g{3pTkGP%>XHfj~<4lU*%J*RZ
z9uvS>c&FNbNhnS+#p2{&_^DuiP{C1ey~{Nf;zJ-H5D*9m1Ox&C0fB%(Kp-Fx5C{ka
z1Ofs9fq+0jARrJB2nYlO0s;YnfIvVXAP^7;2m}NI0s(=5KtLcM5D*9m1Ox&C0fB%(
zKp-Fx5C{ka1Ofs9fq+0jARrJB2nYlO0s;YnfIvVXAP^7;{5L~jsrdu<Vm69%7w7rr
z58#X0&0Kz(^P`-<&h?L)Ka4MCZ=1jSE@nNP_i{do+fCy<%K1*tALo1z=ky<O;&=7z
zEay7!ZwKcGIiJk?wf<lFVphfFFy|XN-*58&W=R#p6$l6f1Ofs9fq+0jARrJB2nYlO
z0s;YnfIvVXAP^7;2m}NI0s(=5KtLcM5D*9m1Ox&C0fB%(Kp-Fx5C{ka1Ofs9fq+0j
zARrJB2nYlO0s;YnfIvVXAP^7;2m}NI0s(=5KtLcM5D*9m1Ox&Cf&Y>S_z`t0Xqbz!
zso-F_%gamU;IneIjFHSF{4euTe<iic)h7qmyWH@L{{-IE2zmtccZ|7vKt(R*z5{eO
zXfx=4f_@Duax?b~&^>PE*#g=JItqFR^gGa}pkfd6i~~&v-2qzSVcvT{kAhwXy#qQ2
zD)lmNjhE#;0@?@q8OWW-^6mg7LEp_|uCsZJ1(!8cN91^Xz7}h1&^0yMLRF1QTYF5C
zwT>E9Q$l)7>tMm<Rg0>F)k^{m4Z%q;nXg8arE*kmQM5t2Wr?<?OH^n5*Dh~JXw7m+
zsn!%(S6DDtu|UzQ<hT+DhczW0w<(syLThZ|ifF{FSd(iRh@hrH-3V{}l~o<O60cLV
zc@d>eiRzb9)yLM`s~0M2OX~o`I(35*nIF^I<N?*I)Ue*lg27;1mqTlUq1H9QW?794
ztZ7NeTG*-zE1K5Mf+3mWWWjJG)~-ZZu(=(O>&+-=3PNvQuc-t`8#ilH+CuHXJ`s;7
z3Yy27?h6Iw1QMb|6aZBhi$&^Ums2F-h|p}(7H?@#q6o#5c;lR;%PK<QDiQ+yJzTCn
zT(9T}ExJ;UBoJ@){CX>(K~ted*5$#KYDiaOQOvJFs-+2CZi*-YO_MLHtg$r@Vk-8_
zvnU^0B7{^05ma?7iu6QvYvP&nyk2QmG$k5B(2fb6HLi{73MMt5!*ol#f^t*|Hfd_O
zMG3Np=$`km4{yQ8LpQmCx+X{C?XkELW~CnX%aN!~PICp@5)oYut~cjXoZaYQ>1ojH
zoPm$iv+!}W7$52B__%;Y#w~|~v4q~9(Ak&biXIHb+S+l0fhRC;??sy%8S^zI)JT~7
z3^qk#O>7J|WV4iYa)iw@r_rL8K^~T?QB`M+n!-Zn6bq_R>T3s$<!JFE?l($*?cT8A
zA=e|W58cCtoy5w8WeyABa2PyOLK0(jkgxMF);-RWRW7q@EqOba{aoJ9Wr@pw%jMNv
zF2wRf{*qk&ESL9kc@dY7a5>E7ZZ6-?Wp<s_|3NPMx%`h@mbm;rmsfLn7?yIge=gt5
z<-J_~GMA5Vc^8+vx%@nr*?6n}<6QQ0*;Qcqm$-Zb<l$E{)|0gCZ{zZ5E(f@r=JHZ5
zvu&3BDlYrDyqe3!T&AR5gM2+?K9CsmbNRzDR{L5mvr(4(WiB@sTJlydZ{YgfT=tH(
z^xx)kJNN$*mlygh{Tp1~P-e+^)S2;Y=Jr0sM)95I{{1N9AF^UyH(Ihe_oxPsNYdxz
z6%IM<kQ0`SPwS!^)%s`2-*U*WT5_)ce|5-b9QMVC1XVfv&pPDgmJAc?YIex?JM8y3
z<R4lx?g;DpsY8C-A@?}s(+)Z9kX`VM;Bx*89P%}mj8@h)$sx~m$SWOkt3%%Ekat@$
zKCSCXhrHL4u}WCiVTb$+hkU{z|K1^AaLCva(fD)zXInC?tm`viH-NBZGxvnkz_IGG
z>7d&|m7vdq?f`uObSG#YXg+8GXd#I1u|=T8pe3NCpk<(}AKFje0=gCC>}xLSBv1fU
z1*!(sfT-`;pgEu`#X$Yo<G55TG{zaAnV_3MH-l~i(dB3M=iK0D%@wuw$DrLX`+T}Y
zo#PQ{w=f@O_6lozWP1z`e#F_$&8MBc0vpZT19o8j;HRx!Z+}?YcR9Iz=byZV{eRtm
zUwXUZ+|>NB+pjC_>i^iSnPY!7#Gzp5{n{0_djDTG;r-``|DeN#^Azx(b@!h=orKKe
zTrh4O?)b@pA3Y|mXq-PgdjPbbNhZnR@baenaLTN7w2<ixuu9citL1h%r0N|`18Z|U
z;Dj^DiXnGw4d`{U-dedHUT9yhwJBj0N6xHXRpOPhj$?0A0w={voY7*MvzIVVFcgJ(
zn5>K_(H1y#gc??wW$gEJ=ivbZ3@h=Frnb|G$JxwwkQvw{r=XrKc;c1KF>SHZ5wA4I
z?F^w+j@M~Qv%0~lh?<9S@&g1euUrw0t1VF_yd)NFaaw6gTWp<Dt1E3B&r1S@m6Rva
z3RDR<B$}+DJ00SXhHo0GIs)xD1&2|jELG#o6N}=UemUzJoH2D6FOlPBT5}B?uv*?}
z#Ql?QkWYkW20>qZ)JDU~Al%HQzcEHDBIZb}O{KFt&A}ENo>X0xBlxxfN9vYfVA;GH
z<d_$0qf3=GUanWxW^0#Ifj<rTvCewzvUBeYJA!Sl#LVppJ{c=T9rj`F>CBBy3ceCT
zT?BL&DRB*F?9JVNHydV&Lw%OCXDuRXQ~yJBMz{kqOPqXgy8P6iYiDjb`;BK$jotI~
z8_)gImpvoj|E(fV|4jMf>2sGS-g>UDefYC)&oA2Y{6F5jxcKPXPp{kcy}56eY<{e+
z@yTlsfA7GZ$?uiEw(h3J*n{b{Cr|Fuo?P_<<F<<n|Dvn#{-?UO{`Gh7iv8p_%}2by
zI(BT0d-+{?eVY$$-uS@3y|`=PSEv8wqONxzu6X^Uz15zV(o;6SQ+Ll_|Lz~Y`}1)R
z7F2B+^Tj!x+lOlZr|Rc*|7+dXZ*6+{(YyZe#l3Sz-MRC^hu@k1X~mB5D=Ti(zxw?j
z|Ed0mYTq_f-M`}86W@Aa`iG?}8%`FL_l}RZeakmuef6WGI_4g@w`=9*7ss!C<Z~aq
e^SkzW_Z@62JUZ*myYHR4CcXRmRL_N%9{4}oSt#-V

diff --git a/bridge/test_bridge.m b/bridge/test_bridge.m
new file mode 100644
index 0000000..9044542
--- /dev/null
+++ b/bridge/test_bridge.m
@@ -0,0 +1,305 @@
+// test_bridge.m — functional tests for ane_bridge API
+// Tests both BLOBFILE (upstream compat) and dynamic IOSurface (our approach)
+
+#import <Foundation/Foundation.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <mach/mach_time.h>
+#include "ane_bridge.h"
+
+static mach_timebase_info_data_t g_tb;
+static double ms(void) {
+    return (double)mach_absolute_time() * g_tb.numer / g_tb.denom / 1e6;
+}
+
+static int passed = 0, failed = 0;
+#define PASS(msg) do { printf("  [PASS] %s\n", msg); passed++; } while(0)
+#define FAIL(msg) do { printf("  [FAIL] %s\n", msg); failed++; } while(0)
+#define CHECK(cond, msg) do { if (cond) PASS(msg); else FAIL(msg); } while(0)
+
+// Correct MIL header — must match exactly what ANE compiler expects
+#define MIL_HDR \
+    "program(1.3)\n" \
+    "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, " \
+    "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " \
+    "{\"coremltools-version\", \"9.0\"}})]\n{\n"
+
+// Dynamic weight matmul: y[1,N,M] = W[1,N,K] @ x[1,K,M]
+// x declared as [1,K,1,M] (4D), W as [1,N,K] (3D dynamic weight)
+static const char *mil_dyn_matmul(int N, int K, int M) {
+    static char buf[2048];
+    snprintf(buf, sizeof(buf),
+        MIL_HDR
+        "    func main<ios18>(\n"
+        "        tensor<fp16, [1, %d, 1, %d]> x,\n"
+        "        tensor<fp16, [1, %d, %d]> W) {\n"
+        "        tensor<int32, [3]> sh = const()[name=string(\"sh\"), val=tensor<int32, [3]>([1,%d,%d])];\n"
+        "        tensor<fp16, [1,%d,%d]> x3 = reshape(shape=sh,x=x)[name=string(\"rx\")];\n"
+        "        bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"
+        "        tensor<fp16, [1,%d,%d]> out = matmul(transpose_x=bF,transpose_y=bF,x=W,y=x3)[name=string(\"mm\")];\n"
+        "    } -> (out);\n"
+        "}\n",
+        K, M,   // x: [1,K,1,M]
+        N, K,   // W: [1,N,K]
+        K, M,   // reshape sh
+        K, M,   // x3
+        N, M    // out
+    );
+    return buf;
+}
+
+// No-weight MIL: elementwise add x+x = 2x
+static const char *mil_scale2(int rows, int cols) {
+    static char buf[1024];
+    snprintf(buf, sizeof(buf),
+        MIL_HDR
+        "    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
+        "        tensor<fp16, [1,%d,1,%d]> out = add(x=x,y=x)[name=string(\"out\")];\n"
+        "    } -> (out);\n"
+        "}\n",
+        rows, cols, rows, cols
+    );
+    return buf;
+}
+
+static void fill_fp16(void *buf, float val, int count) {
+    uint16_t *p = (uint16_t *)buf;
+    _Float16 fval = (_Float16)val;
+    uint16_t bits;
+    memcpy(&bits, &fval, 2);
+    for (int i = 0; i < count; i++) p[i] = bits;
+}
+
+static float read_fp16_elem(const void *buf, int idx) {
+    const uint16_t *p = (const uint16_t *)buf;
+    _Float16 v;
+    memcpy(&v, &p[idx], 2);
+    return (float)v;
+}
+
+// --- Test 1: init ---
+static void test_init(void) {
+    printf("\n[1] ane_bridge_init\n");
+    int r = ane_bridge_init();
+    CHECK(r == 0, "init returns 0");
+}
+
+// --- Test 2: compile_dyn — compile once with dynamic weight ---
+static ANEKernelHandle *g_dyn_kern = NULL;
+static void test_compile_dyn(void) {
+    printf("\n[2] ane_bridge_compile_dyn (dynamic weight IOSurface)\n");
+
+    int N=64, K=64, M=64;
+    const char *mil = mil_dyn_matmul(N, K, M);
+    size_t mil_len = strlen(mil);
+
+    // x: [1,K,1,M] fp16, W: [1,N,K] fp16, out: [1,N,M] fp16
+    size_t in_sz  = (size_t)K * M * 2;
+    size_t w_sz   = (size_t)N * K * 2;
+    size_t out_sz = (size_t)N * M * 2;
+
+    double t0 = ms();
+    g_dyn_kern = ane_bridge_compile_dyn(mil, mil_len, 1, &in_sz, 1, &w_sz, out_sz);
+    double elapsed = ms() - t0;
+
+    CHECK(g_dyn_kern != NULL, "compile_dyn returns non-NULL handle");
+    printf("  Compile time: %.1fms\n", elapsed);
+}
+
+// --- Test 3: write_weight + eval — dynamic weights actually update ---
+static void test_dynamic_weight_update(void) {
+    printf("\n[3] Dynamic weight update (write W, eval, check output changes)\n");
+    if (!g_dyn_kern) { FAIL("no kernel from test 2"); return; }
+
+    int N=64, K=64, M=64;
+    size_t in_sz  = (size_t)K * M * 2;
+    size_t w_sz   = (size_t)N * K * 2;
+    size_t out_sz = (size_t)N * M * 2;
+
+    uint16_t *xbuf   = (uint16_t *)malloc(in_sz);
+    uint16_t *wbuf_A = (uint16_t *)calloc(N*K, 2);
+    uint16_t *wbuf_B = (uint16_t *)calloc(N*K, 2);
+    uint16_t *out    = (uint16_t *)malloc(out_sz);
+
+    // x = all 1.0
+    fill_fp16(xbuf, 1.0f, K*M);
+
+    // W_A = identity (scale=1): diagonal 1.0
+    _Float16 one = 1.0f; uint16_t one_bits; memcpy(&one_bits, &one, 2);
+    _Float16 two = 2.0f; uint16_t two_bits; memcpy(&two_bits, &two, 2);
+    for (int i = 0; i < N && i < K; i++) wbuf_A[i*K + i] = one_bits;
+    for (int i = 0; i < N && i < K; i++) wbuf_B[i*K + i] = two_bits;
+
+    // Eval A: W = identity
+    ane_bridge_write_input(g_dyn_kern, 0, xbuf, in_sz);
+    ane_bridge_write_weight(g_dyn_kern, 0, wbuf_A, w_sz);
+    bool ok = ane_bridge_eval(g_dyn_kern);
+    CHECK(ok, "eval A succeeds");
+    ane_bridge_read_output(g_dyn_kern, 0, out, out_sz);
+    float sum_A = 0;
+    for (int i = 0; i < N*M; i++) sum_A += read_fp16_elem(out, i);
+
+    // Eval B: W = 2x identity, NO recompile
+    double t0 = ms();
+    ane_bridge_write_weight(g_dyn_kern, 0, wbuf_B, w_sz);
+    double write_ms = ms() - t0;
+
+    t0 = ms();
+    ok = ane_bridge_eval(g_dyn_kern);
+    double eval_ms = ms() - t0;
+
+    CHECK(ok, "eval B succeeds after weight update");
+    ane_bridge_read_output(g_dyn_kern, 0, out, out_sz);
+    float sum_B = 0;
+    for (int i = 0; i < N*M; i++) sum_B += read_fp16_elem(out, i);
+
+    printf("  sum(out) W=identity: %.1f  W=2x: %.1f  ratio: %.2f (expect ~2.0)\n",
+           sum_A, sum_B, sum_B / (sum_A + 1e-9f));
+    CHECK(fabsf(sum_B / (sum_A + 1e-9f) - 2.0f) < 0.1f, "output doubled after weight update");
+    printf("  write_weight: %.3fms  eval: %.2fms\n", write_ms, eval_ms);
+
+    free(xbuf); free(wbuf_A); free(wbuf_B); free(out);
+}
+
+// --- Test 4: write_weight_f32 ---
+static void test_write_weight_f32(void) {
+    printf("\n[4] ane_bridge_write_weight_f32 (fp32 -> fp16 conversion)\n");
+    if (!g_dyn_kern) { FAIL("no kernel"); return; }
+
+    int N=64, K=64, M=64;
+    float *w_fp32  = (float *)calloc(N*K, 4);
+    uint16_t *out1 = (uint16_t *)malloc((size_t)N*M*2);
+    uint16_t *out2 = (uint16_t *)malloc((size_t)N*M*2);
+    uint16_t *xbuf = (uint16_t *)malloc((size_t)K*M*2);
+    fill_fp16(xbuf, 1.0f, K*M);
+
+    // 3x identity in fp32
+    for (int i = 0; i < N && i < K; i++) w_fp32[i*K + i] = 3.0f;
+    ane_bridge_write_input(g_dyn_kern, 0, xbuf, (size_t)K*M*2);
+    ane_bridge_write_weight_f32(g_dyn_kern, 0, w_fp32, (size_t)N*K);
+    bool ok = ane_bridge_eval(g_dyn_kern);
+    CHECK(ok, "eval with fp32-written weight succeeds");
+    ane_bridge_read_output(g_dyn_kern, 0, out1, (size_t)N*M*2);
+    float sum = 0;
+    for (int i = 0; i < N*M; i++) sum += read_fp16_elem(out1, i);
+    float expected = 3.0f * N; // 3x identity: each row sums x (all 1s) scaled by 3
+    // Actually identity matmul with x=all-1s: row i of output = row i of W dotted with x
+    // row i of identity (scaled 3) = 3 at position i, 0 elsewhere. So out[i][t] = 3*x[i][t] = 3
+    printf("  sum(out) with 3x identity (fp32 write): %.1f\n", sum);
+    CHECK(fabsf(sum - 3.0f * N * M) < N*M*0.1f, "fp32 weight write produces correct output");
+
+    free(w_fp32); free(out1); free(out2); free(xbuf);
+}
+
+// --- Test 5: copy_io ---
+static void test_copy_io(void) {
+    printf("\n[5] ane_bridge_copy_io (direct IOSurface-to-IOSurface)\n");
+
+    // Compile a simple no-weight kernel: scale x by 2
+    int R=32, C=32;
+    const char *mil = mil_scale2(R, C);
+    size_t sz = (size_t)R * C * 2;
+
+    ANEKernelHandle *k1 = ane_bridge_compile_dyn(mil, strlen(mil), 1, &sz, 0, NULL, sz);
+    ANEKernelHandle *k2 = ane_bridge_compile_dyn(mil, strlen(mil), 1, &sz, 0, NULL, sz);
+    CHECK(k1 != NULL && k2 != NULL, "two scale2 kernels compiled");
+    if (!k1 || !k2) { ane_bridge_free(k1); ane_bridge_free(k2); return; }
+
+    // k1 input = 1.0, k1 out = 2.0, copy to k2 in, k2 out = 4.0
+    uint16_t *buf = (uint16_t *)malloc(sz);
+    uint16_t *out = (uint16_t *)malloc(sz);
+    fill_fp16(buf, 1.0f, R*C);
+
+    ane_bridge_write_input(k1, 0, buf, sz);
+    ane_bridge_eval(k1);
+    ane_bridge_copy_io(k1, 0, k2, 0);  // k2 input = k1 output (should be 2.0)
+    ane_bridge_eval(k2);
+    ane_bridge_read_output(k2, 0, out, sz);
+
+    float val = read_fp16_elem(out, 0);
+    printf("  input=1.0 -> k1(x2) -> copy -> k2(x2) -> output=%.1f (expect 4.0)\n", val);
+    CHECK(fabsf(val - 4.0f) < 0.2f, "copy_io chains two kernels without CPU round-trip");
+
+    free(buf); free(out);
+    ane_bridge_free(k1);
+    ane_bridge_free(k2);
+}
+
+// --- Test 6: begin/end realtime ---
+static void test_realtime(void) {
+    printf("\n[6] ane_bridge_begin/end_realtime\n");
+    if (!g_dyn_kern) { FAIL("no kernel"); return; }
+
+    // Just verify they don't crash. Jitter improvement only visible over many samples.
+    ane_bridge_begin_realtime();
+    bool ok = ane_bridge_eval(g_dyn_kern);
+    ane_bridge_end_realtime();
+    CHECK(ok, "eval inside realtime task succeeds");
+    printf("  (p99 jitter improvement requires statistical measurement — see test_realtime_task.m)\n");
+}
+
+// --- Test 7: compile cache ---
+static void test_compile_cache(void) {
+    printf("\n[7] Compile cache (cache files written to ~/.ane_cache/)\n");
+
+    int N=32, M=32;
+    const char *mil = mil_scale2(N, M);
+    size_t sz = (size_t)N*M*2;
+
+    ANEKernelHandle *k = ane_bridge_compile_dyn(mil, strlen(mil), 1, &sz, 0, NULL, sz);
+    CHECK(k != NULL, "kernel compiled for cache test");
+    if (!k) return;
+
+    // Verify cache files exist on disk
+    // We need the hex ID — compile a fresh model to get it, then check cache
+    NSString *home = NSHomeDirectory();
+    NSString *cacheRoot = [home stringByAppendingPathComponent:@".ane_cache"];
+    NSFileManager *fm = [NSFileManager defaultManager];
+    // Dynamic-weight models produce net.plist only (no data file)
+    NSArray *entries = [fm contentsOfDirectoryAtPath:cacheRoot error:nil];
+    BOOL found_plist = NO;
+    for (NSString *entry in entries) {
+        NSString *plistPath = [[cacheRoot stringByAppendingPathComponent:entry]
+                                stringByAppendingPathComponent:@"net.plist"];
+        if ([fm fileExistsAtPath:plistPath]) { found_plist = YES; break; }
+    }
+    CHECK(found_plist, "compiled kernel cached to ~/.ane_cache/");
+
+    // Free k first, then recompile — should hit cache (load only, no compile)
+    ane_bridge_free(k);
+    int before = ane_bridge_get_compile_count();
+    ANEKernelHandle *k2 = ane_bridge_compile_dyn(mil, strlen(mil), 1, &sz, 0, NULL, sz);
+    int after = ane_bridge_get_compile_count();
+    CHECK(k2 != NULL, "cache hit returns valid kernel");
+    CHECK(after == before, "cache hit does not increment compile count");
+    ane_bridge_free(k2);
+}
+
+// --- Test 8: free ---
+static void test_free(void) {
+    printf("\n[8] ane_bridge_free\n");
+    ane_bridge_free(g_dyn_kern);
+    g_dyn_kern = NULL;
+    PASS("free does not crash");
+}
+
+int main(void) {
+    @autoreleasepool {
+        mach_timebase_info(&g_tb);
+        printf("=== ane_bridge test suite ===\n");
+
+        test_init();
+        test_compile_dyn();
+        test_dynamic_weight_update();
+        test_write_weight_f32();
+        test_copy_io();
+        test_realtime();
+        test_compile_cache();
+        test_free();
+
+        printf("\n=== Results: %d passed, %d failed ===\n", passed, failed);
+        return failed > 0 ? 1 : 0;
+    }
+}