// test_bridge.m — functional tests for ane_bridge API // Tests both BLOBFILE (upstream compat) and dynamic IOSurface (our approach) #import #include #include #include #include #include #include "ane_bridge.h" static mach_timebase_info_data_t g_tb; static double ms(void) { return (double)mach_absolute_time() * g_tb.numer / g_tb.denom / 1e6; } static int passed = 0, failed = 0; #define PASS(msg) do { printf(" [PASS] %s\n", msg); passed++; } while(0) #define FAIL(msg) do { printf(" [FAIL] %s\n", msg); failed++; } while(0) #define CHECK(cond, msg) do { if (cond) PASS(msg); else FAIL(msg); } while(0) // Correct MIL header — must match exactly what ANE compiler expects #define MIL_HDR \ "program(1.3)\n" \ "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " \ "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " \ "{\"coremltools-version\", \"9.0\"}})]\n{\n" // Dynamic weight matmul: y[1,N,M] = W[1,N,K] @ x[1,K,M] // x declared as [1,K,1,M] (4D), W as [1,N,K] (3D dynamic weight) static const char *mil_dyn_matmul(int N, int K, int M) { static char buf[2048]; snprintf(buf, sizeof(buf), MIL_HDR " func main(\n" " tensor x,\n" " tensor W) {\n" " tensor sh = const()[name=string(\"sh\"), val=tensor([1,%d,%d])];\n" " tensor x3 = reshape(shape=sh,x=x)[name=string(\"rx\")];\n" " bool bF = const()[name=string(\"bF\"), val=bool(false)];\n" " tensor out = matmul(transpose_x=bF,transpose_y=bF,x=W,y=x3)[name=string(\"mm\")];\n" " } -> (out);\n" "}\n", K, M, // x: [1,K,1,M] N, K, // W: [1,N,K] K, M, // reshape sh K, M, // x3 N, M // out ); return buf; } // No-weight MIL: elementwise add x+x = 2x static const char *mil_scale2(int rows, int cols) { static char buf[1024]; snprintf(buf, sizeof(buf), MIL_HDR " func main(tensor x) {\n" " tensor out = add(x=x,y=x)[name=string(\"out\")];\n" " } -> (out);\n" "}\n", rows, cols, rows, cols ); return buf; } static void fill_fp16(void *buf, float val, int count) { uint16_t *p = (uint16_t *)buf; _Float16 fval = (_Float16)val; uint16_t bits; memcpy(&bits, &fval, 2); for (int i = 0; i < count; i++) p[i] = bits; } static float read_fp16_elem(const void *buf, int idx) { const uint16_t *p = (const uint16_t *)buf; _Float16 v; memcpy(&v, &p[idx], 2); return (float)v; } // --- Test 1: init --- static void test_init(void) { printf("\n[1] ane_bridge_init\n"); int r = ane_bridge_init(); CHECK(r == 0, "init returns 0"); } // --- Test 2: compile_dyn — compile once with dynamic weight --- static ANEKernelHandle *g_dyn_kern = NULL; static void test_compile_dyn(void) { printf("\n[2] ane_bridge_compile_dyn (dynamic weight IOSurface)\n"); int N=64, K=64, M=64; const char *mil = mil_dyn_matmul(N, K, M); size_t mil_len = strlen(mil); // x: [1,K,1,M] fp16, W: [1,N,K] fp16, out: [1,N,M] fp16 size_t in_sz = (size_t)K * M * 2; size_t w_sz = (size_t)N * K * 2; size_t out_sz = (size_t)N * M * 2; double t0 = ms(); g_dyn_kern = ane_bridge_compile_dyn(mil, mil_len, 1, &in_sz, 1, &w_sz, out_sz); double elapsed = ms() - t0; CHECK(g_dyn_kern != NULL, "compile_dyn returns non-NULL handle"); printf(" Compile time: %.1fms\n", elapsed); } // --- Test 3: write_weight + eval — dynamic weights actually update --- static void test_dynamic_weight_update(void) { printf("\n[3] Dynamic weight update (write W, eval, check output changes)\n"); if (!g_dyn_kern) { FAIL("no kernel from test 2"); return; } int N=64, K=64, M=64; size_t in_sz = (size_t)K * M * 2; size_t w_sz = (size_t)N * K * 2; size_t out_sz = (size_t)N * M * 2; uint16_t *xbuf = (uint16_t *)malloc(in_sz); uint16_t *wbuf_A = (uint16_t *)calloc(N*K, 2); uint16_t *wbuf_B = (uint16_t *)calloc(N*K, 2); uint16_t *out = (uint16_t *)malloc(out_sz); // x = all 1.0 fill_fp16(xbuf, 1.0f, K*M); // W_A = identity (scale=1): diagonal 1.0 _Float16 one = 1.0f; uint16_t one_bits; memcpy(&one_bits, &one, 2); _Float16 two = 2.0f; uint16_t two_bits; memcpy(&two_bits, &two, 2); for (int i = 0; i < N && i < K; i++) wbuf_A[i*K + i] = one_bits; for (int i = 0; i < N && i < K; i++) wbuf_B[i*K + i] = two_bits; // Eval A: W = identity ane_bridge_write_input(g_dyn_kern, 0, xbuf, in_sz); ane_bridge_write_weight(g_dyn_kern, 0, wbuf_A, w_sz); bool ok = ane_bridge_eval(g_dyn_kern); CHECK(ok, "eval A succeeds"); ane_bridge_read_output(g_dyn_kern, 0, out, out_sz); float sum_A = 0; for (int i = 0; i < N*M; i++) sum_A += read_fp16_elem(out, i); // Eval B: W = 2x identity, NO recompile double t0 = ms(); ane_bridge_write_weight(g_dyn_kern, 0, wbuf_B, w_sz); double write_ms = ms() - t0; t0 = ms(); ok = ane_bridge_eval(g_dyn_kern); double eval_ms = ms() - t0; CHECK(ok, "eval B succeeds after weight update"); ane_bridge_read_output(g_dyn_kern, 0, out, out_sz); float sum_B = 0; for (int i = 0; i < N*M; i++) sum_B += read_fp16_elem(out, i); printf(" sum(out) W=identity: %.1f W=2x: %.1f ratio: %.2f (expect ~2.0)\n", sum_A, sum_B, sum_B / (sum_A + 1e-9f)); CHECK(fabsf(sum_B / (sum_A + 1e-9f) - 2.0f) < 0.1f, "output doubled after weight update"); printf(" write_weight: %.3fms eval: %.2fms\n", write_ms, eval_ms); free(xbuf); free(wbuf_A); free(wbuf_B); free(out); } // --- Test 4: write_weight_f32 --- static void test_write_weight_f32(void) { printf("\n[4] ane_bridge_write_weight_f32 (fp32 -> fp16 conversion)\n"); if (!g_dyn_kern) { FAIL("no kernel"); return; } int N=64, K=64, M=64; float *w_fp32 = (float *)calloc(N*K, 4); uint16_t *out1 = (uint16_t *)malloc((size_t)N*M*2); uint16_t *out2 = (uint16_t *)malloc((size_t)N*M*2); uint16_t *xbuf = (uint16_t *)malloc((size_t)K*M*2); fill_fp16(xbuf, 1.0f, K*M); // 3x identity in fp32 for (int i = 0; i < N && i < K; i++) w_fp32[i*K + i] = 3.0f; ane_bridge_write_input(g_dyn_kern, 0, xbuf, (size_t)K*M*2); ane_bridge_write_weight_f32(g_dyn_kern, 0, w_fp32, (size_t)N*K); bool ok = ane_bridge_eval(g_dyn_kern); CHECK(ok, "eval with fp32-written weight succeeds"); ane_bridge_read_output(g_dyn_kern, 0, out1, (size_t)N*M*2); float sum = 0; for (int i = 0; i < N*M; i++) sum += read_fp16_elem(out1, i); float expected = 3.0f * N; // 3x identity: each row sums x (all 1s) scaled by 3 // Actually identity matmul with x=all-1s: row i of output = row i of W dotted with x // row i of identity (scaled 3) = 3 at position i, 0 elsewhere. So out[i][t] = 3*x[i][t] = 3 printf(" sum(out) with 3x identity (fp32 write): %.1f\n", sum); CHECK(fabsf(sum - 3.0f * N * M) < N*M*0.1f, "fp32 weight write produces correct output"); free(w_fp32); free(out1); free(out2); free(xbuf); } // --- Test 5: copy_io --- static void test_copy_io(void) { printf("\n[5] ane_bridge_copy_io (direct IOSurface-to-IOSurface)\n"); // Compile a simple no-weight kernel: scale x by 2 int R=32, C=32; const char *mil = mil_scale2(R, C); size_t sz = (size_t)R * C * 2; ANEKernelHandle *k1 = ane_bridge_compile_dyn(mil, strlen(mil), 1, &sz, 0, NULL, sz); ANEKernelHandle *k2 = ane_bridge_compile_dyn(mil, strlen(mil), 1, &sz, 0, NULL, sz); CHECK(k1 != NULL && k2 != NULL, "two scale2 kernels compiled"); if (!k1 || !k2) { ane_bridge_free(k1); ane_bridge_free(k2); return; } // k1 input = 1.0, k1 out = 2.0, copy to k2 in, k2 out = 4.0 uint16_t *buf = (uint16_t *)malloc(sz); uint16_t *out = (uint16_t *)malloc(sz); fill_fp16(buf, 1.0f, R*C); ane_bridge_write_input(k1, 0, buf, sz); ane_bridge_eval(k1); ane_bridge_copy_io(k1, 0, k2, 0); // k2 input = k1 output (should be 2.0) ane_bridge_eval(k2); ane_bridge_read_output(k2, 0, out, sz); float val = read_fp16_elem(out, 0); printf(" input=1.0 -> k1(x2) -> copy -> k2(x2) -> output=%.1f (expect 4.0)\n", val); CHECK(fabsf(val - 4.0f) < 0.2f, "copy_io chains two kernels without CPU round-trip"); free(buf); free(out); ane_bridge_free(k1); ane_bridge_free(k2); } // --- Test 6: begin/end realtime --- static void test_realtime(void) { printf("\n[6] ane_bridge_begin/end_realtime\n"); if (!g_dyn_kern) { FAIL("no kernel"); return; } // Just verify they don't crash. Jitter improvement only visible over many samples. ane_bridge_begin_realtime(); bool ok = ane_bridge_eval(g_dyn_kern); ane_bridge_end_realtime(); CHECK(ok, "eval inside realtime task succeeds"); printf(" (p99 jitter improvement requires statistical measurement — see test_realtime_task.m)\n"); } // --- Test 7: compile cache --- static void test_compile_cache(void) { printf("\n[7] Compile cache (cache files written to ~/.ane_cache/)\n"); int N=32, M=32; const char *mil = mil_scale2(N, M); size_t sz = (size_t)N*M*2; ANEKernelHandle *k = ane_bridge_compile_dyn(mil, strlen(mil), 1, &sz, 0, NULL, sz); CHECK(k != NULL, "kernel compiled for cache test"); if (!k) return; // Verify cache files exist on disk // We need the hex ID — compile a fresh model to get it, then check cache NSString *home = NSHomeDirectory(); NSString *cacheRoot = [home stringByAppendingPathComponent:@".ane_cache"]; NSFileManager *fm = [NSFileManager defaultManager]; // Dynamic-weight models produce net.plist only (no data file) NSArray *entries = [fm contentsOfDirectoryAtPath:cacheRoot error:nil]; BOOL found_plist = NO; for (NSString *entry in entries) { NSString *plistPath = [[cacheRoot stringByAppendingPathComponent:entry] stringByAppendingPathComponent:@"net.plist"]; if ([fm fileExistsAtPath:plistPath]) { found_plist = YES; break; } } CHECK(found_plist, "compiled kernel cached to ~/.ane_cache/"); // Free k first, then recompile — should hit cache (load only, no compile) ane_bridge_free(k); int before = ane_bridge_get_compile_count(); ANEKernelHandle *k2 = ane_bridge_compile_dyn(mil, strlen(mil), 1, &sz, 0, NULL, sz); int after = ane_bridge_get_compile_count(); CHECK(k2 != NULL, "cache hit returns valid kernel"); CHECK(after == before, "cache hit does not increment compile count"); ane_bridge_free(k2); } // --- Test 8: free --- static void test_free(void) { printf("\n[8] ane_bridge_free\n"); ane_bridge_free(g_dyn_kern); g_dyn_kern = NULL; PASS("free does not crash"); } int main(void) { @autoreleasepool { mach_timebase_info(&g_tb); printf("=== ane_bridge test suite ===\n"); test_init(); test_compile_dyn(); test_dynamic_weight_update(); test_write_weight_f32(); test_copy_io(); test_realtime(); test_compile_cache(); test_free(); printf("\n=== Results: %d passed, %d failed ===\n", passed, failed); return failed > 0 ? 1 : 0; } }