// tiny_train.m — Train a 2-layer model on ANE (forward AND backward)
//   y = W2 @ relu(W1 @ x), MSE loss, SGD update
//   Forward:     ANE conv with baked weights
//   Backward dx: ANE conv with transposed baked weights
//   Backward dW: CPU (outer product, memory-bound)
//
// NOTE(review): the header names below were reconstructed from the symbols this
// file uses (objc_msgSend, dlopen, IOSurface*, mach_absolute_time, sinf); the
// bare `#import` lines previously had no targets. Confirm against the build.
#import <Foundation/Foundation.h>
#import <objc/runtime.h>
#import <objc/message.h>
#import <dlfcn.h>
#import <IOSurface/IOSurface.h>
#import <mach/mach_time.h>
#include <math.h>

// Private AppleNeuralEngine classes, resolved at runtime by ane_init().
static Class g_D, g_I, g_AR, g_AIO;

// QoS value passed to every compile/load/evaluate/unload call.
// NOTE(review): 21 equals QOS_CLASS_DEFAULT numerically; presumably the same scale — confirm.
static const unsigned int kANEQoS = 21;

// Size of the header that precedes the fp16 payload in a weight blob.
enum { kBlobHeaderBytes = 128 };

// Loads the private framework and looks up the classes used below.
// Failures are reported on stderr; compile_kern_with_blob() then returns NULL
// and main() falls back to the CPU path.
static void ane_init(void) {
    void *handle = dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
    if (!handle) {
        const char *why = dlerror();
        fprintf(stderr, "[ANE] dlopen failed: %s\n", why ? why : "unknown error");
    }
    g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
    g_I  = NSClassFromString(@"_ANEInMemoryModel");
    g_AR = NSClassFromString(@"_ANERequest");
    g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
    if (!g_D || !g_I || !g_AR || !g_AIO)
        fprintf(stderr, "[ANE] required private classes not found\n");
}

// Creates a 1-row IOSurface of `bytes` bytes (1 byte per element).
// Returns NULL if IOSurfaceCreate fails; the caller owns the returned surface.
static IOSurfaceRef make_surface(size_t bytes) {
    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
        (id)kIOSurfaceWidth:@(bytes),
        (id)kIOSurfaceHeight:@1,
        (id)kIOSurfaceBytesPerElement:@1,
        (id)kIOSurfaceBytesPerRow:@(bytes),
        (id)kIOSurfaceAllocSize:@(bytes),
        (id)kIOSurfacePixelFormat:@0});
}

// Allocates a zeroed blob (header + rows*cols fp16 values) and fills in the header.
// Returns NULL on bad dimensions or allocation failure; *total_out receives the byte size.
static uint8_t *alloc_blob(int rows, int cols, size_t *total_out) {
    if (rows <= 0 || cols <= 0) return NULL;
    size_t wsize = (size_t)rows * (size_t)cols * sizeof(_Float16);
    size_t total = kBlobHeaderBytes + wsize;
    uint8_t *buf = (uint8_t*)calloc(total, 1);
    if (!buf) return NULL;
    buf[0] = 0x01;
    buf[4] = 0x02;
    // Bytes 64..67 spell 0xDEADBEEF in little-endian order.
    buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE;
    buf[68] = 0x01;
    *(uint32_t*)(buf+72) = (uint32_t)wsize;          // payload size in bytes
    *(uint32_t*)(buf+80) = kBlobHeaderBytes;         // presumably the payload offset — TODO confirm
    *total_out = total;
    return buf;
}

// Builds a weight blob holding w[rows, cols] (row-major) converted to fp16.
// Returns nil on allocation failure.
static NSData *build_blob(const float *w, int rows, int cols) {
    size_t total = 0;
    uint8_t *buf = alloc_blob(rows, cols, &total);
    if (!buf) return nil;
    _Float16 *fp16 = (_Float16*)(buf + kBlobHeaderBytes);
    size_t count = (size_t)rows * (size_t)cols;
    for (size_t i = 0; i < count; i++) fp16[i] = (_Float16)w[i];
    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
}

// Build blob with TRANSPOSED weights: W[rows,cols] → W^T[cols,rows]
// Returns nil on allocation failure.
static NSData *build_blob_transposed(const float *w, int rows, int cols) {
    size_t total = 0;
    uint8_t *buf = alloc_blob(rows, cols, &total);
    if (!buf) return nil;
    _Float16 *fp16 = (_Float16*)(buf + kBlobHeaderBytes);
    for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
            fp16[(size_t)j * rows + i] = (_Float16)w[(size_t)i * cols + j]; // transpose
    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
}

static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly

// Generates the MIL program text for a 1x1 conv mapping in_ch → out_ch channels
// over `sp` spatial positions. With g_fp16_io set, input/output are fp16;
// otherwise they are fp32 with cast ops around the fp16 conv.
//
// NOTE(review): the tensor<...> type parameters and the <ios18> opset tag were
// reconstructed — the previous text had no format specifiers for the 8/12
// integer arguments passed below. Shapes follow the argument order; confirm the
// opset tag and scalar dtypes (uint64 offset, int32 conv params) against a
// known-good model.
static NSString *gen_conv_mil(int in_ch, int out_ch, int sp) {
    if (g_fp16_io) {
        return [NSString stringWithFormat:
            @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
            " func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
            " tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
            "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
            " tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
            " tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
            " tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
            " tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
            " tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
            " tensor<fp16, [1, %d, 1, %d]> y = conv(dilations = dl, groups = gr, pad = pd, "
            "pad_type = pt, strides = st, weight = W, x = x)[name = tensor<string, []>(\"cv\")];\n"
            " } -> (y);\n}\n",
            in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp];
    }
    return [NSString stringWithFormat:
        @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
        " func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
        " tensor<string, []> d1 = const()[name = tensor<string, []>(\"d1\"), val = tensor<string, []>(\"fp16\")];\n"
        " tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = tensor<string, []>(\"cx\")];\n"
        " tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
        " tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
        " tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
        " tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
        " tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
        " tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
        " tensor<fp16, [1, %d, 1, %d]> y16 = conv(dilations = dl, groups = gr, pad = pd, "
        "pad_type = pt, strides = st, weight = W, x = x16)[name = tensor<string, []>(\"cv\")];\n"
        " tensor<string, []> d2 = const()[name = tensor<string, []>(\"d2\"), val = tensor<string, []>(\"fp32\")];\n"
        " tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = y16)[name = tensor<string, []>(\"co\")];\n"
        " } -> (y);\n}\n",
        in_ch, sp, in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp, out_ch, sp];
}

typedef struct {
    void *model;            // CFBridgingRetain'd _ANEInMemoryModel
    IOSurfaceRef ioIn, ioOut;
    void *request;          // CFBridgingRetain'd _ANERequest
    void *tmpDir;           // CFBridgingRetain'd NSString
    int fp16_io;            // I/O dtype this kernel was compiled with (1 = fp16, 0 = fp32)
} Kern;

// Prints a failed ANE stage and its NSError (if any) to stderr.
static void log_ane_error(const char *stage, NSError *err) {
    fprintf(stderr, "[ANE] %s failed: %s\n", stage,
            err ? err.localizedDescription.UTF8String : "unknown error");
}

// Best-effort removal of a kernel's temporary directory.
static void remove_tmp_dir(NSString *path) {
    if (path) [[NSFileManager defaultManager] removeItemAtPath:path error:nil];
}

// Tears down a model that was loaded but could not be turned into a Kern.
static void discard_partial_kern(id mdl, IOSurfaceRef ioI, IOSurfaceRef ioO, NSString *td) {
    NSError *e = nil;
    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), kANEQoS, &e);
    if (ioI) CFRelease(ioI);
    if (ioO) CFRelease(ioO);
    remove_tmp_dir(td);
}

// Compiles and loads a conv kernel (in_ch → out_ch over sp positions) whose
// weights come from `blob`, and allocates its input/output surfaces and request.
//
// If the first compile fails while fp32 I/O is selected, switches the global
// g_fp16_io flag and retries once with fp16 I/O.
//
// Returns a heap-allocated Kern to be released with free_kern(), or NULL on any
// failure (temp files and partially created resources are cleaned up).
static Kern *compile_kern_with_blob(NSData *blob, int in_ch, int out_ch, int sp) {
    if (!blob || !g_D || !g_I || !g_AR || !g_AIO) return NULL;
    if (in_ch <= 0 || out_ch <= 0 || sp <= 0) return NULL;

    // Capture the dtype used for this attempt so MIL text, surface sizes and the
    // Kern record all agree even if the global flag changes later.
    int fp16_io = g_fp16_io;

    NSString *mil = gen_conv_mil(in_ch, out_ch, sp);
    NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding];
    NSDictionary *wd = @{@"@model_path/weights/weight.bin":@{@"offset":@0,@"data":blob}};

    id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), milData, wd, nil);
    if (!desc) return NULL;
    id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
    if (!mdl) return NULL;
    NSString *hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
    if (!hx) return NULL;

    // Mirror the model on disk under a per-model temp directory.
    // NOTE(review): presumably the framework reads these files during compile — confirm.
    NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
    NSFileManager *fm = [NSFileManager defaultManager];
    [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];
    [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
    [blob writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];

    NSError *e = nil;
    BOOL compiled = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), kANEQoS, @{}, &e);
    if (!compiled) {
        remove_tmp_dir(td);
        if (!g_fp16_io) {
            printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
            g_fp16_io = 1;
            return compile_kern_with_blob(blob, in_ch, out_ch, sp);
        }
        log_ane_error("compile", e);
        return NULL;
    }

    BOOL loaded = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), kANEQoS, @{}, &e);
    if (!loaded) {
        log_ane_error("load", e);
        remove_tmp_dir(td);
        return NULL;
    }

    size_t bpe = fp16_io ? sizeof(_Float16) : sizeof(float);
    size_t inB = (size_t)in_ch * (size_t)sp * bpe;
    size_t outB = (size_t)out_ch * (size_t)sp * bpe;
    IOSurfaceRef ioI = make_surface(inB), ioO = make_surface(outB);

    // Array literals throw on nil, so every wrapper is checked before use.
    id wI = ioI ? ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI) : nil;
    id wO = ioO ? ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO) : nil;
    id req = nil;
    if (wI && wO) {
        req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
            @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
            @[wI], @[@0], @[wO], @[@0], nil, nil, @0);
    }
    Kern *k = req ? (Kern*)calloc(1, sizeof(Kern)) : NULL;
    if (!k) {
        fprintf(stderr, "[ANE] failed to set up I/O surfaces or request\n");
        discard_partial_kern(mdl, ioI, ioO, td);
        return NULL;
    }

    k->model = (void*)CFBridgingRetain(mdl);
    k->ioIn = ioI;
    k->ioOut = ioO;
    k->request = (void*)CFBridgingRetain(req);
    k->tmpDir = (void*)CFBridgingRetain(td);
    k->fp16_io = fp16_io;
    return k;
}

// Unloads the model, releases surfaces and retained objects, removes the
// temp directory, and frees the Kern. Accepts NULL.
static void free_kern(Kern *k) {
    if (!k) return;
    id mdl = (__bridge id)k->model;
    NSError *e = nil;
    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), kANEQoS, &e);
    CFRelease(k->ioIn);
    CFRelease(k->ioOut);
    NSString *td = (__bridge id)k->tmpDir;
    [[NSFileManager defaultManager] removeItemAtPath:td error:nil];
    CFRelease(k->model);
    CFRelease(k->request);
    CFRelease(k->tmpDir);
    free(k);
}

// ANE eval: input [S, in_ch] row-major ↔ [in_ch, S] channels-first
//
// Copies `in` (sp rows of in_ch floats) into the kernel's input surface in
// channels-first order, runs the request, and copies the output surface back
// into `out` (sp rows of out_ch floats). The surface element type follows the
// dtype the kernel was compiled with, not the current global flag.
// An evaluation failure is logged to stderr; `out` is still filled from the
// output surface in that case.
static void ane_eval(Kern *k, const float *in, float *out, int in_ch, int out_ch, int sp) {
    if (!k) return;

    IOSurfaceLock(k->ioIn, 0, NULL);
    void *base_in = IOSurfaceGetBaseAddress(k->ioIn);
    if (k->fp16_io) {
        _Float16 *dst = (_Float16*)base_in;
        for (int t = 0; t < sp; t++)
            for (int c = 0; c < in_ch; c++)
                dst[c*sp + t] = (_Float16)in[t*in_ch + c];
    } else {
        float *dst = (float*)base_in;
        for (int t = 0; t < sp; t++)
            for (int c = 0; c < in_ch; c++)
                dst[c*sp + t] = in[t*in_ch + c];
    }
    IOSurfaceUnlock(k->ioIn, 0, NULL);

    NSError *e = nil;
    id mdl = (__bridge id)k->model;
    id req = (__bridge id)k->request;
    BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
        mdl, @selector(evaluateWithQoS:options:request:error:), kANEQoS, @{}, req, &e);
    if (!ok) log_ane_error("evaluate", e);

    IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
    void *base_out = IOSurfaceGetBaseAddress(k->ioOut);
    if (k->fp16_io) {
        _Float16 *src = (_Float16*)base_out;
        for (int t = 0; t < sp; t++)
            for (int c = 0; c < out_ch; c++)
                out[t*out_ch + c] = (float)src[c*sp + t];
    } else {
        float *src = (float*)base_out;
        for (int t = 0; t < sp; t++)
            for (int c = 0; c < out_ch; c++)
                out[t*out_ch + c] = src[c*sp + t];
    }
    IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
}

// Trains y = W2 @ relu(W1 @ x) to reproduce x (target == input) with SGD.
// Kernels are recompiled with the current weights every `recompile_every` steps;
// if any compile fails, the remaining steps run on the CPU path.
int main(int argc, char *argv[]) {
    @autoreleasepool {
        ane_init();
        mach_timebase_info_data_t tb;
        mach_timebase_info(&tb);

        int D = 64, H = 128, S = 16;
        int steps = 25;            // 4 kernels × 25 = 100 compiles, under 119 limit
        float lr = 0.5f;
        int recompile_every = 1;   // recompile every step for correct gradients

        float *W1 = (float*)malloc(H * D * sizeof(float));
        float *W2 = (float*)malloc(D * H * sizeof(float));
        float *x = (float*)calloc(S * D, sizeof(float));
        float *y_target = (float*)calloc(S * D, sizeof(float));
        float *h = (float*)malloc(S * H * sizeof(float));
        float *h_relu = (float*)malloc(S * H * sizeof(float));
        float *y = (float*)malloc(S * D * sizeof(float));
        float *dy = (float*)malloc(S * D * sizeof(float));
        float *dh_relu = (float*)malloc(S * H * sizeof(float));
        float *dh = (float*)malloc(S * H * sizeof(float));
        float *dx_layer = (float*)malloc(S * D * sizeof(float)); // not used for update but proves backward works
        float *dW1 = (float*)calloc(H * D, sizeof(float));
        float *dW2 = (float*)calloc(D * H, sizeof(float));
        if (!W1 || !W2 || !x || !y_target || !h || !h_relu || !y || !dy ||
            !dh_relu || !dh || !dx_layer || !dW1 || !dW2) {
            fprintf(stderr, "out of memory\n");
            return 1;
        }

        // Deterministic small initial weights.
        for (int i = 0; i < H*D; i++) W1[i] = 0.01f * sinf(i * 1.3f + 0.7f);
        for (int i = 0; i < D*H; i++) W2[i] = 0.01f * cosf(i * 0.9f + 1.1f);

        // Identity task: the target equals the input.
        for (int t = 0; t < S; t++)
            for (int i = 0; i < D; i++) {
                float v = sinf((t * D + i) * 0.1f);
                x[t*D + i] = v;
                y_target[t*D + i] = v;
            }

        printf("=== Tiny 2-Layer ANE Training (Forward + Backward on ANE) ===\n");
        printf("x:[%d,%d] → W1:[%d,%d] → ReLU → W2:[%d,%d] → y:[%d,%d]\n", S,D, H,D, D,H, S,D);
        printf("Forward: ANE conv | Backward dx: ANE conv(W^T) | Backward dW: CPU\n");
        printf("Steps: %d, LR: %.4f, Recompile every %d steps\n\n", steps, lr, recompile_every);

        // 4 ANE kernels: 2 forward + 2 backward (transposed weights)
        Kern *k1_fwd = NULL, *k2_fwd = NULL; // W1: [H,D]→conv(D→H), W2: [D,H]→conv(H→D)
        Kern *k1_bwd = NULL, *k2_bwd = NULL; // W1^T: [D,H]→conv(H→D), W2^T: [H,D]→conv(D→H)
        bool on_ane = true;

        printf("%-6s %-12s %-10s %-6s\n", "Step", "MSE Loss", "ms/step", "Backend");
        printf("--------------------------------------\n");

        for (int step = 0; step < steps; step++) {
            uint64_t t0 = mach_absolute_time();

            if (on_ane && step % recompile_every == 0) {
                free_kern(k1_fwd); free_kern(k2_fwd); free_kern(k1_bwd); free_kern(k2_bwd);
                k1_fwd = k2_fwd = k1_bwd = k2_bwd = NULL;
                @autoreleasepool {
                    k1_fwd = compile_kern_with_blob(build_blob(W1, H, D), D, H, S);
                    k2_fwd = compile_kern_with_blob(build_blob(W2, D, H), H, D, S);
                    // Backward: dx = W^T @ dy → conv with transposed weight
                    // W2^T: [H,D] as conv weight, input dy [1,D,1,S] → output dh [1,H,1,S]
                    k2_bwd = compile_kern_with_blob(build_blob_transposed(W2, D, H), D, H, S);
                    // W1^T: [D,H] as conv weight, input dh [1,H,1,S] → output dx [1,D,1,S]
                    k1_bwd = compile_kern_with_blob(build_blob_transposed(W1, H, D), H, D, S);
                }
                if (!k1_fwd || !k2_fwd || !k1_bwd || !k2_bwd) {
                    printf("ANE limit at step %d, continuing on CPU\n", step);
                    free_kern(k1_fwd); free_kern(k2_fwd); free_kern(k1_bwd); free_kern(k2_bwd);
                    k1_fwd = k2_fwd = k1_bwd = k2_bwd = NULL;
                    on_ane = false;
                }
            }

            if (on_ane) {
                // === Forward on ANE ===
                ane_eval(k1_fwd, x, h, D, H, S);
                for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0;
                ane_eval(k2_fwd, h_relu, y, H, D, S);
            } else {
                // === Forward on CPU ===
                for (int t = 0; t < S; t++)
                    for (int i = 0; i < H; i++) {
                        float s = 0;
                        for (int j = 0; j < D; j++) s += W1[i*D+j] * x[t*D+j];
                        h[t*H+i] = s;
                    }
                for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0;
                for (int t = 0; t < S; t++)
                    for (int i = 0; i < D; i++) {
                        float s = 0;
                        for (int j = 0; j < H; j++) s += W2[i*H+j] * h_relu[t*H+j];
                        y[t*D+i] = s;
                    }
            }

            // MSE loss + dL/dy
            float loss = 0;
            for (int i = 0; i < S*D; i++) {
                float diff = y[i] - y_target[i];
                loss += diff * diff;
                dy[i] = 2.0f * diff / (S * D);
            }
            loss /= (S * D);

            if (on_ane) {
                // === Backward dx on ANE ===
                // dh_relu = W2^T @ dy (ANE conv with transposed W2)
                ane_eval(k2_bwd, dy, dh_relu, D, H, S);
                // ReLU backward (CPU, element-wise)
                for (int i = 0; i < S*H; i++) dh[i] = h[i] > 0 ? dh_relu[i] : 0;
                // dx = W1^T @ dh (ANE conv with transposed W1)
                ane_eval(k1_bwd, dh, dx_layer, H, D, S);
            } else {
                // CPU path computes only what the weight update needs (dx_layer is skipped).
                memset(dh_relu, 0, S * H * sizeof(float));
                for (int t = 0; t < S; t++)
                    for (int j = 0; j < H; j++)
                        for (int i = 0; i < D; i++)
                            dh_relu[t*H + j] += W2[i*H + j] * dy[t*D + i];
                for (int i = 0; i < S*H; i++) dh[i] = h[i] > 0 ? dh_relu[i] : 0;
            }

            // dW on CPU (outer products — memory-bound, not worth ANE)
            memset(dW2, 0, D * H * sizeof(float));
            for (int t = 0; t < S; t++)
                for (int i = 0; i < D; i++)
                    for (int j = 0; j < H; j++)
                        dW2[i*H + j] += dy[t*D + i] * h_relu[t*H + j];
            memset(dW1, 0, H * D * sizeof(float));
            for (int t = 0; t < S; t++)
                for (int i = 0; i < H; i++)
                    for (int j = 0; j < D; j++)
                        dW1[i*D + j] += dh[t*H + i] * x[t*D + j];

            // SGD
            for (int i = 0; i < H*D; i++) W1[i] -= lr * dW1[i];
            for (int i = 0; i < D*H; i++) W2[i] -= lr * dW2[i];

            double ms = (double)(mach_absolute_time() - t0) * tb.numer / tb.denom / 1e6;
            printf("%-6d %-12.6f %-10.1f %-6s\n", step, loss, ms, on_ane ? "ANE" : "CPU");

            if (loss < 1e-6f) {
                printf("\nConverged at step %d!\n", step);
                break;
            }
        }

        printf("\nFinal output vs target (first 8):\n");
        // The surviving kernels hold the weights baked at their last compile,
        // i.e. before the final SGD update.
        if (on_ane && k1_fwd && k2_fwd) {
            ane_eval(k1_fwd, x, h, D, H, S);
            for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0;
            ane_eval(k2_fwd, h_relu, y, H, D, S);
        }
        printf(" y: ");
        for (int i = 0; i < 8; i++) printf("%.4f ", y[i]);
        printf("\n");
        printf(" target: ");
        for (int i = 0; i < 8; i++) printf("%.4f ", y_target[i]);
        printf("\n");

        free_kern(k1_fwd); free_kern(k2_fwd); free_kern(k1_bwd); free_kern(k2_bwd);
        free(W1); free(W2); free(x); free(y_target);
        free(h); free(h_relu); free(y);
        free(dy); free(dh_relu); free(dh); free(dx_layer);
        free(dW1); free(dW2);
        printf("\nDone.\n");
    }
    return 0;
}