// quant_probe.m — Probe whether ANE executes int8/int4 quantized ops natively // Tests: (1) fp16 baseline conv, (2) int8 via constexpr_affine_dequantize, // (3) int4 via constexpr_affine_dequantize, (4) raw int8 conv weight, // (5) uint8 palettized via constexpr_lut_to_dense // If ANE hardware does native quantized execution (not just dequant-to-fp16), // we expect 2-4x speedup over fp16 at same dimensions. #import #import #import #import #import #import #include #include #include // ── ANE private API boilerplate ────────────────────────────────────────────── static Class g_D, g_I, g_AR, g_AIO; static mach_timebase_info_data_t g_tb; static void ane_init(void) { dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/" "AppleNeuralEngine", RTLD_NOW); g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); g_I = NSClassFromString(@"_ANEInMemoryModel"); g_AR = NSClassFromString(@"_ANERequest"); g_AIO = NSClassFromString(@"_ANEIOSurfaceObject"); } static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } static IOSurfaceRef make_surface(size_t bytes) { if (bytes < 49152) bytes = 49152; return IOSurfaceCreate((__bridge CFDictionaryRef)@{ (id)kIOSurfaceWidth: @(bytes), (id)kIOSurfaceHeight: @1, (id)kIOSurfaceBytesPerElement: @1, (id)kIOSurfaceBytesPerRow: @(bytes), (id)kIOSurfaceAllocSize: @(bytes), (id)kIOSurfacePixelFormat: @0 }); } // ── Weight blob builders ───────────────────────────────────────────────────── // FP16 blob: global header (64B) + chunk header (64B) + fp16 data static NSData *build_fp16_blob(int oc, int ic) { NSUInteger wsize = (NSUInteger)oc * ic * 2; NSUInteger total = 128 + wsize; uint8_t *buf = (uint8_t*)calloc(total, 1); buf[0] = 1; buf[4] = 2; buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE; buf[68] = 1; *(uint32_t*)(buf + 72) = (uint32_t)wsize; *(uint32_t*)(buf + 80) = 128; _Float16 *fp16 = (_Float16*)(buf + 128); for (NSUInteger i = 0; i < (NSUInteger)oc * ic; i++) fp16[i] = (_Float16)(((float)arc4random() / UINT32_MAX - 0.5f) * 0.1f); return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; } // INT8 blob: same header structure, but int8 data (1 byte per weight) static NSData *build_int8_blob(int oc, int ic) { NSUInteger wsize = (NSUInteger)oc * ic; NSUInteger total = 128 + wsize; uint8_t *buf = (uint8_t*)calloc(total, 1); buf[0] = 1; buf[4] = 2; buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE; buf[68] = 1; *(uint32_t*)(buf + 72) = (uint32_t)wsize; *(uint32_t*)(buf + 80) = 128; int8_t *i8 = (int8_t*)(buf + 128); for (NSUInteger i = 0; i < (NSUInteger)oc * ic; i++) i8[i] = (int8_t)(arc4random() % 256 - 128); return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; } // INT4 blob: packed nibbles (2 weights per byte), row-major [oc, ic/2] static NSData *build_int4_blob(int oc, int ic) { NSUInteger wsize = (NSUInteger)oc * ic / 2; NSUInteger total = 128 + wsize; uint8_t *buf = (uint8_t*)calloc(total, 1); buf[0] = 1; buf[4] = 2; buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE; buf[68] = 1; *(uint32_t*)(buf + 72) = (uint32_t)wsize; *(uint32_t*)(buf + 80) = 128; uint8_t *packed = buf + 128; for (NSUInteger i = 0; i < wsize; i++) packed[i] = (uint8_t)(arc4random() & 0xFF); return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; } // Scale+ZP blob for affine dequantize: fp16 scale per output channel + int8 zp static NSData *build_scale_blob(int oc) { NSUInteger wsize = (NSUInteger)oc * 2; // fp16 per channel NSUInteger total = 128 + wsize; uint8_t *buf = (uint8_t*)calloc(total, 1); buf[0] = 1; buf[4] = 2; buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE; buf[68] = 1; *(uint32_t*)(buf + 72) = (uint32_t)wsize; *(uint32_t*)(buf + 80) = 128; _Float16 *s = (_Float16*)(buf + 128); for (int i = 0; i < oc; i++) s[i] = (_Float16)(0.01f); return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; } static NSData *build_zp_int8_blob(int oc) { NSUInteger wsize = (NSUInteger)oc; NSUInteger total = 128 + wsize; uint8_t *buf = (uint8_t*)calloc(total, 1); buf[0] = 1; buf[4] = 2; buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE; buf[68] = 1; *(uint32_t*)(buf + 72) = (uint32_t)wsize; *(uint32_t*)(buf + 80) = 128; // zero-points all 0 return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; } static NSData *build_zp_int4_blob(int oc) { // For int4, zero points are also int4 packed or per-channel uint8 // Use uint8 zero-point (one per output channel) NSUInteger wsize = (NSUInteger)oc; NSUInteger total = 128 + wsize; uint8_t *buf = (uint8_t*)calloc(total, 1); buf[0] = 1; buf[4] = 2; buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE; buf[68] = 1; *(uint32_t*)(buf + 72) = (uint32_t)wsize; *(uint32_t*)(buf + 80) = 128; memset(buf + 128, 8, wsize); // zero-point = 8 for uint4 center return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; } // ── Compile + eval helpers ─────────────────────────────────────────────────── typedef struct { id model; NSString *td; bool ok; } Kern; static Kern try_compile(NSString *mil, NSDictionary *wd) { Kern k = {nil, nil, false}; NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; NSError *e = nil; id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)( g_D, @selector(modelWithMILText:weights:optionsPlist:), md, wd ?: @{}, nil); if (!desc) { printf(" descriptor=NULL\n"); return k; } id mdl = ((id(*)(Class,SEL,id))objc_msgSend)( g_I, @selector(inMemoryModelWithDescriptor:), desc); id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; NSFileManager *fm = [NSFileManager defaultManager]; [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil]; [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; for (NSString *path in wd) { NSString *rel = [path stringByReplacingOccurrencesOfString:@"@model_path/" withString:@""]; [wd[path][@"data"] writeToFile:[td stringByAppendingPathComponent:rel] atomically:YES]; } if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) { NSString *desc_str = [e localizedDescription] ?: @"unknown"; if ([desc_str length] > 200) desc_str = [desc_str substringToIndex:200]; printf(" compile FAIL: %s\n", [desc_str UTF8String]); [fm removeItemAtPath:td error:nil]; return k; } if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) { printf(" load FAIL\n"); [fm removeItemAtPath:td error:nil]; return k; } k.model = mdl; k.td = td; k.ok = true; return k; } static void kern_free(Kern *k) { if (!k->ok) return; NSError *e = nil; ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)( k->model, @selector(unloadWithQoS:error:), 21, &e); [[NSFileManager defaultManager] removeItemAtPath:k->td error:nil]; k->ok = false; } // Benchmark: returns ms/eval, or -1 on failure static double bench_kern(Kern *k, size_t inBytes, size_t outBytes, int warmup, int iters) { if (!k->ok) return -1; IOSurfaceRef ioIn = make_surface(inBytes); IOSurfaceRef ioOut = make_surface(outBytes); id wIn = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); id wOut = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut); id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), @[wIn], @[@0], @[wOut], @[@0], nil, nil, @0); NSError *e = nil; for (int i = 0; i < warmup; i++) ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( k->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); uint64_t t0 = mach_absolute_time(); for (int i = 0; i < iters; i++) { BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( k->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); if (!ok) { CFRelease(ioIn); CFRelease(ioOut); return -1; } } double ms = tb_ms(mach_absolute_time() - t0) / iters; CFRelease(ioIn); CFRelease(ioOut); return ms; } // ── MIL generators ─────────────────────────────────────────────────────────── #define MIL_HDR \ @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " \ "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " \ "{\"coremltools-version\", \"9.0\"}})]\n{\n" // Test 1: FP16 baseline conv (baked weights) static NSString *gen_fp16_conv(int ic, int oc, int sp) { return [NSString stringWithFormat: @"%@" " func main(tensor x) {\n" " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" " string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n" " tensor x16 = cast(dtype=to16, x=x)[name=string(\"cx\")];\n" " tensor W = const()[name=string(\"W\"), " "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" " tensor y16 = conv(dilations=dl, groups=gr, pad=pd, pad_type=pt, strides=st, weight=W, x=x16)[name=string(\"cv\")];\n" " string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n" " tensor y = cast(dtype=to32, x=y16)[name=string(\"co\")];\n" " } -> (y);\n}\n", MIL_HDR, ic, sp, ic, sp, oc, ic, oc, ic, oc, sp, oc, sp]; } // Test 2: INT8 weights via constexpr_affine_dequantize → fp16 conv // This is how coremltools emits int8 quantized models // dequant formula: fp16_weight = scale * (int8_weight - zero_point) static NSString *gen_int8_dequant_conv(int ic, int oc, int sp) { return [NSString stringWithFormat: @"%@" " func main(tensor x) {\n" " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" " string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n" " tensor x16 = cast(dtype=to16, x=x)[name=string(\"cx\")];\n" " tensor Wq = const()[name=string(\"Wq\"), " "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" " tensor sc = const()[name=string(\"sc\"), " "val=tensor(BLOBFILE(path=string(\"@model_path/weights/scale.bin\"), offset=uint64(64)))];\n" " tensor zp = const()[name=string(\"zp\"), " "val=tensor(BLOBFILE(path=string(\"@model_path/weights/zp.bin\"), offset=uint64(64)))];\n" " int32 ax = const()[name=string(\"ax\"), val=int32(0)];\n" " tensor W = constexpr_affine_dequantize(axis=ax, zero_point=zp, quantized_data=Wq, scale=sc)[name=string(\"dq\")];\n" " tensor y16 = conv(dilations=dl, groups=gr, pad=pd, pad_type=pt, strides=st, weight=W, x=x16)[name=string(\"cv\")];\n" " string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n" " tensor y = cast(dtype=to32, x=y16)[name=string(\"co\")];\n" " } -> (y);\n}\n", MIL_HDR, ic, sp, ic, sp, oc, ic, oc, ic, oc, oc, oc, oc, oc, ic, oc, sp, oc, sp]; } // Test 3: INT4 (uint4) weights via constexpr_affine_dequantize → fp16 conv // uint4 packed: 2 values per byte, axis=1 dequantize static NSString *gen_int4_dequant_conv(int ic, int oc, int sp) { // For uint4, quantized_data shape is [oc, ic/2, 1, 1] packed // But MIL may want the logical shape [oc, ic, 1, 1] with uint4 type return [NSString stringWithFormat: @"%@" " func main(tensor x) {\n" " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" " string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n" " tensor x16 = cast(dtype=to16, x=x)[name=string(\"cx\")];\n" " tensor Wq = const()[name=string(\"Wq\"), " "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" " tensor sc = const()[name=string(\"sc\"), " "val=tensor(BLOBFILE(path=string(\"@model_path/weights/scale.bin\"), offset=uint64(64)))];\n" " tensor zp = const()[name=string(\"zp\"), " "val=tensor(BLOBFILE(path=string(\"@model_path/weights/zp.bin\"), offset=uint64(64)))];\n" " int32 ax = const()[name=string(\"ax\"), val=int32(0)];\n" " tensor W = constexpr_affine_dequantize(axis=ax, zero_point=zp, quantized_data=Wq, scale=sc)[name=string(\"dq\")];\n" " tensor y16 = conv(dilations=dl, groups=gr, pad=pd, pad_type=pt, strides=st, weight=W, x=x16)[name=string(\"cv\")];\n" " string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n" " tensor y = cast(dtype=to32, x=y16)[name=string(\"co\")];\n" " } -> (y);\n}\n", MIL_HDR, ic, sp, ic, sp, oc, ic, oc, ic, oc, oc, oc, oc, oc, ic, oc, sp, oc, sp]; } // Test 4: Block-wise int4 quantization (constexpr_blockwise_shift_scale) // This is the more modern approach used in coremltools 8+ static NSString *gen_int4_blockwise_conv(int ic, int oc, int sp, int block_size) { int n_blocks = ic / block_size; return [NSString stringWithFormat: @"%@" " func main(tensor x) {\n" " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" " string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n" " tensor x16 = cast(dtype=to16, x=x)[name=string(\"cx\")];\n" " tensor Wq = const()[name=string(\"Wq\"), " "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" " tensor ss = const()[name=string(\"ss\"), " "val=tensor(BLOBFILE(path=string(\"@model_path/weights/scale.bin\"), offset=uint64(64)))];\n" " tensor W = constexpr_blockwise_shift_scale(data=Wq, scale=ss)[name=string(\"dq\")];\n" " tensor y16 = conv(dilations=dl, groups=gr, pad=pd, pad_type=pt, strides=st, weight=W, x=x16)[name=string(\"cv\")];\n" " string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n" " tensor y = cast(dtype=to32, x=y16)[name=string(\"co\")];\n" " } -> (y);\n}\n", MIL_HDR, ic, sp, ic, sp, oc, ic, oc, ic, oc, n_blocks, oc, n_blocks, oc, ic, oc, sp, oc, sp]; } // Test 5: Palettized (LUT) weights via constexpr_lut_to_dense (iOS16) // 4-bit indices packed into bytes, 16-entry fp16 lookup table // indices: packed byte tensor of size ceil(4 * oc * ic / 8) = oc*ic/2 bytes // lut: [1, 1, 16] for shared LUT across all channels // shape: [oc, ic, 1, 1] output shape static NSString *gen_lut4_conv(int ic, int oc, int sp) { int packed_bytes = oc * ic / 2; // 4-bit, 2 per byte return [NSString stringWithFormat: @"%@" " func main(tensor x) {\n" " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" " string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n" " tensor x16 = cast(dtype=to16, x=x)[name=string(\"cx\")];\n" " tensor idx = const()[name=string(\"idx\"), " "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" " tensor lut = const()[name=string(\"lut\"), " "val=tensor(BLOBFILE(path=string(\"@model_path/weights/lut.bin\"), offset=uint64(64)))];\n" " tensor shp = const()[name=string(\"shp\"), val=tensor([%d, %d, 1, 1])];\n" " tensor W = constexpr_lut_to_dense(indices=idx, lut=lut, shape=shp)[name=string(\"dq\")];\n" " tensor y16 = conv(dilations=dl, groups=gr, pad=pd, pad_type=pt, strides=st, weight=W, x=x16)[name=string(\"cv\")];\n" " string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n" " tensor y = cast(dtype=to32, x=y16)[name=string(\"co\")];\n" " } -> (y);\n}\n", MIL_HDR, ic, sp, ic, sp, packed_bytes, packed_bytes, oc, ic, oc, ic, oc, sp, oc, sp]; } // LUT blob: fp16 lookup table [1, 1, 16] — shared across all channels static NSData *build_lut_blob(int oc) { (void)oc; // shared LUT, oc not needed NSUInteger wsize = 1 * 1 * 16 * 2; // [1,1,16] fp16 NSUInteger total = 128 + wsize; uint8_t *buf = (uint8_t*)calloc(total, 1); buf[0] = 1; buf[4] = 2; buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE; buf[68] = 1; *(uint32_t*)(buf + 72) = (uint32_t)wsize; *(uint32_t*)(buf + 80) = 128; _Float16 *fp16 = (_Float16*)(buf + 128); for (int j = 0; j < 16; j++) fp16[j] = (_Float16)((j - 8) * 0.01f); return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; } // Packed 4-bit index blob for LUT: oc*ic/2 bytes (2 indices per byte) static NSData *build_lut_index_blob(int oc, int ic) { NSUInteger wsize = (NSUInteger)oc * ic / 2; NSUInteger total = 128 + wsize; uint8_t *buf = (uint8_t*)calloc(total, 1); buf[0] = 1; buf[4] = 2; buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE; buf[68] = 1; *(uint32_t*)(buf + 72) = (uint32_t)wsize; *(uint32_t*)(buf + 80) = 128; uint8_t *packed = buf + 128; for (NSUInteger i = 0; i < wsize; i++) packed[i] = (uint8_t)(arc4random() & 0xFF); // random 4-bit pairs return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; } // Blockwise scale blob: [oc, n_blocks, 1, 1] fp16 static NSData *build_blockwise_scale_blob(int oc, int n_blocks) { NSUInteger wsize = (NSUInteger)oc * n_blocks * 2; NSUInteger total = 128 + wsize; uint8_t *buf = (uint8_t*)calloc(total, 1); buf[0] = 1; buf[4] = 2; buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE; buf[68] = 1; *(uint32_t*)(buf + 72) = (uint32_t)wsize; *(uint32_t*)(buf + 80) = 128; _Float16 *fp16 = (_Float16*)(buf + 128); for (NSUInteger i = 0; i < (NSUInteger)oc * n_blocks; i++) fp16[i] = (_Float16)(0.01f); return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; } // ── Main ───────────────────────────────────────────────────────────────────── static void run_test(const char *name, int ic, int oc, int sp, NSString *mil, NSDictionary *wd) { printf("\n [%s] %dx%d sp=%d\n", name, oc, ic, sp); Kern k = try_compile(mil, wd); if (!k.ok) { printf(" RESULT: COMPILE FAILED\n"); return; } printf(" compile+load: OK\n"); size_t inBytes = (size_t)ic * sp * 4; size_t outBytes = (size_t)oc * sp * 4; double ms = bench_kern(&k, inBytes, outBytes, 10, 100); if (ms < 0) { printf(" RESULT: EVAL FAILED\n"); } else { double gflops = 2.0 * oc * ic * sp / 1e9; double tflops = gflops / ms; printf(" %.3f ms/eval (%.2f GFLOP → %.3f TFLOPS)\n", ms, gflops, tflops); } kern_free(&k); } int main(int argc, char **argv) { @autoreleasepool { mach_timebase_info(&g_tb); ane_init(); printf("╔══════════════════════════════════════════════════════════════╗\n"); printf("║ ANE Quantization Probe — int8 / int4 / LUT on Neural Engine ║\n"); printf("╚══════════════════════════════════════════════════════════════╝\n"); printf("\nGoal: Determine if ANE executes quantized ops natively or just\n"); printf("dequantizes to fp16. Native execution → 2-4x speedup over fp16.\n"); printf("Dequant-only → same speed (compute-bound) but smaller weight blobs.\n\n"); // Test dimensions - representative of transformer layers typedef struct { int ic, oc, sp; const char *desc; } Cfg; Cfg cfgs[] = { {768, 768, 64, "Stories110M attn proj"}, {768, 2048, 64, "Stories110M FFN up"}, {2048, 768, 64, "Stories110M FFN down"}, {1024, 1024, 64, "1K square"}, {2048, 2048, 64, "2K square (stress)"}, }; int ncfg = sizeof(cfgs) / sizeof(cfgs[0]); for (int ci = 0; ci < ncfg; ci++) { int ic = cfgs[ci].ic, oc = cfgs[ci].oc, sp = cfgs[ci].sp; printf("\n━━━ %s (%dx%d, seq=%d) ━━━\n", cfgs[ci].desc, oc, ic, sp); // ── Test 1: FP16 baseline ── { NSString *mil = gen_fp16_conv(ic, oc, sp); NSData *wb = build_fp16_blob(oc, ic); NSDictionary *wd = @{@"@model_path/weights/weight.bin": @{@"offset": @0, @"data": wb}}; run_test("FP16 baseline", ic, oc, sp, mil, wd); } // ── Test 2: INT8 affine dequantize ── { NSString *mil = gen_int8_dequant_conv(ic, oc, sp); NSData *wb = build_int8_blob(oc, ic); NSData *scb = build_scale_blob(oc); NSData *zpb = build_zp_int8_blob(oc); NSDictionary *wd = @{ @"@model_path/weights/weight.bin": @{@"offset": @0, @"data": wb}, @"@model_path/weights/scale.bin": @{@"offset": @0, @"data": scb}, @"@model_path/weights/zp.bin": @{@"offset": @0, @"data": zpb} }; run_test("INT8 affine dequant", ic, oc, sp, mil, wd); } // ── Test 3: INT4 (uint4) affine dequantize ── { NSString *mil = gen_int4_dequant_conv(ic, oc, sp); NSData *wb = build_int4_blob(oc, ic); NSData *scb = build_scale_blob(oc); NSData *zpb = build_zp_int4_blob(oc); NSDictionary *wd = @{ @"@model_path/weights/weight.bin": @{@"offset": @0, @"data": wb}, @"@model_path/weights/scale.bin": @{@"offset": @0, @"data": scb}, @"@model_path/weights/zp.bin": @{@"offset": @0, @"data": zpb} }; run_test("UINT4 affine dequant", ic, oc, sp, mil, wd); } // ── Test 4: INT4 blockwise (block_size=32) ── if (ic % 32 == 0) { int block_size = 32; int n_blocks = ic / block_size; NSString *mil = gen_int4_blockwise_conv(ic, oc, sp, block_size); NSData *wb = build_int4_blob(oc, ic); NSData *scb = build_blockwise_scale_blob(oc, n_blocks); NSDictionary *wd = @{ @"@model_path/weights/weight.bin": @{@"offset": @0, @"data": wb}, @"@model_path/weights/scale.bin": @{@"offset": @0, @"data": scb} }; run_test("UINT4 blockwise(32)", ic, oc, sp, mil, wd); } // ── Test 5: LUT (4-bit palettized) ── { NSString *mil = gen_lut4_conv(ic, oc, sp); NSData *wb = build_lut_index_blob(oc, ic); NSData *lut = build_lut_blob(oc); NSDictionary *wd = @{ @"@model_path/weights/weight.bin": @{@"offset": @0, @"data": wb}, @"@model_path/weights/lut.bin": @{@"offset": @0, @"data": lut} }; run_test("LUT4 palettized", ic, oc, sp, mil, wd); } } // ── Summary interpretation ── printf("\n\n╔══════════════════════════════════════════════════════╗\n"); printf("║ Interpretation Guide ║\n"); printf("╠══════════════════════════════════════════════════════╣\n"); printf("║ If int8 ≈ 2x fp16 TFLOPS → native int8 execution ║\n"); printf("║ If int4 ≈ 4x fp16 TFLOPS → native int4 execution ║\n"); printf("║ If int8 ≈ fp16 TFLOPS → dequant-to-fp16 only ║\n"); printf("║ If COMPILE FAIL → type not supported in MIL ║\n"); printf("║ If EVAL FAIL → compiles but ANE rejects ║\n"); printf("╚══════════════════════════════════════════════════════╝\n"); } return 0; }