From 7b6a18a059d63d700b7134da119738ae5c4010ef Mon Sep 17 00:00:00 2001
From: Claude
Date: Tue, 3 Mar 2026 01:02:05 +0000
Subject: [PATCH] Add ANE int8/int4 quantization probe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Probe whether Apple Neural Engine executes quantized ops natively
(faster int8-int8 compute path) or just dequantizes to fp16 at load time.

Tests 5 approaches at transformer-representative dimensions:
1. FP16 baseline conv (baked weights)
2. INT8 via constexpr_affine_dequantize (per-channel scale+zp)
3. UINT4 via constexpr_affine_dequantize (per-channel)
4. UINT4 via constexpr_blockwise_shift_scale (block_size=32)
5. 4-bit palettized via constexpr_lut_to_dense (16-entry LUT)

Each test compiles MIL → ANE kernel, benchmarks 100 evals, reports TFLOPS.
If int8 shows ~2x fp16 TFLOPS, ANE has native int8 compute.
If same TFLOPS, it's dequant-only (still useful for memory savings).

Build:
  xcrun clang -O2 -fobjc-arc -o quant_probe quant_probe.m \
    -framework Foundation -framework IOSurface -ldl

https://claude.ai/code/session_01U5HLjsm4iUzL9iDaHbxeRB
---
 quant_probe.m | 551 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 551 insertions(+)
 create mode 100644 quant_probe.m

diff --git a/quant_probe.m b/quant_probe.m
new file mode 100644
index 0000000..88f3a25
--- /dev/null
+++ b/quant_probe.m
@@ -0,0 +1,551 @@
+// quant_probe.m — Probe whether ANE executes int8/int4 quantized ops natively
+// Tests: (1) fp16 baseline conv, (2) int8 via constexpr_affine_dequantize,
+//        (3) uint4 via constexpr_affine_dequantize, (4) uint4 via constexpr_blockwise_shift_scale,
+//        (5) 4-bit palettized via constexpr_lut_to_dense (16-entry LUT)
+// If ANE hardware does native quantized execution (not just dequant-to-fp16),
+// we expect 2-4x speedup over fp16 at same dimensions.
+// NOTE(review): the header names below were missing from the copy of this file
+// under review; they are reconstructed from the symbols the code uses
+// (NS*, objc_msgSend, dlopen, mach_absolute_time, IOSurfaceCreate, printf,
+// calloc/arc4random, memset). Confirm against the original commit.
+#import <Foundation/Foundation.h>
+#import <objc/runtime.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <mach/mach_time.h>
+#import <IOSurface/IOSurface.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// ── ANE private API boilerplate ──────────────────────────────────────────────
+// Private classes resolved at runtime by ane_init(); nil when lookup fails.
+static Class g_D, g_I, g_AR, g_AIO;
+// Timebase used by tb_ms() to convert mach_absolute_time() ticks.
+static mach_timebase_info_data_t g_tb;
+
+// Loads the private AppleNeuralEngine framework and looks up the four classes
+// the probe talks to. Failures are reported on stderr; callers should check
+// ane_ready() before running any test.
+static void ane_init(void) {
+    void *handle = dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/"
+                          "AppleNeuralEngine", RTLD_NOW);
+    if (!handle) {
+        const char *why = dlerror();
+        fprintf(stderr, "dlopen(AppleNeuralEngine) failed: %s\n", why ? why : "unknown");
+    }
+    g_D   = NSClassFromString(@"_ANEInMemoryModelDescriptor");
+    g_I   = NSClassFromString(@"_ANEInMemoryModel");
+    g_AR  = NSClassFromString(@"_ANERequest");
+    g_AIO = NSClassFromString(@"_ANEIOSurfaceObject");
+}
+
+// True when every private class needed by the probe was found.
+static bool ane_ready(void) {
+    return g_D != Nil && g_I != Nil && g_AR != Nil && g_AIO != Nil;
+}
+
+// Converts a mach_absolute_time() tick delta to milliseconds using g_tb.
+static double tb_ms(uint64_t t) {
+    return (double)t * g_tb.numer / g_tb.denom / 1e6;
+}
+
+// Creates a 1-row byte-addressed IOSurface of at least `bytes` bytes.
+// Sizes below 49152 are rounded up to 49152 (presumably a minimum the ANE
+// accepts — TODO confirm). Returns NULL if IOSurfaceCreate fails; the caller
+// owns the returned surface and must CFRelease it.
+static IOSurfaceRef make_surface(size_t bytes) {
+    if (bytes < 49152) bytes = 49152;
+    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
+        (id)kIOSurfaceWidth: @(bytes), (id)kIOSurfaceHeight: @1,
+        (id)kIOSurfaceBytesPerElement: @1, (id)kIOSurfaceBytesPerRow: @(bytes),
+        (id)kIOSurfaceAllocSize: @(bytes), (id)kIOSurfacePixelFormat: @0
+    });
+}
+
+// ── Weight blob builders ─────────────────────────────────────────────────────
+
+// Every blob is: global header (64B) + chunk header (64B) + payload.
+enum { kBlobHeaderBytes = 128 };
+
+// Allocates a zero-filled blob with room for `payloadBytes` of data and fills
+// in the two headers exactly as every builder in this file needs them:
+//   bytes 0 / 4   : 1 and 2 (global header fields)
+//   bytes 64..67  : EF BE AD DE (chunk magic), byte 68: 1
+//   bytes 72..75  : payload size, bytes 80..83: payload offset (128)
+// The MIL text references offset 64, i.e. the chunk header.
+// Exits the process if the allocation fails, since no test can proceed.
+static NSMutableData *blob_alloc(NSUInteger payloadBytes) {
+    NSMutableData *blob = [NSMutableData dataWithLength:kBlobHeaderBytes + payloadBytes];
+    if (!blob) {
+        fprintf(stderr, "out of memory allocating %lu-byte weight blob\n",
+                (unsigned long)(kBlobHeaderBytes + payloadBytes));
+        exit(1);
+    }
+    uint8_t *buf = (uint8_t *)blob.mutableBytes;
+    buf[0] = 1; buf[4] = 2;
+    buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE; buf[68] = 1;
+    uint32_t payload32 = (uint32_t)payloadBytes;
+    uint32_t dataOffset = kBlobHeaderBytes;
+    memcpy(buf + 72, &payload32, sizeof payload32);
+    memcpy(buf + 80, &dataOffset, sizeof dataOffset);
+    return blob;
+}
+
+// Pointer to the first payload byte of a blob made by blob_alloc().
+static void *blob_payload(NSMutableData *blob) {
+    return (uint8_t *)blob.mutableBytes + kBlobHeaderBytes;
+}
+
+// Blob whose payload is `payloadBytes` uniformly random bytes.
+static NSData *build_random_byte_blob(NSUInteger payloadBytes) {
+    NSMutableData *blob = blob_alloc(payloadBytes);
+    // One bulk call instead of one arc4random() per byte; same distribution.
+    arc4random_buf(blob_payload(blob), payloadBytes);
+    return blob;
+}
+
+// Blob whose payload is `count` fp16 values, all equal to `value`.
+static NSData *build_const_fp16_blob(NSUInteger count, float value) {
+    NSMutableData *blob = blob_alloc(count * sizeof(_Float16));
+    _Float16 *dst = (_Float16 *)blob_payload(blob);
+    for (NSUInteger i = 0; i < count; i++)
+        dst[i] = (_Float16)value;
+    return blob;
+}
+
+// FP16 blob: oc*ic random fp16 weights in [-0.05, 0.05].
+static NSData *build_fp16_blob(int oc, int ic) {
+    NSUInteger count = (NSUInteger)oc * (NSUInteger)ic;
+    NSMutableData *blob = blob_alloc(count * sizeof(_Float16));
+    _Float16 *w = (_Float16 *)blob_payload(blob);
+    for (NSUInteger i = 0; i < count; i++)
+        w[i] = (_Float16)(((float)arc4random() / UINT32_MAX - 0.5f) * 0.1f);
+    return blob;
+}
+
+// INT8 blob: same header structure, one random int8 per weight.
+static NSData *build_int8_blob(int oc, int ic) {
+    return build_random_byte_blob((NSUInteger)oc * (NSUInteger)ic);
+}
+
+// INT4 blob: packed nibbles (2 weights per byte), row-major [oc, ic/2].
+static NSData *build_int4_blob(int oc, int ic) {
+    return build_random_byte_blob((NSUInteger)oc * (NSUInteger)ic / 2);
+}
+
+// Scale blob for affine dequantize: one fp16 scale (0.01) per output channel.
+static NSData *build_scale_blob(int oc) {
+    return build_const_fp16_blob((NSUInteger)oc, 0.01f);
+}
+
+// Zero-point blob for the int8 test: one byte per output channel, all 0.
+static NSData *build_zp_int8_blob(int oc) {
+    return blob_alloc((NSUInteger)oc);
+}
+
+// Zero-point blob for the uint4 test: one uint8 per output channel, value 8
+// (the centre of the 0..15 uint4 range).
+static NSData *build_zp_int4_blob(int oc) {
+    NSMutableData *blob = blob_alloc((NSUInteger)oc);
+    memset(blob_payload(blob), 8, (size_t)oc);
+    return blob;
+}
+
+// LUT blob: fp16 lookup table [1, 1, 16] shared across all channels;
+// entry j holds (j - 8) * 0.01. `oc` is accepted for signature symmetry only.
+static NSData *build_lut_blob(int oc) {
+    (void)oc;
+    NSMutableData *blob = blob_alloc(16 * sizeof(_Float16));
+    _Float16 *lut = (_Float16 *)blob_payload(blob);
+    for (int j = 0; j < 16; j++)
+        lut[j] = (_Float16)((j - 8) * 0.01f);
+    return blob;
+}
+
+// Packed 4-bit index blob for the LUT test: oc*ic/2 random bytes
+// (2 indices per byte).
+static NSData *build_lut_index_blob(int oc, int ic) {
+    return build_random_byte_blob((NSUInteger)oc * (NSUInteger)ic / 2);
+}
+
+// Blockwise scale blob: [oc, n_blocks, 1, 1] fp16, every entry 0.01.
+static NSData *build_blockwise_scale_blob(int oc, int n_blocks) {
+    return build_const_fp16_blob((NSUInteger)oc * (NSUInteger)n_blocks, 0.01f);
+}
+
+// ── Compile + eval helpers ───────────────────────────────────────────────────
+
+// A compiled+loaded model plus the temp directory that backs it.
+// `ok` is true only when both compile and load succeeded.
+typedef struct { id model; NSString *td; bool ok; } Kern;
+
+typedef BOOL (*AneQoSOptsFn)(id, SEL, unsigned int, id, NSError **);
+typedef BOOL (*AneEvalFn)(id, SEL, unsigned int, id, id, NSError **);
+
+// Prints " <what> FAIL: <error>" with the error text capped at 200 characters.
+static void print_ane_failure(const char *what, NSError *e) {
+    NSString *text = [e localizedDescription] ?: @"unknown";
+    if ([text length] > 200) text = [text substringToIndex:200];
+    printf(" %s FAIL: %s\n", what, [text UTF8String]);
+}
+
+// Builds an in-memory ANE model from MIL text and a weight dictionary, writes
+// the MIL and weight files into a temp directory named after the model's hex
+// identifier, then compiles and loads it.
+//   mil: MIL program text.
+//   wd:  maps "@model_path/<rel>" to @{@"offset": ..., @"data": NSData}; may be nil.
+// Returns a Kern with ok == true on success. On any failure a one-line reason
+// is printed, the temp directory is removed, and ok stays false.
+static Kern try_compile(NSString *mil, NSDictionary *wd) {
+    Kern k = {nil, nil, false};
+    NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
+    NSError *e = nil;
+
+    id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(
+        g_D, @selector(modelWithMILText:weights:optionsPlist:), md, wd ?: @{}, nil);
+    if (!desc) { printf(" descriptor=NULL\n"); return k; }
+
+    id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(
+        g_I, @selector(inMemoryModelWithDescriptor:), desc);
+    if (!mdl) { printf(" model=NULL\n"); return k; }
+
+    // A nil identifier would make stringByAppendingPathComponent: throw.
+    id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
+    if (![hx isKindOfClass:[NSString class]] || [hx length] == 0) {
+        printf(" identifier=NULL\n");
+        return k;
+    }
+
+    NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
+    NSFileManager *fm = [NSFileManager defaultManager];
+    [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"]
+        withIntermediateDirectories:YES attributes:nil error:nil];
+    if (![md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES])
+        printf(" warning: could not write model.mil\n");
+    for (NSString *path in wd) {
+        NSString *rel = [path stringByReplacingOccurrencesOfString:@"@model_path/" withString:@""];
+        if (![wd[path][@"data"] writeToFile:[td stringByAppendingPathComponent:rel] atomically:YES])
+            printf(" warning: could not write %s\n", [rel UTF8String]);
+    }
+
+    if (!((AneQoSOptsFn)objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
+        print_ane_failure("compile", e);
+        [fm removeItemAtPath:td error:nil];
+        return k;
+    }
+    e = nil;
+    if (!((AneQoSOptsFn)objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) {
+        print_ane_failure("load", e);
+        [fm removeItemAtPath:td error:nil];
+        return k;
+    }
+    k.model = mdl; k.td = td; k.ok = true;
+    return k;
+}
+
+// Unloads a model produced by try_compile() and removes its temp directory.
+// Safe to call on a Kern whose ok flag is false (does nothing).
+static void kern_free(Kern *k) {
+    if (!k->ok) return;
+    NSError *e = nil;
+    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(
+        k->model, @selector(unloadWithQoS:error:), 21, &e);
+    [[NSFileManager defaultManager] removeItemAtPath:k->td error:nil];
+    k->model = nil;
+    k->td = nil;
+    k->ok = false;
+}
+
+// Runs warm-up and timed evaluations on already-created surfaces.
+// Returns ms/eval, or -1 if a wrapper/request cannot be built or any
+// evaluation (warm-up included) reports failure.
+static double bench_on_surfaces(Kern *k, IOSurfaceRef ioIn, IOSurfaceRef ioOut,
+                                int warmup, int iters) {
+    id wIn  = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn);
+    id wOut = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut);
+    if (!wIn || !wOut) return -1;   // @[nil] would throw below
+
+    id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+        @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+        @[wIn], @[@0], @[wOut], @[@0], nil, nil, @0);
+    if (!req) return -1;
+
+    AneEvalFn eval = (AneEvalFn)objc_msgSend;
+    SEL evalSel = @selector(evaluateWithQoS:options:request:error:);
+    NSError *e = nil;
+
+    // A model that cannot evaluate during warm-up will not produce a
+    // meaningful timing either, so bail out early.
+    for (int i = 0; i < warmup; i++)
+        if (!eval(k->model, evalSel, 21, @{}, req, &e)) return -1;
+
+    uint64_t t0 = mach_absolute_time();
+    for (int i = 0; i < iters; i++)
+        if (!eval(k->model, evalSel, 21, @{}, req, &e)) return -1;
+    return tb_ms(mach_absolute_time() - t0) / iters;
+}
+
+// Benchmark: returns ms/eval, or -1 on failure.
+//   inBytes/outBytes: sizes of the input and output IOSurfaces.
+//   warmup: untimed evaluations run first; iters: timed evaluations (> 0).
+static double bench_kern(Kern *k, size_t inBytes, size_t outBytes, int warmup, int iters) {
+    if (!k->ok || iters <= 0) return -1;
+    IOSurfaceRef ioIn  = make_surface(inBytes);
+    IOSurfaceRef ioOut = make_surface(outBytes);
+
+    double ms = -1;
+    if (ioIn && ioOut)
+        ms = bench_on_surfaces(k, ioIn, ioOut, warmup, iters);
+
+    // CFRelease(NULL) crashes, so release only what was actually created.
+    if (ioIn)  CFRelease(ioIn);
+    if (ioOut) CFRelease(ioOut);
+    return ms;
+}
+
+// ── MIL generators ───────────────────────────────────────────────────────────
+//
+// NOTE(review): in the copy of this file under review every `<...>` span in
+// the MIL templates (tensor types, dict type, opset tag) was missing, leaving
+// the format strings with fewer %d specifiers than arguments. Shapes below are
+// reconstructed from the original argument lists; dtypes come from the
+// surrounding casts and comments; the `ios18` opset tag is an assumption.
+// Confirm against the original commit before trusting compile results.
+
+#define MIL_HDR \
+    @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, " \
+    "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " \
+    "{\"coremltools-version\", \"9.0\"}})]\n{\n"
+
+// Wraps a weight-definition section (which must define fp16 tensor `W` of
+// shape [oc, ic, 1, 1]) in the shared program: header, fp32 → fp16 input cast,
+// 1x1 conv with W, fp16 → fp32 output cast.
+static NSString *mil_program(int ic, int oc, int sp, NSString *weights) {
+    NSString *prologue = [NSString stringWithFormat:
+        @"%@"
+        " func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+        " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n"
+        " tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
+        " tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
+        " tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
+        " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
+        " string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n"
+        " tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype=to16, x=x)[name=string(\"cx\")];\n",
+        MIL_HDR, ic, sp, ic, sp];
+    NSString *epilogue = [NSString stringWithFormat:
+        @" tensor<fp16, [1, %d, 1, %d]> y16 = conv(dilations=dl, groups=gr, pad=pd, pad_type=pt, strides=st, weight=W, x=x16)[name=string(\"cv\")];\n"
+        " string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n"
+        " tensor<fp32, [1, %d, 1, %d]> y = cast(dtype=to32, x=y16)[name=string(\"co\")];\n"
+        " } -> (y);\n}\n",
+        oc, sp, oc, sp];
+    return [NSString stringWithFormat:@"%@%@%@", prologue, weights, epilogue];
+}
+
+// Test 1: FP16 baseline conv (baked weights)
+static NSString *gen_fp16_conv(int ic, int oc, int sp) {
+    NSString *weights = [NSString stringWithFormat:
+        @" tensor<fp16, [%d, %d, 1, 1]> W = const()[name=string(\"W\"), "
+        "val=tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n",
+        oc, ic, oc, ic];
+    return mil_program(ic, oc, sp, weights);
+}
+
+// Shared body of tests 2 and 3: per-output-channel (axis 0) affine dequantize.
+//   qtype:  MIL dtype of the quantized weight tensor.
+//   zptype: MIL dtype of the zero-point tensor.
+static NSString *gen_affine_dequant_conv(int ic, int oc, int sp,
+                                         const char *qtype, const char *zptype) {
+    NSString *weights = [NSString stringWithFormat:
+        @" tensor<%s, [%d, %d, 1, 1]> Wq = const()[name=string(\"Wq\"), "
+        "val=tensor<%s, [%d, %d, 1, 1]>(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n"
+        " tensor<fp16, [%d]> sc = const()[name=string(\"sc\"), "
+        "val=tensor<fp16, [%d]>(BLOBFILE(path=string(\"@model_path/weights/scale.bin\"), offset=uint64(64)))];\n"
+        " tensor<%s, [%d]> zp = const()[name=string(\"zp\"), "
+        "val=tensor<%s, [%d]>(BLOBFILE(path=string(\"@model_path/weights/zp.bin\"), offset=uint64(64)))];\n"
+        " int32 ax = const()[name=string(\"ax\"), val=int32(0)];\n"
+        " tensor<fp16, [%d, %d, 1, 1]> W = constexpr_affine_dequantize(axis=ax, zero_point=zp, quantized_data=Wq, scale=sc)[name=string(\"dq\")];\n",
+        qtype, oc, ic, qtype, oc, ic,
+        oc, oc,
+        zptype, oc, zptype, oc,
+        oc, ic];
+    return mil_program(ic, oc, sp, weights);
+}
+
+// Test 2: INT8 weights via constexpr_affine_dequantize → fp16 conv
+// This is how coremltools emits int8 quantized models
+// dequant formula: fp16_weight = scale * (int8_weight - zero_point)
+static NSString *gen_int8_dequant_conv(int ic, int oc, int sp) {
+    return gen_affine_dequant_conv(ic, oc, sp, "int8", "int8");
+}
+
+// Test 3: INT4 (uint4) weights via constexpr_affine_dequantize → fp16 conv
+// uint4 packed: 2 values per byte; dequantized along axis 0 (per output channel).
+// The weight tensor is declared with its logical shape [oc, ic, 1, 1].
+// NOTE(review): the zero-point dtype is assumed to be uint8 because
+// build_zp_int4_blob() stores one byte per channel — confirm.
+static NSString *gen_int4_dequant_conv(int ic, int oc, int sp) {
+    return gen_affine_dequant_conv(ic, oc, sp, "uint4", "uint8");
+}
+
+// Test 4: Block-wise int4 quantization (constexpr_blockwise_shift_scale)
+// This is the more modern approach used in coremltools 8+
+// Scale tensor shape is [oc, ic / block_size, 1, 1].
+static NSString *gen_int4_blockwise_conv(int ic, int oc, int sp, int block_size) {
+    int n_blocks = ic / block_size;
+    NSString *weights = [NSString stringWithFormat:
+        @" tensor<uint4, [%d, %d, 1, 1]> Wq = const()[name=string(\"Wq\"), "
+        "val=tensor<uint4, [%d, %d, 1, 1]>(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n"
+        " tensor<fp16, [%d, %d, 1, 1]> ss = const()[name=string(\"ss\"), "
+        "val=tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path=string(\"@model_path/weights/scale.bin\"), offset=uint64(64)))];\n"
+        " tensor<fp16, [%d, %d, 1, 1]> W = constexpr_blockwise_shift_scale(data=Wq, scale=ss)[name=string(\"dq\")];\n",
+        oc, ic, oc, ic,
+        oc, n_blocks, oc, n_blocks,
+        oc, ic];
+    return mil_program(ic, oc, sp, weights);
+}
+
+// Test 5: Palettized (LUT) weights via constexpr_lut_to_dense (iOS16)
+// 4-bit indices packed into bytes, 16-entry fp16 lookup table
+// indices: packed byte tensor of size ceil(4 * oc * ic / 8) = oc*ic/2 bytes
+// lut: [1, 1, 16] for shared LUT across all channels
+// shape: [oc, ic, 1, 1] output shape
+// NOTE(review): idx dtype (uint8) and shp dtype (uint32) are assumptions — confirm.
+static NSString *gen_lut4_conv(int ic, int oc, int sp) {
+    int packed_bytes = oc * ic / 2; // 4-bit, 2 per byte
+    NSString *weights = [NSString stringWithFormat:
+        @" tensor<uint8, [%d]> idx = const()[name=string(\"idx\"), "
+        "val=tensor<uint8, [%d]>(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n"
+        " tensor<fp16, [1, 1, 16]> lut = const()[name=string(\"lut\"), "
+        "val=tensor<fp16, [1, 1, 16]>(BLOBFILE(path=string(\"@model_path/weights/lut.bin\"), offset=uint64(64)))];\n"
+        " tensor<uint32, [4]> shp = const()[name=string(\"shp\"), val=tensor<uint32, [4]>([%d, %d, 1, 1])];\n"
+        " tensor<fp16, [%d, %d, 1, 1]> W = constexpr_lut_to_dense(indices=idx, lut=lut, shape=shp)[name=string(\"dq\")];\n",
+        packed_bytes, packed_bytes,
+        oc, ic,
+        oc, ic];
+    return mil_program(ic, oc, sp, weights);
+}
+
+// ── Main ─────────────────────────────────────────────────────────────────────
+
+// Weight-dictionary entry in the form try_compile() expects.
+static NSDictionary *weight_entry(NSData *data) {
+    return @{@"offset": @0, @"data": data};
+}
+
+// Compiles, loads and benchmarks one MIL program (10 warm-up + 100 timed
+// evals) and prints either the failure stage or ms/eval with throughput.
+static void run_test(const char *name, int ic, int oc, int sp,
+                     NSString *mil, NSDictionary *wd) {
+    printf("\n [%s] %dx%d sp=%d\n", name, oc, ic, sp);
+    Kern k = try_compile(mil, wd);
+    if (!k.ok) {
+        printf(" RESULT: COMPILE FAILED\n");
+        return;
+    }
+    printf(" compile+load: OK\n");
+
+    // Input and output surfaces hold fp32 values (4 bytes each).
+    size_t inBytes = (size_t)ic * sp * 4;
+    size_t outBytes = (size_t)oc * sp * 4;
+    double ms = bench_kern(&k, inBytes, outBytes, 10, 100);
+    if (ms < 0) {
+        printf(" RESULT: EVAL FAILED\n");
+    } else {
+        // 2 FLOPs per multiply-accumulate; GFLOP per millisecond == TFLOP/s.
+        double gflops = 2.0 * oc * ic * sp / 1e9;
+        double tflops = gflops / ms;
+        printf(" %.3f ms/eval (%.2f GFLOP → %.3f TFLOPS)\n", ms, gflops, tflops);
+    }
+    kern_free(&k);
+}
+
+// Runs all five weight-format tests for every configured layer shape and
+// prints an interpretation guide. Returns 1 if the private ANE classes are
+// unavailable, 0 otherwise.
+int main(int argc, char **argv) {
+    (void)argc; (void)argv;
+    @autoreleasepool {
+        mach_timebase_info(&g_tb);
+        ane_init();
+        if (!ane_ready()) {
+            // Without this guard every test would print "descriptor=NULL" and
+            // be misread as "type not supported in MIL".
+            fprintf(stderr, "AppleNeuralEngine private classes not available; aborting.\n");
+            return 1;
+        }
+
+        printf("╔══════════════════════════════════════════════════════════════╗\n");
+        printf("║ ANE Quantization Probe — int8 / int4 / LUT on Neural Engine ║\n");
+        printf("╚══════════════════════════════════════════════════════════════╝\n");
+        printf("\nGoal: Determine if ANE executes quantized ops natively or just\n");
+        printf("dequantizes to fp16. Native execution → 2-4x speedup over fp16.\n");
+        printf("Dequant-only → same speed (compute-bound) but smaller weight blobs.\n\n");
+
+        // Test dimensions - representative of transformer layers
+        typedef struct { int ic, oc, sp; const char *desc; } Cfg;
+        Cfg cfgs[] = {
+            {768, 768, 64, "Stories110M attn proj"},
+            {768, 2048, 64, "Stories110M FFN up"},
+            {2048, 768, 64, "Stories110M FFN down"},
+            {1024, 1024, 64, "1K square"},
+            {2048, 2048, 64, "2K square (stress)"},
+        };
+        int ncfg = sizeof(cfgs) / sizeof(cfgs[0]);
+
+        NSString *const kWeightPath = @"@model_path/weights/weight.bin";
+        NSString *const kScalePath  = @"@model_path/weights/scale.bin";
+        NSString *const kZpPath     = @"@model_path/weights/zp.bin";
+        NSString *const kLutPath    = @"@model_path/weights/lut.bin";
+
+        for (int ci = 0; ci < ncfg; ci++) {
+            int ic = cfgs[ci].ic, oc = cfgs[ci].oc, sp = cfgs[ci].sp;
+            printf("\n━━━ %s (%dx%d, seq=%d) ━━━\n", cfgs[ci].desc, oc, ic, sp);
+
+            // Drain the multi-megabyte weight blobs after each configuration.
+            @autoreleasepool {
+                // ── Test 1: FP16 baseline ──
+                {
+                    NSDictionary *wd = @{kWeightPath: weight_entry(build_fp16_blob(oc, ic))};
+                    run_test("FP16 baseline", ic, oc, sp, gen_fp16_conv(ic, oc, sp), wd);
+                }
+
+                // ── Test 2: INT8 affine dequantize ──
+                {
+                    NSDictionary *wd = @{
+                        kWeightPath: weight_entry(build_int8_blob(oc, ic)),
+                        kScalePath:  weight_entry(build_scale_blob(oc)),
+                        kZpPath:     weight_entry(build_zp_int8_blob(oc))
+                    };
+                    run_test("INT8 affine dequant", ic, oc, sp, gen_int8_dequant_conv(ic, oc, sp), wd);
+                }
+
+                // ── Test 3: INT4 (uint4) affine dequantize ──
+                {
+                    NSDictionary *wd = @{
+                        kWeightPath: weight_entry(build_int4_blob(oc, ic)),
+                        kScalePath:  weight_entry(build_scale_blob(oc)),
+                        kZpPath:     weight_entry(build_zp_int4_blob(oc))
+                    };
+                    run_test("UINT4 affine dequant", ic, oc, sp, gen_int4_dequant_conv(ic, oc, sp), wd);
+                }
+
+                // ── Test 4: INT4 blockwise (block_size=32) ──
+                // Skipped when ic is not a whole number of blocks.
+                if (ic % 32 == 0) {
+                    int block_size = 32;
+                    int n_blocks = ic / block_size;
+                    NSDictionary *wd = @{
+                        kWeightPath: weight_entry(build_int4_blob(oc, ic)),
+                        kScalePath:  weight_entry(build_blockwise_scale_blob(oc, n_blocks))
+                    };
+                    run_test("UINT4 blockwise(32)", ic, oc, sp,
+                             gen_int4_blockwise_conv(ic, oc, sp, block_size), wd);
+                }
+
+                // ── Test 5: LUT (4-bit palettized) ──
+                {
+                    NSDictionary *wd = @{
+                        kWeightPath: weight_entry(build_lut_index_blob(oc, ic)),
+                        kLutPath:    weight_entry(build_lut_blob(oc))
+                    };
+                    run_test("LUT4 palettized", ic, oc, sp, gen_lut4_conv(ic, oc, sp), wd);
+                }
+            }
+        }
+
+        // ── Summary interpretation ──
+        printf("\n\n╔══════════════════════════════════════════════════════╗\n");
+        printf("║ Interpretation Guide ║\n");
+        printf("╠══════════════════════════════════════════════════════╣\n");
+        printf("║ If int8 ≈ 2x fp16 TFLOPS → native int8 execution ║\n");
+        printf("║ If int4 ≈ 4x fp16 TFLOPS → native int4 execution ║\n");
+        printf("║ If int8 ≈ fp16 TFLOPS → dequant-to-fp16 only ║\n");
+        printf("║ If COMPILE FAIL → type not supported in MIL ║\n");
+        printf("║ If EVAL FAIL → compiles but ANE rejects ║\n");
+        printf("╚══════════════════════════════════════════════════════╝\n");
+    }
+    return 0;
+}