From 20cd236f61e9d48f9d03b950a6898a6e8984088f Mon Sep 17 00:00:00 2001
From: maderix <maderix@max.local>
Date: Mon, 9 Mar 2026 19:47:01 -0700
Subject: [PATCH] Add INT8 W8A8 support: 1.88x ANE throughput via
 quantize/dequantize MIL ops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- ane_int8_bench.m: standalone FP16 vs INT8 W8A8 benchmark (18.6 vs 35.1 TOPS on M4)
- bridge: add int8 weight blob builders (ane_bridge_build_weight_blob_int8, ane_bridge_build_weight_blob_quantized)
- bridge: fix weight dict nil → @{} (prevents silent compile failure)
- README: update with Qwen3-0.6B, GQA, GPU↔ANE pipeline, INT8 results, file structure
---
 README.md           | 153 ++++++++++++++++---------
 ane_int8_bench.m    | 268 ++++++++++++++++++++++++++++++++++++++++++++
 bridge/ane_bridge.h |  13 +++
 bridge/ane_bridge.m |  45 +++++++-
 4 files changed, 427 insertions(+), 52 deletions(-)
 create mode 100644 ane_int8_bench.m

diff --git a/README.md b/README.md
index ed2362d..e64f2f6 100644
--- a/README.md
+++ b/README.md
@@ -55,29 +55,46 @@ This is MIT licensed for a reason. Everyone now has access to AI-assisted develo
 
 ## What This Is
 
-A from-scratch implementation of transformer training (forward + backward pass) running on the ANE in Apple Silicon. The ANE is a 15.8 TFLOPS (M4) inference accelerator that Apple does not expose for training. This project reverse-engineers the `_ANEClient` / `_ANECompiler` private APIs and the MIL (Model Intermediate Language) format to run custom compute graphs — including backpropagation — directly on ANE hardware.
+A from-scratch implementation of transformer training (forward + backward pass) running on the ANE in Apple Silicon. The ANE is a 15.8 TFLOPS FP16 (M4) inference accelerator that Apple does not expose for training. This project reverse-engineers the `_ANEClient` / `_ANECompiler` private APIs and the MIL (Model Intermediate Language) format to run custom compute graphs — including backpropagation — directly on ANE hardware.
+
+**Current results:**
+
+| Model | Params | ms/step | Pipeline |
+|-------|--------|---------|----------|
+| Stories110M (12L, dim=768, MHA 12/12) | 109M | **91 ms** | Dynamic (no recompile) |
+| Qwen3-0.6B (28L, dim=1024, GQA 16/8) | 596M | **412 ms** | Dynamic (no recompile) |
 
-**Current results — Stories110M (12-layer, dim=768, seq=256, 109M params):**
-- Static pipeline: **91 ms/step** (M3 Ultra), **106 ms/step** (M4)
-- Dynamic pipeline: **110 ms/step**, no recompilation
-- 72 ANE kernels per step (static), 9 shared kernels (dynamic)
 - All forward and backward dx passes on ANE, dW gradients on CPU (Accelerate cblas)
 - Adam optimizer, gradient accumulation, checkpoint/resume via exec() restart
+- GQA (Grouped-Query Attention) support with per-head tiling/reduction
+- GPU↔ANE zero-copy pipeline via shared IOSurface (GPU prefill → ANE decode)
+
+**INT8 W8A8 quantization — 1.88x throughput (M4, H16G):**
+
+| Config | FP16 | INT8 W8A8 | Speedup |
+|--------|------|-----------|---------|
+| 128x conv 512ch 64x64 | 18.6 TOPS, 14.8ms | 35.1 TOPS, 7.8ms | **1.88x** |
+| 64x conv 512ch 64x64 | 18.4 TOPS, 7.5ms | 34.1 TOPS, 4.0ms | **1.85x** |
+
+INT8 activations halve L2 SRAM bandwidth between tiles via MIL `quantize`/`dequantize` ops. Weights use `constexpr_affine_dequantize` (stored as int8, dequantized to fp16 at compile time).
 
 ## Architecture
 
-The training loop uses 6 ANE kernels per step:
+The dynamic pipeline uses shared ANE kernels with weights packed into spatial dimensions (no recompilation when weights change):
 
-| Kernel | Function | Weights |
-|--------|----------|---------|
-| `kFwdAttn` | RMSNorm + QKV projection + SDPA + output projection | Wq, Wk, Wv, Wo, rms1, mask |
-| `kFwdFFN` | RMSNorm + SwiGLU FFN (W1, W3, SiLU, W2) | W1, W2, W3, rms2 |
-| `kFFNBwd` | FFN backward (W2^T + SiLU_bwd + W1^T + W3^T) | W2^T, W1^T, W3^T |
-| `kSdpaBwd1` | Wo^T + SDPA backward part 1 (dV, probs, dp) | Wo^T, mask |
-| `kSdpaBwd2` | SDPA backward part 2 (softmax grad, dQ, dK) | — |
-| `kQKVb` | QKV backward (Wq^T + Wk^T + Wv^T → dx) | Wq^T, Wk^T, Wv^T |
+**MHA models (Stories110M) — 6 kernels per layer:**
 
-CPU handles: RMSNorm backward, residual connections, loss computation, dW gradient accumulation (cblas_sgemm), Adam optimizer updates.
+| Kernel | Function |
+|--------|----------|
+| `sdpaFwd` | QKV projection + SDPA + output projection |
+| `ffnFused` | SwiGLU FFN (W1, W3, SiLU, W2) |
+| `ffnBwdW2t` / `ffnBwdW13t` | FFN backward (split for memory) |
+| `sdpaBwd1` / `sdpaBwd2` | SDPA backward |
+
+**GQA models (Qwen3-0.6B) — 10 kernels per layer:**
+Adds separate `woFwd`, `qBwd`, `kvBwd` kernels for grouped-query attention (Q_DIM ≠ DIM).
+
+CPU handles: RMSNorm forward/backward, residual connections (DeepNet α scaling), loss computation, dW gradient accumulation (cblas_sgemm), Adam optimizer updates.
 
 Key optimizations:
 - **Channel-first CPU layout** — matches ANE IOSurface `[1,C,1,S]` format, eliminates all transpose overhead
@@ -92,22 +109,33 @@ Key optimizations:
 ## File Structure
 
 ```
-├── api_exploration.m       # Initial ANE API discovery
-├── inmem_basic.m           # In-memory MIL compilation proof-of-concept
-├── inmem_bench.m           # ANE dispatch latency benchmarks
-├── inmem_peak.m            # Peak TFLOPS measurement (2048x2048 matmul)
-├── sram_bench.m            # ANE SRAM bandwidth probing
-├── sram_probe.m            # SRAM size/layout exploration
+├── api_exploration.m           # Initial ANE API discovery
+├── inmem_basic.m               # In-memory MIL compilation proof-of-concept
+├── inmem_bench.m               # ANE dispatch latency benchmarks
+├── inmem_peak.m                # Peak TFLOPS measurement (2048x2048 matmul)
+├── ane_int8_bench.m            # INT8 W8A8 vs FP16 throughput benchmark
+├── sram_bench.m                # ANE SRAM bandwidth probing
+├── sram_probe.m                # SRAM size/layout exploration
+├── gpu_ane_share.m             # GPU↔ANE zero-copy IOSurface demo
+├── gpu_prefill_ane_decode.m    # GPU prefill → ANE decode pipeline
+├── bridge/
+│   ├── ane_bridge.h            # C-callable ANE API (compile, eval, I/O)
+│   ├── ane_bridge.m            # Bridge implementation (int8 + fp16 weight blobs)
+│   └── Makefile
 └── training/
-    ├── ane_runtime.h       # ANE private API wrapper (compile, eval, IOSurface)
-    ├── ane_mil_gen.h       # MIL program generation helpers
-    ├── model.h             # Model weight initialization and blob builders
-    ├── forward.h           # Forward pass MIL generators
-    ├── backward.h          # Backward pass MIL generators
-    ├── train.m             # Minimal training loop (early prototype)
-    ├── tiny_train.m        # 2-layer tiny model training
-    ├── train_large.m       # Main: single-layer dim=768 training (optimized)
-    ├── test_*.m            # Unit tests for individual kernels
+    ├── ane_runtime.h           # ANE private API wrapper (compile, eval, IOSurface)
+    ├── ane_classifier.h        # Classifier fwd (32K conv), softmax, rmsnorm on ANE
+    ├── train_large.m           # Static pipeline (weights as constants, recompiles)
+    ├── training_dynamic/
+    │   ├── train.m             # Dynamic training loop (model-agnostic)
+    │   ├── config.h            # Derived sizes, structs, alloc helpers
+    │   ├── mil_dynamic.h       # MIL generators for dynamic weight kernels (GQA-aware)
+    │   ├── io.h                # IOSurface I/O, weight staging, GQA tile/reduce
+    │   ├── models/
+    │   │   ├── stories110m.h   # Stories110M config (12L, MHA)
+    │   │   └── qwen3_06b.h     # Qwen3-0.6B config (28L, GQA)
+    │   └── Makefile
+    ├── dashboard.py            # Live training dashboard (blessed TUI)
     └── Makefile
 ```
 
@@ -124,13 +152,24 @@ See [training/README.md](training/README.md) for detailed training instructions.
 Requires macOS 15+ on Apple Silicon (tested on M4).
 
 ```bash
-# Build the main training program
-xcrun clang -O2 -framework Foundation -framework IOSurface \
-  -framework CoreML -framework Accelerate -ldl -lobjc \
-  -o train_large training/train_large.m
+# Dynamic pipeline (recommended) — model selected at build time
+cd training/training_dynamic
+make MODEL=stories110m    # Stories110M (12L, MHA, 109M params)
+make MODEL=qwen3_06b      # Qwen3-0.6B (28L, GQA, 596M params)
+./train --scratch          # train from random init
+./train --resume           # resume from checkpoint
 
-# Run
-./train_large
+# Static pipeline (legacy — recompiles weights each step)
+cd training && make train_large
+./train_large ane_stories110M_ckpt.bin 256 100 1e-4
+
+# INT8 benchmark
+xcrun clang -O2 -fobjc-arc -framework Foundation -framework IOSurface -ldl \
+  -o ane_int8_bench ane_int8_bench.m
+./ane_int8_bench
+
+# Bridge library (C-callable ANE API)
+cd bridge && make
 ```
 
 No external dependencies. Uses only system frameworks + private ANE APIs resolved at runtime via `objc_msgSend`.
@@ -139,28 +178,40 @@ No external dependencies. Uses only system frameworks + private ANE APIs resolve
 
 1. **MIL generation** — Objective-C code constructs MIL program text at runtime, specifying convolutions (for linear layers), matmul (for attention), softmax, element-wise ops
 2. **In-memory compilation** — `_ANEInMemoryModelDescriptor` compiles MIL text + weight blobs directly to ANE programs, no disk mlmodelc needed
-3. **IOSurface I/O** — Input/output tensors passed via IOSurface shared memory in `[1, channels, 1, spatial]` format (fp16)
-4. **Weight embedding** — Weights baked into ANE programs as BLOBFILE constants; recompiled each batch when weights change
+3. **IOSurface I/O** — Input/output tensors passed via IOSurface shared memory in `[1, channels, 1, spatial]` format (fp16 or fp32; fp16 direct I/O is ~37% faster)
+4. **Dynamic weights** — Activations and weights packed into a single spatial input dimension, sliced apart inside the MIL kernel. Weights change without recompilation.
 5. **Gradient flow** — Forward taps expose intermediates needed for backward; backward kernels compute dx (input gradients) on ANE; dW (weight gradients) computed on CPU via cblas
+6. **INT8 quantization** — `constexpr_affine_dequantize` for int8 weights, `quantize`/`dequantize` between layers for int8 activation caching in L2 SRAM (1.88x throughput)
 
 ## Limitations
 
-- **SDPA causal masking** — ANE hardware ignores `attn_mask` in SDPA ops; causal attention is decomposed into separate Q@K^T (ANE) → mask+softmax (ANE via add+softmax) → scores@V (ANE)
+- **SDPA causal masking** — ANE hardware ignores `attn_mask` in SDPA ops; causal attention is decomposed into separate Q@K^T (ANE) → mask+softmax (CPU) → scores@V (ANE)
 - **~119 compile limit** — ANE compiler leaks resources; worked around via `exec()` restart with checkpoint
-- **Compile overhead** — Static pipeline recompiles 60+ kernels every 10 steps (~3.7s); dynamic pipeline avoids this
-- **Low utilization** — Training sustains ~1-2 TFLOPS out of 15.8+ peak due to CPU fallbacks and I/O overhead
+- **FP16 gradient underflow** — backward matmuls underflow in fp16; fixed with global loss scaling (`256 * NLAYERS`)
+- **Single-input constraint** — multi-input ANE requests cause 0x1d error; inputs packed into spatial dimension instead
 
-## Performance History
+## Performance
 
-| Optimization | ms/step | ANE util |
-|---|---|---|
-| Baseline (vDSP transpose) | 33.5 | 3.1% |
-| Channel-first layout | 20.3 | 5.2% |
-| vDSP vectorized RMSNorm | 14.2 | 7.4% |
-| GCD async cblas overlap | 11.4 | 9.2% |
-| ANE RMSNorm fusion | 11.4 | 9.2% |
-| Wo^T fusion (7→6 kernels) | 11.4 | 9.2% |
-| Deferred cblas wait | **9.3** | **11.2%** |
+**Training throughput (M4):**
+
+| Model | Params | ms/step | Layers | Kernels/layer |
+|-------|--------|---------|--------|---------------|
+| Stories110M | 109M | 91 ms | 12 | 6 (MHA) |
+| Qwen3-0.6B | 596M | 412 ms | 28 | 10 (GQA) |
+
+**ANE peak throughput (M4, H16G):**
+
+| Precision | Peak TOPS | Config |
+|-----------|-----------|--------|
+| FP16 | 18.6 | 128x conv 512ch 64x64 |
+| INT8 W8A8 | 35.1 | 128x conv 512ch 64x64 |
+
+**GPU↔ANE inference pipeline (M4, seq=256):**
+
+| Model | GPU Prefill | ANE Decode | Total |
+|-------|------------|------------|-------|
+| Stories110M | 6.7ms | 1.9ms | 8.8ms |
+| Qwen3-0.6B | 9.7ms | 2.3ms | 12.0ms |
 
 ## Disclaimer
 
diff --git a/ane_int8_bench.m b/ane_int8_bench.m
new file mode 100644
index 0000000..ff41f65
--- /dev/null
+++ b/ane_int8_bench.m
@@ -0,0 +1,268 @@
+// ane_int8_bench.m — INT8 W8A8 benchmark on ANE via _ANEInMemoryModel
+// Build: xcrun clang -O2 -fobjc-arc -framework Foundation -framework IOSurface -ldl -o ane_int8_bench ane_int8_bench.m
+// Usage: ./ane_int8_bench
+//
+// Tests FP16 vs W8A8 (int8 weights + int8 activation caching) throughput.
+// Key MIL ops: constexpr_affine_dequantize, quantize, dequantize
+#import <Foundation/Foundation.h>
+#import <objc/runtime.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <mach/mach_time.h>
+#import <IOSurface/IOSurface.h>
+
+static mach_timebase_info_data_t g_tb;  // tick→nanosecond ratio (numer/denom), filled in by main()
+static double ticksToMs(uint64_t t) { double ns = (double)t * g_tb.numer / g_tb.denom; return ns / 1e6; }
+
+// Int8 weight file: a 64-byte file header, then `depth` chunks, each a 64-byte chunk header
+// followed by ch*ch random int8 weights (1 byte per element).
+static NSData *buildWeightBlobInt8(int ch, int depth) {
+    NSUInteger payloadBytes = ch * ch * 1;
+    NSUInteger stride = 64 + payloadBytes;
+    NSUInteger blobBytes = 64 + stride * depth;
+    uint8_t *blob = calloc(blobBytes, 1);
+    blob[0] = 0x01; blob[4] = 0x02;
+    uint8_t *hdr = blob + 64;
+    for (int layer = 0; layer < depth; layer++, hdr += stride) {
+        hdr[0]=0xEF; hdr[1]=0xBE; hdr[2]=0xAD; hdr[3]=0xDE; hdr[4]=0x01; hdr[10]=0x08;
+        int8_t *w = (int8_t*)(hdr + 64);
+        for (NSUInteger k = 0; k < payloadBytes; k++) w[k] = (int8_t)(arc4random() % 256 - 128);
+    }
+    return [NSData dataWithBytesNoCopy:blob length:blobBytes freeWhenDone:YES];
+}
+
+// FP16 weight file: 64-byte file header, then per layer a 64-byte chunk header + ch*ch random fp16 weights in [-0.5, 0.5)
+static NSData *buildWeightBlobFP16(int ch, int depth) {
+    NSUInteger payloadBytes = ch * ch * 2;
+    NSUInteger stride = 64 + payloadBytes;
+    NSUInteger blobBytes = 64 + stride * depth;
+    uint8_t *blob = calloc(blobBytes, 1);
+    blob[0] = 0x01; blob[4] = 0x02;
+    NSUInteger count = (NSUInteger)(ch*ch);
+    for (int layer = 0; layer < depth; layer++) {
+        uint8_t *hdr = blob + 64 + layer * stride;
+        hdr[0]=0xEF; hdr[1]=0xBE; hdr[2]=0xAD; hdr[3]=0xDE; hdr[4]=0x01; hdr[10]=0x10;
+        _Float16 *w = (_Float16*)(hdr + 64);
+        for (NSUInteger k = 0; k < count; k++) w[k] = (_Float16)(((float)(arc4random()%1000) - 500.0f) * 0.001f);
+    }
+    return [NSData dataWithBytesNoCopy:blob length:blobBytes freeWhenDone:YES];
+}
+
+// Generate W8A8 INT8 MIL text: `depth` chained 1x1 convs on a [1, ch, sp, sp] fp16 input, each with int8 weights (constexpr_affine_dequantize) and a quantize→dequantize pair between consecutive convs
+static NSString *genMILInt8(int ch, int sp, int depth) {
+    NSMutableString *m = [NSMutableString string];
+    [m appendString:@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, {\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, {\"coremltools-version\", \"9.0\"}})]\n{\n"];
+    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, %d, %d]> x) {\n", ch, sp, sp];
+    // Conv attributes shared by every layer: "valid" padding, stride 1, zero pad, dilation 1, groups 1
+    [m appendString:@"        string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n"
+        @"        tensor<int32, [2]> c_strides = const()[name = string(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
+        @"        tensor<int32, [4]> c_pad = const()[name = string(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+        @"        tensor<int32, [2]> c_dilations = const()[name = string(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
+        @"        int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n"];
+    // Activation quantize and dequantize use the same scale (0x1p-3 = 2^-3) with int8 as the quantized dtype
+    [m appendString:@"        fp16 q_scale = const()[name = string(\"q_scale\"), val = fp16(0x1p-3)];\n"
+                    @"        string q_dtype = const()[name = string(\"q_dtype\"), val = string(\"int8\")];\n"
+                    @"        fp16 dq_scale = const()[name = string(\"dq_scale\"), val = fp16(0x1p-3)];\n"];
+
+    NSUInteger cs = 64 + ch * ch * 1;  // bytes per layer in weight.bin: 64-byte chunk header + ch*ch int8 values
+    NSString *prev = @"x";  // name of the tensor feeding the next conv; starts at the function input
+    for (int i = 0; i < depth; i++) {
+        // constexpr_affine_dequantize: int8 weights → fp16 at compile time; layer i's data is referenced at offset 64 + i*cs
+        [m appendFormat:
+            @"        tensor<fp16, [%d, %d, 1, 1]> W%d = constexpr_affine_dequantize()"
+            @"[axis = int32(0), name = string(\"W%d\"), "
+            @"quantized_data = tensor<int8, [%d, %d, 1, 1]>"
+            @"(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu))), "
+            @"scale = fp16(0x1p-3), zero_point = int8(0)];\n",
+            ch, ch, i, i, ch, ch, (unsigned long)(64 + i * cs)];
+        // conv i: applies W<i> to `prev` and names its fp16 output c<i>
+        NSString *conv_out = [NSString stringWithFormat:@"c%d", i];
+        [m appendFormat:@"        tensor<fp16, [1, %d, %d, %d]> %@ = conv(dilations = c_dilations, groups = c_groups, pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W%d, x = %@)[name = string(\"%@\")];\n",
+            ch, sp, sp, conv_out, i, prev, conv_out];
+
+        if (i < depth - 1) {
+            // quantize: fp16 → int8 (emitted after every conv except the last)
+            NSString *q_out = [NSString stringWithFormat:@"q%d", i];
+            [m appendFormat:@"        tensor<int8, [1, %d, %d, %d]> %@ = quantize(input = %@, output_dtype = q_dtype, scale = q_scale)[name = string(\"%@\")];\n",
+                ch, sp, sp, q_out, conv_out, q_out];
+            // dequantize: int8 → fp16, which becomes the next conv's input
+            NSString *dq_out = [NSString stringWithFormat:@"dq%d", i];
+            [m appendFormat:@"        tensor<fp16, [1, %d, %d, %d]> %@ = dequantize(input = %@, scale = dq_scale)[name = string(\"%@\")];\n",
+                ch, sp, sp, dq_out, q_out, dq_out];
+            prev = dq_out;
+        } else {
+            prev = conv_out;  // last conv: its fp16 output is the function result, no quantize pair
+        }
+    }
+    [m appendFormat:@"    } -> (%@);\n}\n", prev];  // declare the final tensor as the function output
+    return m;
+}
+
+// Generate FP16 baseline MIL text: `depth` chained 1x1 convs on a [1, ch, sp, sp] fp16 input, with fp16 weights read straight from weight.bin and no quantize/dequantize ops
+static NSString *genMILFP16(int ch, int sp, int depth) {
+    NSMutableString *m = [NSMutableString string];
+    [m appendString:@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, {\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, {\"coremltools-version\", \"9.0\"}})]\n{\n"];
+    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, %d, %d]> x) {\n", ch, sp, sp];
+    [m appendString:@"        string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n"
+        @"        tensor<int32, [2]> c_strides = const()[name = string(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
+        @"        tensor<int32, [4]> c_pad = const()[name = string(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+        @"        tensor<int32, [2]> c_dilations = const()[name = string(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
+        @"        int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n"];
+
+    NSUInteger cs = 64 + ch * ch * 2;  // bytes per layer in weight.bin: 64-byte chunk header + ch*ch fp16 values
+    NSString *prev = @"x";  // name of the tensor feeding the next conv; starts at the function input
+    for (int i = 0; i < depth; i++) {
+        // fp16 weights from blob; layer i's data is referenced at offset 64 + i*cs
+        [m appendFormat:
+            @"        tensor<fp16, [%d, %d, 1, 1]> W%d = const()"
+            @"[name = string(\"W%d\"), "
+            @"val = tensor<fp16, [%d, %d, 1, 1]>"
+            @"(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n",
+            ch, ch, i, i, ch, ch, (unsigned long)(64 + i * cs)];
+        NSString *conv_out = [NSString stringWithFormat:@"c%d", i];  // conv i's output tensor is named c<i>
+        [m appendFormat:@"        tensor<fp16, [1, %d, %d, %d]> %@ = conv(dilations = c_dilations, groups = c_groups, pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W%d, x = %@)[name = string(\"%@\")];\n",
+            ch, sp, sp, conv_out, i, prev, conv_out];
+        prev = conv_out;  // each conv feeds the next one directly
+    }
+    [m appendFormat:@"    } -> (%@);\n}\n", prev];  // declare the final conv output as the function output
+    return m;
+}
+
+// Compiles milStr + weight blob wb, runs it on one [1, ch, sp, sp] fp16 in/out surface pair and returns mean ms per
+// evaluation (50 timed runs after 10 warmups). Negative = failure: -1 desc, -2 model, -3 compile, -4 load, -5 IOSurface, -6 evaluate.
+static double benchModel(NSString *milStr, NSData *wb, int ch, int sp, const char *label) {
+    @autoreleasepool {
+        NSError *e = nil;
+        NSData *milData = [milStr dataUsingEncoding:NSUTF8StringEncoding];
+        Class D = NSClassFromString(@"_ANEInMemoryModelDescriptor");
+        Class I = NSClassFromString(@"_ANEInMemoryModel");
+        Class AR = NSClassFromString(@"_ANERequest");
+        Class AIO = NSClassFromString(@"_ANEIOSurfaceObject");
+
+        id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(D,
+            @selector(modelWithMILText:weights:optionsPlist:), milData,
+            @{@"@model_path/weights/weight.bin": @{@"offset": @0, @"data": wb}}, nil);
+        if (!desc) { printf("  %s: desc FAIL\n", label); return -1; }
+        id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(I, @selector(inMemoryModelWithDescriptor:), desc);
+        if (!mdl) { printf("  %s: mdl FAIL\n", label); return -2; }
+
+        id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
+        NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
+        NSFileManager *fm = [NSFileManager defaultManager];
+        [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"]
+      withIntermediateDirectories:YES attributes:nil error:nil];
+        [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+        [wb writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
+
+        if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
+                mdl, @selector(compileWithQoS:options:error:), 0, @{}, &e)) {
+            printf("  %s: compile FAIL: %s\n", label, e ? [[e description] UTF8String] : "?");
+            [fm removeItemAtPath:td error:nil];
+            return -3;
+        }
+        if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
+                mdl, @selector(loadWithQoS:options:error:), 0, @{}, &e)) {
+            printf("  %s: load FAIL: %s\n", label, e ? [[e description] UTF8String] : "?");
+            [fm removeItemAtPath:td error:nil];
+            return -4;
+        }
+
+        double ms = -5;  // stays an error code unless surfaces are created and every evaluate succeeds
+        NSUInteger bytes = (NSUInteger)ch * sp * sp * 2;  // fp16 I/O
+        NSDictionary *props = @{
+            (id)kIOSurfaceWidth: @(bytes), (id)kIOSurfaceHeight: @1,
+            (id)kIOSurfaceBytesPerElement: @1, (id)kIOSurfaceBytesPerRow: @(bytes),
+            (id)kIOSurfaceAllocSize: @(bytes), (id)kIOSurfacePixelFormat: @0};
+        IOSurfaceRef ioI = IOSurfaceCreate((__bridge CFDictionaryRef)props);
+        IOSurfaceRef ioO = IOSurfaceCreate((__bridge CFDictionaryRef)props);
+        if (ioI && ioO) {
+            id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO, @selector(objectWithIOSurface:), ioI);
+            id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO, @selector(objectWithIOSurface:), ioO);
+            id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(AR,
+                @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+                @[wI], @[@0], @[wO], @[@0], nil, nil, @0);
+            BOOL ok = YES;  // cleared by the first failed evaluate so a failing model is never reported as a timing
+            for (int i = 0; i < 10 && ok; i++)  // warmup
+                ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+                    mdl, @selector(evaluateWithQoS:options:request:error:), 0, @{}, req, &e);
+            int iters = 50;
+            uint64_t t0 = mach_absolute_time();
+            for (int i = 0; i < iters && ok; i++)
+                ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+                    mdl, @selector(evaluateWithQoS:options:request:error:), 0, @{}, req, &e);
+            double timed = ticksToMs(mach_absolute_time() - t0) / iters;
+            if (ok) ms = timed;
+            else { ms = -6; printf("  %s: eval FAIL: %s\n", label, e ? [[e description] UTF8String] : "?"); }
+        } else { printf("  %s: IOSurface FAIL\n", label); }
+        ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(
+            mdl, @selector(unloadWithQoS:error:), 0, &e);
+        if (ioI) CFRelease(ioI); if (ioO) CFRelease(ioO);  // CFRelease(NULL) would crash
+        [fm removeItemAtPath:td error:nil];
+        return ms;
+    }
+}
+
+// Benchmarks every config twice — as an FP16 conv chain and as its W8A8 counterpart — and prints
+// weight size, GOP, ms/eval, TOPS and the FP16/INT8 time ratio. Returns 1 if the ANE framework cannot be loaded.
+int main(void) {
+    mach_timebase_info(&g_tb);
+    // Every _ANE* class is looked up by name from this private framework, so nothing can run without it.
+    if (!dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW)) {
+        fprintf(stderr, "dlopen AppleNeuralEngine failed: %s\n", dlerror());
+        return 1;
+    }
+
+    // Query HW info. Hold the NSString itself: -UTF8String points into storage that dies with the string.
+    Class DI = NSClassFromString(@"_ANEDeviceInfo");
+    NSString *aneType = @"unknown";
+    id subType = DI ? ((id(*)(Class,SEL))objc_msgSend)(DI, @selector(aneSubType)) : nil;
+    if (subType) aneType = [subType description];
+
+    printf("=== ANE INT8 W8A8 Benchmark (M4, %s) ===\n\n", aneType.UTF8String);
+    printf("%-30s %7s %7s %9s %7s %7s\n", "Config", "W(MB)", "GOP", "ms/eval", "TOPS", "Ratio");
+    printf("--------------------------------------------------------------------------------\n");
+
+    typedef struct { int ch; int sp; int depth; } Config;
+    Config configs[] = {
+        {512, 64, 128}, {512, 64, 64}, {256, 64, 256}, {256, 64, 128}, {384, 64, 128},
+    };
+    int ncfg = sizeof(configs) / sizeof(configs[0]);
+
+    for (int ci = 0; ci < ncfg; ci++) @autoreleasepool {  // one pool per config so MIL text and weight blobs are freed promptly
+        int ch = configs[ci].ch, sp = configs[ci].sp, depth = configs[ci].depth;
+        double gop = 2.0 * ch * ch * sp * sp * depth / 1e9;  // 2 ops per multiply-accumulate
+        char lbl[64];
+
+        // FP16
+        double w_fp16 = (double)ch * ch * 2 * depth / 1024 / 1024;
+        NSString *milFP16 = genMILFP16(ch, sp, depth);
+        NSData *wbFP16 = buildWeightBlobFP16(ch, depth);
+        snprintf(lbl, sizeof lbl, "FP16 %dx conv %dch", depth, ch);
+        double ms_fp16 = benchModel(milFP16, wbFP16, ch, sp, lbl);
+
+        // INT8 W8A8
+        double w_int8 = (double)ch * ch * 1 * depth / 1024 / 1024;
+        NSString *milInt8 = genMILInt8(ch, sp, depth);
+        NSData *wbInt8 = buildWeightBlobInt8(ch, depth);
+        snprintf(lbl, sizeof lbl, "W8A8 %dx conv %dch", depth, ch);
+        double ms_int8 = benchModel(milInt8, wbInt8, ch, sp, lbl);
+
+        if (ms_fp16 > 0 && ms_int8 > 0) {
+            // GOP / ms == 1e9 ops / 1e-3 s == 1e12 ops/s == TOPS
+            double tops_fp16 = gop / ms_fp16;
+            double tops_int8 = gop / ms_int8;
+            double ratio = ms_fp16 / ms_int8;
+            NSString *cfg = [NSString stringWithFormat:@"%dx conv %dch %dx%d", depth, ch, sp, sp];
+            printf("FP16 %-25s %6.1f  %6.2f  %7.3f ms %6.2f\n",
+                   cfg.UTF8String, w_fp16, gop, ms_fp16, tops_fp16);
+            printf("W8A8 %-25s %6.1f  %6.2f  %7.3f ms %6.2f  %.2fx\n",
+                   cfg.UTF8String, w_int8, gop, ms_int8, tops_int8, ratio);
+            printf("\n");
+        } else {
+            printf("  %dx conv %dch: FP16=%.1f INT8=%.1f (FAIL)\n", depth, ch, ms_fp16, ms_int8);
+        }
+    }
+
+    printf("=== Done ===\n");
+    return 0;
+}
diff --git a/bridge/ane_bridge.h b/bridge/ane_bridge.h
index 3e8ff47..50295d5 100644
--- a/bridge/ane_bridge.h
+++ b/bridge/ane_bridge.h
@@ -77,6 +77,19 @@ uint8_t *ane_bridge_build_weight_blob(const float *src, int rows, int cols,
 uint8_t *ane_bridge_build_weight_blob_transposed(const float *src, int rows, int cols,
                                                    size_t *out_len);
 
+// Build an int8 weight blob in ANE format (64-byte header followed by rows*cols int8 values)
+// src: already-quantized int8 weights [rows x cols]; the blob stores no scale or zero point,
+// so pass those to constexpr_affine_dequantize in the MIL that references this blob
+// Returns allocated buffer and sets out_len (total blob bytes). Caller must free().
+uint8_t *ane_bridge_build_weight_blob_int8(const int8_t *src, int rows, int cols,
+                                            size_t *out_len);
+
+// Quantize float32 weights [rows x cols] to int8 and build ANE blob in one step
+// Symmetric per-tensor quantization: one scale = max(abs(src)) / 127 (1.0 if src is all zeros)
+// Returns allocated buffer, sets out_len and out_scale (that single scale). Caller must free().
+uint8_t *ane_bridge_build_weight_blob_quantized(const float *src, int rows, int cols,
+                                                 float *out_scale, size_t *out_len);
+
 // Free a blob allocated by ane_bridge_build_weight_blob*
 void ane_bridge_free_blob(void *ptr);
 
diff --git a/bridge/ane_bridge.m b/bridge/ane_bridge.m
index 2b27ddc..dac5030 100644
--- a/bridge/ane_bridge.m
+++ b/bridge/ane_bridge.m
@@ -93,7 +93,7 @@ ANEKernelHandle *ane_bridge_compile_multi_weights(
 
         id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(
             g_ANEDesc, @selector(modelWithMILText:weights:optionsPlist:),
-            milData, wdict.count > 0 ? wdict : nil, nil);
+            milData, wdict.count > 0 ? wdict : @{}, nil);
         if (!desc) {
             fprintf(stderr, "ane_bridge: modelWithMILText failed\n");
             return NULL;
@@ -326,3 +326,46 @@ uint8_t *ane_bridge_build_weight_blob_transposed(const float *src, int rows, int
     *out_len = total;
     return buf;
 }
+
+uint8_t *ane_bridge_build_weight_blob_int8(const int8_t *src, int rows, int cols,
+                                            size_t *out_len) {
+    // Blob = 64-byte header + rows*cols int8 values copied verbatim from src.
+    // Returns NULL (and leaves *out_len untouched) on bad arguments or allocation failure.
+    if (!src || !out_len || rows < 0 || cols < 0) return NULL;
+    size_t wsize = (size_t)rows * (size_t)cols;  // size_t math: int would overflow past 2^31 elements
+    size_t total = 64 + wsize;                   // 64-byte header + data
+    uint8_t *buf = (uint8_t *)calloc(total, 1);
+    if (!buf) return NULL;
+    buf[0] = 0xEF; buf[1] = 0xBE; buf[2] = 0xAD; buf[3] = 0xDE;  // header magic bytes
+    buf[4] = 0x01; buf[10] = 0x08;                               // buf[10]: 8-bit element marker
+    memcpy(buf + 64, src, wsize);
+    *out_len = total;
+    return buf;
+}
+
+uint8_t *ane_bridge_build_weight_blob_quantized(const float *src, int rows, int cols,
+                                                 float *out_scale, size_t *out_len) {
+    // Symmetric per-tensor int8 quantization: q = round(src / scale), scale = max|src| / 127.
+    // Returns NULL (outputs untouched) on bad arguments or allocation failure.
+    if (!src || !out_scale || !out_len || rows < 0 || cols < 0) return NULL;
+    size_t wsize = (size_t)rows * (size_t)cols;  // size_t math: int would overflow past 2^31 elements
+    float max_abs = 0.0f;
+    for (size_t i = 0; i < wsize; i++) {
+        float a = src[i] < 0 ? -src[i] : src[i];
+        if (a > max_abs) max_abs = a;
+    }
+    float scale = max_abs / 127.0f;
+    if (scale == 0.0f) scale = 1.0f;  // all-zero input: avoid dividing by zero below
+    int8_t *qdata = (int8_t *)malloc(wsize ? wsize : 1);  // malloc(0) may legally return NULL
+    if (!qdata) return NULL;
+    for (size_t i = 0; i < wsize; i++) {
+        float v = src[i] / scale;
+        if (v > 127.0f) v = 127.0f;
+        if (v < -128.0f) v = -128.0f;
+        qdata[i] = (int8_t)(v + (v >= 0 ? 0.5f : -0.5f));  // round half away from zero
+    }
+    uint8_t *blob = ane_bridge_build_weight_blob_int8(qdata, rows, cols, out_len);
+    free(qdata);
+    if (blob) *out_scale = scale;
+    return blob;
+}