From 050bc4fdf0fc28095039d421f325f787486a3726 Mon Sep 17 00:00:00 2001 From: maderix Date: Wed, 4 Mar 2026 05:30:00 -0800 Subject: [PATCH] Add cross-generation ANE benchmark report from issue #3 Community-submitted results for M1 Pro/Max, M3 Pro, M4 Pro/Max, M5. Includes training performance, peak throughput, MIL compatibility matrix, and structured JSON data. --- benchmarks/ANE_BENCHMARK_REPORT.md | 135 +++++++++++++++++++++++++++++ benchmarks/community_results.json | 106 ++++++++++++++++++++++ 2 files changed, 241 insertions(+) create mode 100644 benchmarks/ANE_BENCHMARK_REPORT.md create mode 100644 benchmarks/community_results.json diff --git a/benchmarks/ANE_BENCHMARK_REPORT.md b/benchmarks/ANE_BENCHMARK_REPORT.md new file mode 100644 index 0000000..75ac6c8 --- /dev/null +++ b/benchmarks/ANE_BENCHMARK_REPORT.md @@ -0,0 +1,135 @@ +# Apple Neural Engine — Cross-Generation Benchmark Report + +Community-submitted benchmark data from [Issue #3](https://github.com/maderix/ANE/issues/3). +All results use Stories110M (12-layer transformer, 109M params, dim=768, seq=256). + +## Training Performance (Static Pipeline) + +``` +Chip ms/step ANE ms Compile/10 ANE TFLOPS Util% Contributor +───────────────────────────────────────────────────────────────────────────────── +M1 Pro 148-163 32-35 7.9-8.5s 0.57-0.63 3.6-4.0 @moriwang +M1 Max 143-167 35-45 ~7.1s 0.54-0.65 3.4-4.1 @andyg5000 +M3 Ultra* 91 ~10 ~3.7s 0.88 5.6 (repo ref) +M4 Pro 69-73 8.9 ~3.5s 1.28 8.1 @srt54558 +M4 Max 64 10.2 ~3.5s 1.45 9.2 @SethBurkart123 +M5 101-120 9.1-9.8 3.2-3.4s 0.77-0.91 4.9-5.8 @GitBubble +``` + +*M3 Ultra = reference platform this project was developed on. + +## Peak ANE Throughput (inmem_peak, 128x conv 512ch sp64) + +``` +Chip TFLOPS Rated TOPS Utilization +─────────────────────────────────────────────────── +M1 Pro FAIL 11 - (MIL compat issue) +M1 Max FAIL 11 - (MIL compat issue) +M3 Pro 9.98 15.8 63% +M4 Pro 12.57 38 33% +M4 Max 10.93 38 29% +M5 12.17 ~19* 64% +M5 (other) 12.44 ~19* 65% +``` + +*M5 ANE TOPS not officially disclosed; ~19 TOPS estimated from measured peak. + +## Comparative Chart + +``` +ANE Training Speed (ms/step, lower is better) +══════════════════════════════════════════════════════════════ + +M1 Pro ████████████████████████████████████████░░░░ 148-163 ms +M1 Max ██████████████████████████████████████░░░░░░ 143-167 ms +M3 Ultra ██████████████████░░░░░░░░░░░░░░░░░░░░░░░░░ 91 ms +M4 Pro ██████████████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 69-73 ms +M4 Max ████████████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 64 ms +M5 ████████████████████████░░░░░░░░░░░░░░░░░░░░ 101-120 ms + + 0 50 100 150 200 + + +Peak ANE Throughput (TFLOPS, higher is better) +══════════════════════════════════════════════════════════════ + +M1 Pro FAIL (MIL compat) +M1 Max FAIL (MIL compat) +M3 Pro ████████████████████░░░░░░░░░░░░░░░░░░░░░░░░ 9.98 +M4 Pro ████████████████████████████████░░░░░░░░░░░░░ 12.57 +M4 Max ██████████████████████░░░░░░░░░░░░░░░░░░░░░░ 10.93 +M5 █████████████████████████░░░░░░░░░░░░░░░░░░░ 12.17 + + 0 3 6 9 12 15 18 + + +ANE Sustained Throughput (TFLOPS, 5s window) +══════════════════════════════════════════════════════════════ + +M3 Pro ██████████████████████████████████████████████ 15.04 (95.2%) + + 0 3 6 9 12 15 18 + (Only M3 Pro submitted sustained benchmark) +``` + +## Key Findings + +### M1/M1 Pro/M1 Max +- **Standalone benchmarks fail** — `ane_mil_gen.h` single-blob weight format rejected +- **Training works** via `stories_mil.h` (separate per-matrix weight blobs) +- ANE compiler handles weight blobs differently from M4+ +- Training at 148-167 ms/step, ~0.6 TFLOPS + +### M3 Pro +- **Only ch=512 compiles** — 52 channel values tested (1-4096), only 512 accepted +- Fixed 512-wide lane structure in SRAM tiling +- **Peak: 16.77 TFLOPS** (106% of rated 15.8 TOPS) at 128x conv 512ch sp2048 +- **Sustained: 15.04 TFLOPS** over 5 seconds (95.2% utilization) +- Spatial dimension is the key to peak throughput (sp64→sp2048 = 2x improvement) + +### M4 Pro / M4 Max +- Flexible channel support (256/384/512/768+) +- M4 Pro: peak 12.57 TFLOPS, training at 72.5 ms/step +- M4 Max: peak 10.93 TFLOPS, training at 64 ms/step (fastest overall) +- `sram_probe` and `inmem_bench` fail on M4 Pro (same MIL compat issue) + +### M5 +- Training works out of the box with existing `program(1.3)` MIL +- Training speed 101-120 ms/step (slower than M4 Max, comparable to M3 Ultra) +- Peak ANE throughput ~12.2-12.4 TFLOPS (similar to M4 Pro) +- ANE appears to be same H16 family as M4 +- **M5 Pro/Max not yet benchmarked** — Fusion Architecture may change ANE behavior + +### Cross-Generation MIL Compatibility + +``` +Feature M1 M3 M4 M5 +───────────────────────────────────────────────────────── +program(1.3) / ios18 PARTIAL YES YES YES +Single-blob weights FAIL YES YES YES +Per-matrix weight blobs YES YES YES YES +Channel flexibility ? ch=512 FLEX FLEX +BLOBFILE offset refs FAIL YES YES YES +``` + +## macOS Compatibility Issues + +- **macOS 26.x** — `[MLModel compileModelAtURL:]` broken for standalone benchmarks + (fixed in PR #27: switched to in-memory MIL compilation) +- **macOS 15.x** — Works for all M-series with correct MIL format +- M1 generation requires `stories_mil.h` path, not `ane_mil_gen.h` + +## How to Contribute + +Run on your hardware and post results to [Issue #3](https://github.com/maderix/ANE/issues/3): + +```bash +cd training && make train_large +./train_large ane_stories110M_ckpt.bin 256 20 1e-4 +``` + +Include: chip model, macOS version, full output with JSON lines. + +--- +*Report compiled 2026-03-04 from community submissions.* +*Contributors: @SethBurkart123, @srt54558, @andyg5000, @moriwang, @D-Ogi, @GitBubble, @elijah-pelton* diff --git a/benchmarks/community_results.json b/benchmarks/community_results.json new file mode 100644 index 0000000..a9eaaa8 --- /dev/null +++ b/benchmarks/community_results.json @@ -0,0 +1,106 @@ +{ + "report_date": "2026-03-04", + "source": "https://github.com/maderix/ANE/issues/3", + "model": "Stories110M (12-layer transformer, 109M params)", + "config": {"dim": 768, "hidden": 2048, "heads": 12, "seq": 256, "vocab": 32000, "layers": 12}, + "training_results": [ + { + "chip": "M1 Pro", + "cores": "10-core CPU", + "ram_gb": 32, + "macos": "15.0", + "ms_per_step": [148, 163], + "ane_ms": [32, 35], + "compile_ms": [7900, 8500], + "ane_tflops": [0.57, 0.63], + "ane_util_pct": [3.6, 4.0], + "benchmarks_pass": false, + "notes": "Standalone benchmarks fail (MIL compat). Training works via stories_mil.h.", + "contributor": "moriwang" + }, + { + "chip": "M1 Max", + "cores": "10-core CPU", + "ram_gb": 64, + "macos": "15.6.1", + "ms_per_step": [143, 167], + "ane_ms": [35, 45], + "compile_ms": [7100, 7100], + "ane_tflops": [0.54, 0.65], + "ane_util_pct": [3.4, 4.1], + "benchmarks_pass": false, + "notes": "Same MIL compat issue as M1 Pro.", + "contributor": "andyg5000" + }, + { + "chip": "M3 Pro", + "cores": "12-core CPU", + "ram_gb": 36, + "macos": "15.7.4", + "peak_tflops": 16.77, + "sustained_tflops": 15.04, + "sustained_util_pct": 95.2, + "channel_constraint": "ch=512 only", + "notes": "Only ch=512 compiles. 52 values tested. Peak at 128x conv 512ch sp2048.", + "contributor": "D-Ogi" + }, + { + "chip": "M4 Pro", + "cores": "unknown", + "ram_gb": null, + "macos": null, + "ms_per_step": [69, 73], + "ane_ms": [8.9, 8.9], + "compile_ms": [3465, 3465], + "ane_tflops": [1.28, 1.28], + "ane_util_pct": [8.1, 8.1], + "peak_tflops_inmem": 12.57, + "notes": "sram_probe and inmem_bench fail. inmem_peak and training work.", + "contributor": "srt54558" + }, + { + "chip": "M4 Max", + "cores": "unknown", + "ram_gb": null, + "macos": null, + "ms_per_step": [64, 64], + "ane_ms": [10.2, 10.2], + "compile_ms": [3531, 3531], + "ane_tflops": [1.45, 1.45], + "ane_util_pct": [9.2, 9.2], + "peak_tflops_inmem": 10.93, + "notes": "Fastest training ms/step overall.", + "contributor": "SethBurkart123" + }, + { + "chip": "M5", + "cores": "10-core (4P+6E)", + "ram_gb": 16, + "macos": "26.3", + "ms_per_step": [101, 120], + "ane_ms": [9.1, 9.8], + "compile_ms": [3200, 3400], + "ane_tflops": [0.77, 0.91], + "ane_util_pct": [4.9, 5.8], + "peak_tflops_inmem": 12.44, + "notes": "H16 ANE family (same as M4). Training works with existing program(1.3) MIL.", + "contributor": "GitBubble" + }, + { + "chip": "M5", + "cores": "unknown", + "ram_gb": 32, + "macos": "26.4", + "peak_tflops_inmem": 12.17, + "notes": "inmem_peak only, no training data submitted.", + "contributor": "elijah-pelton" + } + ], + "neural_engine_specs": { + "M1": {"cores": 16, "rated_tops": 11}, + "M2": {"cores": 16, "rated_tops": 15.8}, + "M3": {"cores": 16, "rated_tops": 15.8}, + "M4": {"cores": 16, "rated_tops": 38}, + "M5": {"cores": 16, "rated_tops": null, "estimated_tops": 19} + } +}