{ "report_date": "2026-03-04", "source": "https://github.com/maderix/ANE/issues/3", "model": "Stories110M (12-layer transformer, 109M params)", "config": {"dim": 768, "hidden": 2048, "heads": 12, "seq": 256, "vocab": 32000, "layers": 12}, "training_results": [ { "chip": "M1 Pro", "cores": "10-core CPU", "ram_gb": 32, "macos": "15.0", "ms_per_step": [148, 163], "ane_ms": [32, 35], "compile_ms": [7900, 8500], "ane_tflops": [0.57, 0.63], "ane_util_pct": [3.6, 4.0], "benchmarks_pass": false, "notes": "Standalone benchmarks fail (MIL compat). Training works via stories_mil.h.", "contributor": "moriwang" }, { "chip": "M1 Max", "cores": "10-core CPU", "ram_gb": 64, "macos": "15.6.1", "ms_per_step": [143, 167], "ane_ms": [35, 45], "compile_ms": [7100, 7100], "ane_tflops": [0.54, 0.65], "ane_util_pct": [3.4, 4.1], "benchmarks_pass": false, "notes": "Same MIL compat issue as M1 Pro.", "contributor": "andyg5000" }, { "chip": "M3 Pro", "cores": "12-core CPU", "ram_gb": 36, "macos": "15.7.4", "peak_tflops": 16.77, "sustained_tflops": 15.04, "sustained_util_pct": 95.2, "channel_constraint": "ch=512 only", "notes": "Only ch=512 compiles. 52 values tested. Peak at 128x conv 512ch sp2048.", "contributor": "D-Ogi" }, { "chip": "M4 Pro", "cores": "unknown", "ram_gb": null, "macos": null, "ms_per_step": [69, 73], "ane_ms": [8.9, 8.9], "compile_ms": [3465, 3465], "ane_tflops": [1.28, 1.28], "ane_util_pct": [8.1, 8.1], "peak_tflops_inmem": 12.57, "notes": "sram_probe and inmem_bench fail. inmem_peak and training work.", "contributor": "srt54558" }, { "chip": "M4 Max", "cores": "unknown", "ram_gb": null, "macos": null, "ms_per_step": [64, 64], "ane_ms": [10.2, 10.2], "compile_ms": [3531, 3531], "ane_tflops": [1.45, 1.45], "ane_util_pct": [9.2, 9.2], "peak_tflops_inmem": 10.93, "notes": "Fastest training ms/step overall.", "contributor": "SethBurkart123" }, { "chip": "M5", "cores": "10-core (4P+6E)", "ram_gb": 16, "macos": "26.3", "ms_per_step": [101, 120], "ane_ms": [9.1, 9.8], "compile_ms": [3200, 3400], "ane_tflops": [0.77, 0.91], "ane_util_pct": [4.9, 5.8], "peak_tflops_inmem": 12.44, "notes": "H16 ANE family (same as M4). Training works with existing program(1.3) MIL.", "contributor": "GitBubble" }, { "chip": "M5", "cores": "unknown", "ram_gb": 32, "macos": "26.4", "peak_tflops_inmem": 12.17, "notes": "inmem_peak only, no training data submitted.", "contributor": "elijah-pelton" } ], "neural_engine_specs": { "M1": {"ne_cores": 16, "rated_tops": 11}, "M1_Max": {"ne_cores": 16, "rated_tops": 11}, "M1_Ultra": {"ne_cores": 32, "rated_tops": 22}, "M2": {"ne_cores": 16, "rated_tops": 15.8}, "M2_Max": {"ne_cores": 16, "rated_tops": 15.8}, "M2_Ultra": {"ne_cores": 32, "rated_tops": 31.6}, "M3": {"ne_cores": 16, "rated_tops": 15.8}, "M3_Max": {"ne_cores": 16, "rated_tops": 15.8}, "M3_Ultra": {"ne_cores": 32, "rated_tops": 31.6}, "M4": {"ne_cores": 16, "rated_tops": 38, "note": "INT8/mixed-precision spec"}, "M4_Max": {"ne_cores": 16, "rated_tops": 38, "note": "INT8/mixed-precision spec"}, "M5": {"ne_cores": 16, "rated_tops": null, "estimated_tops": 19} } }