ANE/benchmarks/community_results.json

{
  "report_date": "2026-03-04",
  "source": "https://github.com/maderix/ANE/issues/3",
  "model": "Stories110M (12-layer transformer, 109M params)",
  "config": {"dim": 768, "hidden": 2048, "heads": 12, "seq": 256, "vocab": 32000, "layers": 12},
  "training_results": [
    {
      "chip": "M1 Pro",
      "cores": "10-core CPU",
      "ram_gb": 32,
      "macos": "15.0",
      "ms_per_step": [148, 163],
      "ane_ms": [32, 35],
      "compile_ms": [7900, 8500],
      "ane_tflops": [0.57, 0.63],
      "ane_util_pct": [3.6, 4.0],
      "benchmarks_pass": false,
      "notes": "Standalone benchmarks fail (MIL compat). Training works via stories_mil.h.",
      "contributor": "moriwang"
    },
    {
      "chip": "M1 Max",
      "cores": "10-core CPU",
      "ram_gb": 64,
      "macos": "15.6.1",
      "ms_per_step": [143, 167],
      "ane_ms": [35, 45],
      "compile_ms": [7100, 7100],
      "ane_tflops": [0.54, 0.65],
      "ane_util_pct": [3.4, 4.1],
      "benchmarks_pass": false,
      "notes": "Same MIL compat issue as M1 Pro.",
      "contributor": "andyg5000"
    },
    {
      "chip": "M3 Pro",
      "cores": "12-core CPU",
      "ram_gb": 36,
      "macos": "15.7.4",
      "peak_tflops": 16.77,
      "sustained_tflops": 15.04,
      "sustained_util_pct": 95.2,
      "channel_constraint": "ch=512 only",
      "notes": "Only ch=512 compiles. 52 values tested. Peak at 128x conv 512ch sp2048.",
      "contributor": "D-Ogi"
    },
    {
      "chip": "M4 Pro",
      "cores": "unknown",
      "ram_gb": null,
      "macos": null,
      "ms_per_step": [69, 73],
      "ane_ms": [8.9, 8.9],
      "compile_ms": [3465, 3465],
      "ane_tflops": [1.28, 1.28],
      "ane_util_pct": [8.1, 8.1],
      "peak_tflops_inmem": 12.57,
      "notes": "sram_probe and inmem_bench fail. inmem_peak and training work.",
      "contributor": "srt54558"
    },
    {
      "chip": "M4 Max",
      "cores": "unknown",
      "ram_gb": null,
      "macos": null,
      "ms_per_step": [64, 64],
      "ane_ms": [10.2, 10.2],
      "compile_ms": [3531, 3531],
      "ane_tflops": [1.45, 1.45],
      "ane_util_pct": [9.2, 9.2],
      "peak_tflops_inmem": 10.93,
      "notes": "Fastest training ms/step overall.",
      "contributor": "SethBurkart123"
    },
    {
      "chip": "M5",
      "cores": "10-core (4P+6E)",
      "ram_gb": 16,
      "macos": "26.3",
      "ms_per_step": [101, 120],
      "ane_ms": [9.1, 9.8],
      "compile_ms": [3200, 3400],
      "ane_tflops": [0.77, 0.91],
      "ane_util_pct": [4.9, 5.8],
      "peak_tflops_inmem": 12.44,
      "notes": "H16 ANE family (same as M4). Training works with existing program(1.3) MIL.",
      "contributor": "GitBubble"
    },
    {
      "chip": "M5",
      "cores": "unknown",
      "ram_gb": 32,
      "macos": "26.4",
      "peak_tflops_inmem": 12.17,
      "notes": "inmem_peak only, no training data submitted.",
      "contributor": "elijah-pelton"
    }
  ],
  "neural_engine_specs": {
    "M1":       {"ne_cores": 16, "rated_tops": 11},
    "M1_Max":   {"ne_cores": 16, "rated_tops": 11},
    "M1_Ultra": {"ne_cores": 32, "rated_tops": 22},
    "M2":       {"ne_cores": 16, "rated_tops": 15.8},
    "M2_Max":   {"ne_cores": 16, "rated_tops": 15.8},
    "M2_Ultra": {"ne_cores": 32, "rated_tops": 31.6},
    "M3":       {"ne_cores": 16, "rated_tops": 15.8},
    "M3_Max":   {"ne_cores": 16, "rated_tops": 15.8},
    "M3_Ultra": {"ne_cores": 32, "rated_tops": 31.6},
    "M4":       {"ne_cores": 16, "rated_tops": 38, "note": "INT8/mixed-precision spec"},
    "M4_Max":   {"ne_cores": 16, "rated_tops": 38, "note": "INT8/mixed-precision spec"},
    "M5":       {"ne_cores": 16, "rated_tops": null, "estimated_tops": 19}
  }
}