From 9832240e72f2fc9dd9f57eac6cabfc709ae0405a Mon Sep 17 00:00:00 2001 From: Erik Bray Date: Tue, 3 Mar 2026 14:29:11 +0100 Subject: [PATCH] [feat] Community benchmark system: standardized JSON output, auto-submit to dashboard, aggregation script, M4 Max reference result --- community_benchmarks/README.md | 111 ++++++ .../apple_m4_max_20260303.json | 67 ++++ scripts/aggregate_benchmarks.py | 153 +++++++ scripts/run_community_benchmark.sh | 375 ++++++++++++++++++ 4 files changed, 706 insertions(+) create mode 100644 community_benchmarks/README.md create mode 100644 community_benchmarks/apple_m4_max_20260303.json create mode 100644 scripts/aggregate_benchmarks.py create mode 100755 scripts/run_community_benchmark.sh diff --git a/community_benchmarks/README.md b/community_benchmarks/README.md new file mode 100644 index 0000000..f9e08c6 --- /dev/null +++ b/community_benchmarks/README.md @@ -0,0 +1,111 @@ +# ANE Community Benchmarks + +Standardized benchmark results from different Apple Silicon machines, contributed by the community. + +## How to Run + +```bash +# Full benchmark (SRAM probe + peak TFLOPS + training) +bash scripts/run_community_benchmark.sh + +# Quick benchmark (skip training -- useful if you don't have training data) +bash scripts/run_community_benchmark.sh --skip-training + +# Custom training steps +bash scripts/run_community_benchmark.sh --steps 50 +``` + +This produces a JSON file in `community_benchmarks/` named `<chip>_<date>.json`. + +### Prerequisites + +- macOS on Apple Silicon (M1/M2/M3/M4/M5) +- Xcode command line tools (`xcode-select --install`) +- Python 3.11-3.13 with `coremltools` (auto-installed into a temp venv) +- For training benchmarks: run `cd training && make data` first + +## How to Submit + +### Option 1: Pull Request + +1. Fork this repo +2. Run the benchmark: `bash scripts/run_community_benchmark.sh` +3. Commit the generated JSON file from `community_benchmarks/` +4. Open a PR + +### Option 2: GitHub Issue + +1. Run the benchmark +2. 
Open a [new issue](../../issues/new) with title "Benchmark: [Your Chip]" +3. Paste the contents of your JSON file + +## Viewing Aggregated Results + +```bash +python3 scripts/aggregate_benchmarks.py +``` + +This reads all JSON files in `community_benchmarks/` and prints a markdown comparison table. + +## JSON Schema (v1) + +Each submission contains: + +```json +{ + "schema_version": 1, + "timestamp": "2026-03-03T12:00:00Z", + "system": { + "chip": "Apple M4 Max", + "machine": "Mac16,5", + "macos_version": "26.2", + "memory_gb": 128, + "neural_engine_cores": "16" + }, + "benchmarks": { + "sram_probe": [ + {"channels": 256, "weight_mb": 0.1, "ms_per_eval": 0.378, "tflops": 0.02, "gflops_per_mb": 177.7}, + ... + ], + "inmem_peak": [ + {"depth": 128, "channels": 512, "spatial": 64, "weight_mb": 64.0, "gflops": 4.29, "ms_per_eval": 0.385, "tflops": 11.14}, + ... + ], + "training_cpu_classifier": { + "ms_per_step": 72.4, + "ane_tflops_sustained": 1.29, + "ane_util_pct": 8.1, + "compile_pct": 79.7 + }, + "training_ane_classifier": { + "ms_per_step": 62.9, + "ane_tflops_sustained": 1.68, + "ane_util_pct": 10.6, + "compile_pct": 84.5 + } + }, + "summary": { + "peak_tflops": 11.14, + "sram_spill_start_channels": 4096, + "training_ms_per_step_cpu": 72.4, + "training_ms_per_step_ane": 62.9, + "training_ane_tflops": 1.68, + "training_ane_util_pct": 10.6 + } +} +``` + +## What We're Measuring + +| Benchmark | What it tells us | +|-----------|-----------------| +| **sram_probe** | ANE SRAM capacity -- where weight spilling starts | +| **inmem_peak** | Maximum achievable TFLOPS via programmatic MIL | +| **training (CPU cls)** | End-to-end training perf with CPU classifier | +| **training (ANE cls)** | End-to-end training perf with ANE-offloaded classifier | + +Key metrics to compare across chips: +- **Peak TFLOPS**: raw ANE compute capability +- **SRAM spill point**: determines max efficient kernel size +- **Training ms/step**: real-world training performance +- **ANE utilization 
%**: how much of peak we actually use diff --git a/community_benchmarks/apple_m4_max_20260303.json b/community_benchmarks/apple_m4_max_20260303.json new file mode 100644 index 0000000..34f6da0 --- /dev/null +++ b/community_benchmarks/apple_m4_max_20260303.json @@ -0,0 +1,67 @@ +{ + "schema_version": 1, + "timestamp": "2026-03-03T11:46:08Z", + "system": { + "chip": "Apple M4 Max", + "machine": "Mac16,5", + "macos_version": "26.2", + "macos_build": "25C56", + "cpu_cores": 16, + "memory_gb": 128, + "neural_engine_cores": "16" + }, + "benchmarks": { + "sram_probe": [ + {"channels": 256, "weight_mb": 0.1, "ms_per_eval": 0.378, "tflops": 0.02, "gflops_per_mb": 177.7}, + {"channels": 512, "weight_mb": 0.5, "ms_per_eval": 0.431, "tflops": 0.08, "gflops_per_mb": 155.6}, + {"channels": 1024, "weight_mb": 2.0, "ms_per_eval": 0.411, "tflops": 0.33, "gflops_per_mb": 163.5}, + {"channels": 1536, "weight_mb": 4.5, "ms_per_eval": 0.493, "tflops": 0.61, "gflops_per_mb": 136.1}, + {"channels": 2048, "weight_mb": 8.0, "ms_per_eval": 0.410, "tflops": 1.31, "gflops_per_mb": 163.9}, + {"channels": 2560, "weight_mb": 12.5, "ms_per_eval": 0.237, "tflops": 3.53, "gflops_per_mb": 282.6}, + {"channels": 3072, "weight_mb": 18.0, "ms_per_eval": 0.335, "tflops": 3.60, "gflops_per_mb": 200.1}, + {"channels": 3584, "weight_mb": 24.5, "ms_per_eval": 0.414, "tflops": 3.97, "gflops_per_mb": 162.1}, + {"channels": 4096, "weight_mb": 32.0, "ms_per_eval": 1.134, "tflops": 1.89, "gflops_per_mb": 59.2}, + {"channels": 4608, "weight_mb": 40.5, "ms_per_eval": 0.563, "tflops": 4.83, "gflops_per_mb": 119.2}, + {"channels": 5120, "weight_mb": 50.0, "ms_per_eval": 0.659, "tflops": 5.09, "gflops_per_mb": 101.8}, + {"channels": 6144, "weight_mb": 72.0, "ms_per_eval": 0.844, "tflops": 5.73, "gflops_per_mb": 79.5}, + {"channels": 8192, "weight_mb": 128.0, "ms_per_eval": 4.203, "tflops": 1.02, "gflops_per_mb": 8.0} + ], + "inmem_peak": [ + {"depth": 32, "channels": 512, "spatial": 64, "weight_mb": 16.0, "gflops": 
1.07, "ms_per_eval": 0.408, "tflops": 2.63},
+      {"depth": 48, "channels": 512, "spatial": 64, "weight_mb": 24.0, "gflops": 1.61, "ms_per_eval": 0.262, "tflops": 6.15},
+      {"depth": 64, "channels": 512, "spatial": 64, "weight_mb": 32.0, "gflops": 2.15, "ms_per_eval": 0.244, "tflops": 8.80},
+      {"depth": 96, "channels": 512, "spatial": 64, "weight_mb": 48.0, "gflops": 3.22, "ms_per_eval": 0.326, "tflops": 9.89},
+      {"depth": 128, "channels": 512, "spatial": 64, "weight_mb": 64.0, "gflops": 4.29, "ms_per_eval": 0.385, "tflops": 11.14},
+      {"depth": 64, "channels": 256, "spatial": 64, "weight_mb": 8.0, "gflops": 0.54, "ms_per_eval": 0.365, "tflops": 1.47},
+      {"depth": 128, "channels": 256, "spatial": 64, "weight_mb": 16.0, "gflops": 1.07, "ms_per_eval": 0.454, "tflops": 2.37},
+      {"depth": 256, "channels": 256, "spatial": 64, "weight_mb": 32.0, "gflops": 2.15, "ms_per_eval": 0.351, "tflops": 6.11},
+      {"depth": 64, "channels": 384, "spatial": 64, "weight_mb": 18.0, "gflops": 1.21, "ms_per_eval": 0.429, "tflops": 2.82},
+      {"depth": 128, "channels": 384, "spatial": 64, "weight_mb": 36.0, "gflops": 2.42, "ms_per_eval": 0.354, "tflops": 6.82}
+    ],
+    "training_cpu_classifier": {
+      "ms_per_step": 72.4,
+      "ane_tflops_sustained": 1.29,
+      "total_tflops": 2.41,
+      "ane_util_pct": 8.1,
+      "compile_pct": 79.7,
+      "train_pct": 16.4
+    },
+    "training_ane_classifier": {
+      "ms_per_step": 62.9,
+      "ane_tflops_sustained": 1.68,
+      "total_tflops": 2.77,
+      "ane_util_pct": 10.6,
+      "compile_pct": 84.5,
+      "train_pct": 12.5
+    }
+  },
+  "summary": {
+    "peak_tflops": 11.14,
+    "sram_peak_efficiency_gflops_per_mb": 282.6,
+    "sram_spill_start_channels": 4096,
+    "training_ms_per_step_cpu": 72.4,
+    "training_ms_per_step_ane": 62.9,
+    "training_ane_tflops": 1.68,
+    "training_ane_util_pct": 10.6
+  }
+}
diff --git a/scripts/aggregate_benchmarks.py b/scripts/aggregate_benchmarks.py
new file mode 100644
index 0000000..7908bf4
--- /dev/null
+++ b/scripts/aggregate_benchmarks.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python3
+"""Aggregate community benchmark JSON files into summary tables.
+
+Usage:
+    python3 scripts/aggregate_benchmarks.py [community_benchmarks/]
+
+Reads all .json files from the given directory (default: community_benchmarks/)
+and produces:
+  1. A markdown summary table to stdout
+  2. A combined JSON file at community_benchmarks/SUMMARY.json
+"""
+
+import json
+import os
+import sys
+from pathlib import Path
+
+def load_submissions(directory):  # returns the schema-v1 submission dicts found in `directory`, sorted by filename
+    submissions = []
+    for f in sorted(Path(directory).glob("*.json")):
+        if f.name == "SUMMARY.json":  # our own output file, not a submission
+            continue
+        try:
+            with open(f, encoding="utf-8") as fh:
+                data = json.load(fh)
+            if not isinstance(data, dict) or data.get("schema_version") != 1:  # a top-level list/scalar has no .get()
+                print(f"  SKIP {f.name}: unknown schema_version", file=sys.stderr)
+                continue
+            data["_filename"] = f.name
+            submissions.append(data)
+        except (OSError, ValueError, KeyError) as e:  # ValueError covers JSONDecodeError and UnicodeDecodeError
+            print(f"  SKIP {f.name}: {e}", file=sys.stderr)
+    return submissions
+
+def format_table(submissions):  # returns one markdown string: overview table + per-channel SRAM probe table
+    lines = []
+    lines.append("# ANE Community Benchmark Results\n")
+    lines.append(f"Total submissions: {len(submissions)}\n")
+
+    header = (
+        "| Chip | Machine | macOS | Memory | "
+        "Peak TFLOPS | SRAM Spill (ch) | "
+        "Train ms/step (CPU) | Train ms/step (ANE) | "
+        "ANE TFLOPS | ANE Util % | Date |"
+    )
+    sep = "|" + "|".join(["---"] * 11) + "|"
+    lines.append(header)
+    lines.append(sep)
+
+    for s in submissions:
+        sys_info = s.get("system", {})
+        summary = s.get("summary", {})
+
+        def fmt(v, suffix=""):  # missing -> "-", floats -> 2 decimals, suffix always appended
+            if v is None:
+                return "-"
+            if isinstance(v, float):
+                return f"{v:.2f}{suffix}"
+            return f"{v}{suffix}"
+
+        row = "| {} | {} | {} | {} GB | {} | {} | {} | {} | {} | {} | {} |".format(
+            sys_info.get("chip", "?"),
+            sys_info.get("machine", "?"),
+            sys_info.get("macos_version", "?"),
+            sys_info.get("memory_gb", "?"),
+            fmt(summary.get("peak_tflops")),
+            summary.get("sram_spill_start_channels") or "-",  # 0 means "no spill detected"
+            fmt(summary.get("training_ms_per_step_cpu")),
+            fmt(summary.get("training_ms_per_step_ane")),
+            fmt(summary.get("training_ane_tflops")),
+            fmt(summary.get("training_ane_util_pct"), "%"),
+            str(s.get("timestamp") or "?")[:10],  # date part only; tolerates a null/missing timestamp
+        )
+        lines.append(row)
+
+    lines.append("")
+
+    if submissions:
+        lines.append("## SRAM Probe Comparison\n")
+        all_channels = set()
+        for s in submissions:
+            for probe in s.get("benchmarks", {}).get("sram_probe", []):
+                all_channels.add(probe["channels"])
+        all_channels = sorted(all_channels)
+
+        if all_channels:
+            header_cols = ["Channels (W MB)"] + [
+                s.get("system", {}).get("chip", "?").replace("Apple ", "")
+                for s in submissions
+            ]
+            lines.append("| " + " | ".join(header_cols) + " |")
+            lines.append("|" + "|".join(["---"] * len(header_cols)) + "|")
+
+            for ch in all_channels:
+                row_parts = []
+                weight_mb = None  # taken from the first submission that measured this channel count
+                for s in submissions:
+                    probe_data = {p["channels"]: p for p in s.get("benchmarks", {}).get("sram_probe", [])}
+                    if ch in probe_data:
+                        p = probe_data[ch]
+                        if weight_mb is None:
+                            weight_mb = p["weight_mb"]
+                        row_parts.append(f"{p['tflops']:.2f} TFLOPS ({p['ms_per_eval']:.3f} ms)")
+                    else:
+                        row_parts.append("-")
+
+                ch_label = f"{ch} ({weight_mb:.1f} MB)" if weight_mb is not None else str(ch)
+                lines.append("| " + ch_label + " | " + " | ".join(row_parts) + " |")
+            lines.append("")
+
+    return "\n".join(lines)
+
+def main():  # CLI entry point: prints the markdown table and writes <directory>/SUMMARY.json
+    directory = sys.argv[1] if len(sys.argv) > 1 else "community_benchmarks"
+
+    if not os.path.isdir(directory):
+        print(f"Directory not found: {directory}", file=sys.stderr)
+        print("Run the community benchmark first:", file=sys.stderr)
+        print("  bash scripts/run_community_benchmark.sh", file=sys.stderr)
+        sys.exit(1)
+
+    submissions = load_submissions(directory)
+    if not submissions:
+        print("No valid benchmark submissions found.", file=sys.stderr)
+        sys.exit(1)
+
+    table = format_table(submissions)
+    print(table)
+
+    summary_path = os.path.join(directory, "SUMMARY.json")
+    combined = {
+        "generated": max((str(s.get("timestamp") or "") for s in submissions), default=""),  # newest submission, not the alphabetically first file
+        "count": len(submissions),
+        "submissions": [
+            {
+                "chip": s.get("system", {}).get("chip"),
+                "machine": s.get("system", {}).get("machine"),
+                "macos_version": s.get("system", {}).get("macos_version"),
+                "memory_gb": s.get("system", {}).get("memory_gb"),
+                "summary": s.get("summary", {}),
+                "timestamp": s.get("timestamp"),
+                "filename": s.get("_filename"),
+            }
+            for s in submissions
+        ],
+    }
+    with open(summary_path, "w", encoding="utf-8") as f:
+        json.dump(combined, f, indent=2)
+        f.write("\n")
+    print(f"\nSummary JSON written to: {summary_path}", file=sys.stderr)
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/run_community_benchmark.sh b/scripts/run_community_benchmark.sh
new file mode 100755
index 0000000..3b01e02
--- /dev/null
+++ b/scripts/run_community_benchmark.sh
@@ -0,0 +1,375 @@
+#!/bin/bash
+# run_community_benchmark.sh -- Standardized ANE benchmark for community submissions
+#
+# Runs a focused set of benchmarks and outputs a single JSON file that can be
+# submitted to the community_benchmarks/ directory via PR or GitHub issue.
+#
+# Usage:
+#   bash scripts/run_community_benchmark.sh [--steps N] [--skip-training]
+#
+# Output:
+#   community_benchmarks/<chip>_<date>.json
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+ROOT_DIR="$(cd "$SCRIPT_DIR/.."
&& pwd)"
+TRAINING_DIR="$ROOT_DIR/training"
+
+STEPS=20
+SKIP_TRAINING=false
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --steps) STEPS="${2:?--steps requires a value}"; shift 2 ;;  # clear error instead of "unbound variable"
+        --skip-training) SKIP_TRAINING=true; shift ;;
+        --help|-h)
+            echo "Usage: bash scripts/run_community_benchmark.sh [--steps N] [--skip-training]"
+            echo " --steps N Training steps (default: 20)"
+            echo " --skip-training Skip training benchmarks (useful if no training data)"
+            exit 0 ;;
+        *) echo "Unknown option: $1"; exit 1 ;;
+    esac
+done
+
+# ── Collect system info ──
+
+CHIP=$(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "unknown")
+MACHINE=$(sysctl -n hw.model 2>/dev/null || echo "unknown")
+MACOS_VER=$(sw_vers -productVersion 2>/dev/null || echo "unknown")
+MACOS_BUILD=$(sw_vers -buildVersion 2>/dev/null || echo "unknown")
+NCPU=$(sysctl -n hw.ncpu 2>/dev/null || echo "0")
+MEM_BYTES=$(sysctl -n hw.memsize 2>/dev/null || echo "0")
+MEM_GB=$(( MEM_BYTES / 1073741824 ))  # integer GiB; shell arithmetic, no bc needed
+NEURAL_CORES=$(sysctl -n hw.optional.ane.num_cores 2>/dev/null || echo "unknown")
+DATE_ISO=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+DATE_SHORT=$(date -u +"%Y%m%d")  # UTC, so the filename date always matches the "timestamp" field
+
+CHIP_SLUG=$(echo "$CHIP" | tr ' ' '_' | tr -d '()' | tr '[:upper:]' '[:lower:]')
+
+echo "=== ANE Community Benchmark ==="
+echo "Chip: $CHIP"
+echo "Machine: $MACHINE"
+echo "macOS: $MACOS_VER ($MACOS_BUILD)"
+echo "Memory: ${MEM_GB} GB"
+echo "CPUs: $NCPU"
+echo "ANE cores: $NEURAL_CORES"
+echo ""
+
+# ── Prerequisites ──
+
+if [[ "$(uname)" != "Darwin" ]]; then
+    echo "ERROR: macOS required"; exit 1
+fi
+if ! sysctl -n hw.optional.arm64 2>/dev/null | grep -q 1; then
+    echo "ERROR: Apple Silicon required"; exit 1
+fi
+if ! xcrun --find clang >/dev/null 2>&1; then
+    echo "ERROR: Xcode CLI tools required. Run: xcode-select --install"; exit 1
+fi
+
+CC="xcrun clang"
+CFLAGS="-O2 -fobjc-arc -fstack-protector-strong -framework Foundation -framework CoreML -framework IOSurface -ldl"
+
+# ── Ask for GitHub username (optional) ──
+
+echo "Enter your GitHub username (optional, press Enter to skip):"
+read -r GH_USERNAME || GH_USERNAME=""  # EOF on stdin (CI, piped run) must not abort under set -e
+GH_USERNAME=$(echo "$GH_USERNAME" | tr -d '[:space:]' | sed 's/[^a-zA-Z0-9_-]//g' | cut -c1-39)
+
+if [[ -n "$GH_USERNAME" ]]; then
+    echo "Username: $GH_USERNAME"
+else
+    echo "Submitting anonymously"
+fi
+echo ""
+
+# ── Temp file for collecting JSON fragments ──
+
+TMPJSON=$(mktemp /tmp/ane_bench_XXXXXX)  # BSD/macOS mktemp only randomises trailing X's, so no suffix
+trap 'rm -f "$TMPJSON"' EXIT
+
+# Start building the JSON result
+USERNAME_LINE=""
+if [[ -n "$GH_USERNAME" ]]; then
+    USERNAME_LINE="\"username\": \"$GH_USERNAME\","
+fi
+
+cat > "$TMPJSON" << HEADER
+{
+  "schema_version": 1,
+  $USERNAME_LINE
+  "timestamp": "$DATE_ISO",
+  "system": {
+    "chip": "$CHIP",
+    "machine": "$MACHINE",
+    "macos_version": "$MACOS_VER",
+    "macos_build": "$MACOS_BUILD",
+    "cpu_cores": $NCPU,
+    "memory_gb": $MEM_GB,
+    "neural_engine_cores": "$NEURAL_CORES"
+  },
+HEADER
+
+# ── 1. SRAM Probe ──
+
+echo "--- Running sram_probe ---"
+SRAM_JSON="[]"
+
+# Generate mlpackage models if needed
+if ! ls /tmp/ane_sram_*ch_*sp.mlpackage >/dev/null 2>&1; then
+    echo " Generating mlpackage models..."
+    VENV_PYTHON=""
+    if [[ -x /tmp/ane_venv/bin/python3 ]]; then
+        VENV_PYTHON="/tmp/ane_venv/bin/python3"
+    else
+        for pyver in 3.12 3.13 3.11; do
+            PY="/opt/homebrew/opt/python@${pyver}/bin/python${pyver}"
+            if [[ -x "$PY" ]]; then
+                "$PY" -m venv /tmp/ane_venv && /tmp/ane_venv/bin/pip install -q coremltools numpy 2>/dev/null || echo " WARNING: could not set up coremltools venv"
+                VENV_PYTHON="/tmp/ane_venv/bin/python3"
+                break
+            fi
+        done
+    fi
+    if [[ -n "$VENV_PYTHON" ]]; then
+        "$VENV_PYTHON" "$SCRIPT_DIR/gen_mlpackages.py" 2>/dev/null && echo " mlpackage models generated" || echo " WARNING: mlpackage generation failed"
+    fi
+fi
+
+if ls /tmp/ane_sram_*ch_*sp.mlpackage >/dev/null 2>&1; then
+    cd "$ROOT_DIR"
+    $CC $CFLAGS -o sram_probe sram_probe.m 2>/dev/null || echo " WARNING: sram_probe failed to compile"
+
+    SRAM_OUTPUT=$(./sram_probe 2>&1) || true
+    echo " sram_probe complete"
+
+    SRAM_JSON=$(echo "$SRAM_OUTPUT" | python3 -c "
+import sys, json, re
+results = []
+for line in sys.stdin:
+    line = line.strip()
+    m = re.match(r'\s*(\d+)\s+ch\s+([\d.]+)\s+([\d.]+)\s+ms\s+([\d.]+)\s+([\d.]+)', line)
+    if m:
+        results.append({
+            'channels': int(m.group(1)),
+            'weight_mb': float(m.group(2)),
+            'ms_per_eval': float(m.group(3)),
+            'tflops': float(m.group(4)),
+            'gflops_per_mb': float(m.group(5))
+        })
+print(json.dumps(results))
+" 2>/dev/null || echo "[]")
+else
+    echo " SKIPPED: no mlpackage models"
+fi
+
+# ── 2. InMem Peak ──
+
+echo "--- Running inmem_peak ---"
+PEAK_JSON="[]"
+
+cd "$ROOT_DIR"
+$CC $CFLAGS -o inmem_peak inmem_peak.m 2>/dev/null || echo " WARNING: inmem_peak failed to compile"
+
+PEAK_OUTPUT=$(./inmem_peak 2>&1) || true
+echo " inmem_peak complete"
+
+PEAK_JSON=$(echo "$PEAK_OUTPUT" | python3 -c "
+import sys, json, re
+results = []
+for line in sys.stdin:
+    line = line.strip()
+    m = re.match(r'(\d+)x\s+conv\s+(\d+)ch\s+sp(\d+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+ms\s+([\d.]+)', line)
+    if m:
+        results.append({
+            'depth': int(m.group(1)),
+            'channels': int(m.group(2)),
+            'spatial': int(m.group(3)),
+            'weight_mb': float(m.group(4)),
+            'gflops': float(m.group(5)),
+            'ms_per_eval': float(m.group(6)),
+            'tflops': float(m.group(7))
+        })
+print(json.dumps(results))
+" 2>/dev/null || echo "[]")
+
+# ── 3. Training (optional) ──
+
+echo "--- Running training benchmark ($STEPS steps) ---"
+TRAIN_CPU_JSON="{}"
+TRAIN_ANE_JSON="{}"
+
+if ! $SKIP_TRAINING; then
+    cd "$TRAINING_DIR"
+
+    # Build training binaries
+    make train_large train_large_ane 2>/dev/null || true
+
+    if [[ -x ./train_large ]]; then
+        TRAIN_OUTPUT=$(./train_large --steps "$STEPS" 2>&1) || true
+        echo " train_large complete"
+
+        TRAIN_CPU_JSON=$(echo "$TRAIN_OUTPUT" | python3 -c "
+import sys, json, re
+result = {}
+for line in sys.stdin:
+    line = line.strip()
+    if line.startswith('{\"type\":\"perf\"'):
+        d = json.loads(line)
+        result['ane_tflops'] = d.get('ane_tflops')
+        result['ane_util_pct'] = d.get('ane_util_pct')
+    m = re.match(r'Avg train:\s+([\d.]+)\s+ms/step', line)
+    if m: result['ms_per_step'] = float(m.group(1))
+    m = re.match(r'ANE TFLOPS:\s+([\d.]+)', line)
+    if m: result['ane_tflops_sustained'] = float(m.group(1))
+    m = re.match(r'Total TFLOPS:\s+([\d.]+)', line)
+    if m: result['total_tflops'] = float(m.group(1))
+    m = re.match(r'ANE utilization:\s+([\d.]+)%', line)
+    if m: result['ane_util_pct'] = float(m.group(1))
+    m = re.match(r'Compile time:\s+\d+\s+ms\s+\(([\d.]+)%\)', line)
+    if m: result['compile_pct'] = float(m.group(1))
+    m = re.match(r'Train time:\s+\d+\s+ms\s+\(([\d.]+)%\)', line)
+    if m: result['train_pct'] = float(m.group(1))
+print(json.dumps(result))
+" 2>/dev/null || echo "{}")
+    fi
+
+    if [[ -x ./train_large_ane ]]; then
+        TRAIN_ANE_OUTPUT=$(./train_large_ane --steps "$STEPS" 2>&1) || true
+        echo " train_large_ane complete"
+
+        TRAIN_ANE_JSON=$(echo "$TRAIN_ANE_OUTPUT" | python3 -c "
+import sys, json, re
+result = {}
+for line in sys.stdin:
+    line = line.strip()
+    m = re.match(r'Avg train:\s+([\d.]+)\s+ms/step', line)
+    if m: result['ms_per_step'] = float(m.group(1))
+    m = re.match(r'ANE TFLOPS:\s+([\d.]+)', line)
+    if m: result['ane_tflops_sustained'] = float(m.group(1))
+    m = re.match(r'Total TFLOPS:\s+([\d.]+)', line)
+    if m: result['total_tflops'] = float(m.group(1))
+    m = re.match(r'ANE utilization:\s+([\d.]+)%', line)
+    if m: result['ane_util_pct'] = float(m.group(1))
+    m = re.match(r'Compile time:\s+\d+\s+ms\s+\(([\d.]+)%\)', line)
+    if m: result['compile_pct'] = float(m.group(1))
+    m = re.match(r'Train time:\s+\d+\s+ms\s+\(([\d.]+)%\)', line)
+    if m: result['train_pct'] = float(m.group(1))
+print(json.dumps(result))
+" 2>/dev/null || echo "{}")
+    fi
+else
+    echo " SKIPPED (--skip-training)"
+fi
+
+# ── Assemble final JSON ──
+
+OUTDIR="$ROOT_DIR/community_benchmarks"
+mkdir -p "$OUTDIR"
+OUTFILE="$OUTDIR/${CHIP_SLUG}_${DATE_SHORT}.json"
+if [[ -f "$OUTFILE" ]]; then
+    i=2
+    while [[ -f "${OUTFILE%.json}_${i}.json" ]]; do i=$((i+1)); done
+    OUTFILE="${OUTFILE%.json}_${i}.json"
+fi
+
+python3 -c "
+import json, sys
+
+with open('$TMPJSON') as f:
+    partial = f.read()
+
+sram = json.loads('''$SRAM_JSON''')
+peak = json.loads('''$PEAK_JSON''')
+train_cpu = json.loads('''$TRAIN_CPU_JSON''')
+train_ane = json.loads('''$TRAIN_ANE_JSON''')
+
+peak_tflops = max((r['tflops'] for r in peak), default=0)
+sram_peak_eff = max((r['gflops_per_mb'] for r in sram), default=0)
+sram_spill_ch = 0
+prev_tflops = 0
+for r in sorted(sram, key=lambda x: x['channels']):
+    if prev_tflops > 0 and r['tflops'] < prev_tflops * 0.6:
+        sram_spill_ch = r['channels']
+        break
+    prev_tflops = max(prev_tflops, r['tflops'])
+
+result = json.loads(partial + '\"_\": 0}')
+del result['_']
+
+result['benchmarks'] = {
+    'sram_probe': sram,
+    'inmem_peak': peak,
+    'training_cpu_classifier': train_cpu,
+    'training_ane_classifier': train_ane
+}
+
+result['summary'] = {
+    'peak_tflops': round(peak_tflops, 2),
+    'sram_peak_efficiency_gflops_per_mb': round(sram_peak_eff, 1),
+    'sram_spill_start_channels': sram_spill_ch,
+    'training_ms_per_step_cpu': train_cpu.get('ms_per_step'),
+    'training_ms_per_step_ane': train_ane.get('ms_per_step'),
+    'training_ane_tflops': train_ane.get('ane_tflops_sustained') or train_cpu.get('ane_tflops_sustained'),
+    'training_ane_util_pct': train_ane.get('ane_util_pct') or train_cpu.get('ane_util_pct')
+}
+
+with open('$OUTFILE', 'w') as f:
+    json.dump(result, f, indent=2)
+    f.write('\n')
+
+print(json.dumps(result['summary'], indent=2))
+"
+
+echo ""
+echo "=== Benchmark complete ==="
+echo "Results saved to: $OUTFILE"
+echo ""
+
+# ── Optional: submit to community database ──
+
+DASHBOARD_URL="${ANE_DASHBOARD_URL:-https://web-lac-sigma-61.vercel.app}"
+SUBMIT_URL="$DASHBOARD_URL/api/submit"
+
+echo "Would you like to submit your results to the ANE community benchmark database? (y/N)"
+read -r SUBMIT_ANSWER || SUBMIT_ANSWER=""  # EOF on stdin means "no", not a script failure
+
+if [[ "$SUBMIT_ANSWER" =~ ^[Yy]$ ]]; then
+    echo "Submitting to $SUBMIT_URL ..."
+
+    HTTP_RESPONSE=$(curl -s --max-time 30 -w "\n%{http_code}" \
+        -X POST "$SUBMIT_URL" \
+        -H "Content-Type: application/json" \
+        -d @"$OUTFILE" 2>/dev/null) || true
+
+    HTTP_BODY=$(echo "$HTTP_RESPONSE" | sed '$d')
+    HTTP_CODE=$(echo "$HTTP_RESPONSE" | tail -1)
+
+    case "$HTTP_CODE" in
+        201)
+            SUBMIT_ID=$(echo "$HTTP_BODY" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo "")
+            echo "Submitted successfully! (ID: $SUBMIT_ID)"
+            echo "View results at: $DASHBOARD_URL"
+            ;;
+        409)
+            echo "Already submitted (duplicate detected within the last hour)."
+            echo "View results at: $DASHBOARD_URL"
+            ;;
+        429)
+            echo "Rate limited -- too many submissions. Try again later."
+            echo "You can also submit via GitHub PR instead (see below)."
+            ;;
+        *)
+            echo "Submission failed (HTTP $HTTP_CODE). You can submit manually instead."
+            ;;
+    esac
+    echo ""
+fi
+
+echo "Alternative submission methods:"
+echo " 1. Fork https://github.com/maderix/ANE"
+echo " 2. Add $OUTFILE to your fork"
+echo " 3. Open a Pull Request"
+echo ""
+echo "Or paste the contents of $OUTFILE in a GitHub issue."