mirror of https://github.com/maderix/ANE.git
[feat] Community benchmark system: standardized JSON output, auto-submit to dashboard, aggregation script, M4 Max reference result
This commit is contained in:
parent
517f1e45bb
commit
9832240e72
|
|
@ -0,0 +1,111 @@
|
|||
# ANE Community Benchmarks
|
||||
|
||||
Standardized benchmark results from different Apple Silicon machines, contributed by the community.
|
||||
|
||||
## How to Run
|
||||
|
||||
```bash
|
||||
# Full benchmark (SRAM probe + peak TFLOPS + training)
|
||||
bash scripts/run_community_benchmark.sh
|
||||
|
||||
# Quick benchmark (skip training -- useful if you don't have training data)
|
||||
bash scripts/run_community_benchmark.sh --skip-training
|
||||
|
||||
# Custom training steps
|
||||
bash scripts/run_community_benchmark.sh --steps 50
|
||||
```
|
||||
|
||||
This produces a JSON file in `community_benchmarks/` named `<chip>_<date>.json` (if that file already exists, a numeric suffix such as `_2` is appended).
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- macOS on Apple Silicon (M1/M2/M3/M4/M5)
|
||||
- Xcode command line tools (`xcode-select --install`)
|
||||
- Homebrew Python 3.11-3.13 (the script looks under `/opt/homebrew/opt/python@3.x`); `coremltools` and `numpy` are auto-installed into a temporary venv at `/tmp/ane_venv`
|
||||
- For training benchmarks: run `cd training && make data` first
|
||||
|
||||
## How to Submit
|
||||
|
||||
### Option 1: Pull Request
|
||||
|
||||
1. Fork this repo
|
||||
2. Run the benchmark: `bash scripts/run_community_benchmark.sh`
|
||||
3. Commit the generated JSON file from `community_benchmarks/`
|
||||
4. Open a PR
|
||||
|
||||
### Option 2: GitHub Issue
|
||||
|
||||
1. Run the benchmark
|
||||
2. Open a [new issue](../../issues/new) with title "Benchmark: [Your Chip]"
|
||||
3. Paste the contents of your JSON file
|
||||
|
||||
## Viewing Aggregated Results
|
||||
|
||||
```bash
|
||||
python3 scripts/aggregate_benchmarks.py
|
||||
```
|
||||
|
||||
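This reads all JSON files in `community_benchmarks/` (skipping `SUMMARY.json` and any file whose `schema_version` is not 1), prints a markdown comparison table to stdout, and writes a combined `community_benchmarks/SUMMARY.json`.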
|
||||
|
||||
## JSON Schema (v1)
|
||||
|
||||
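Each submission contains the fields below (abridged: generated files also record `macos_build`, `cpu_cores`, `total_tflops`, `train_pct`, `sram_peak_efficiency_gflops_per_mb`, and an optional `username`):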
|
||||
|
||||
```json
|
||||
{
|
||||
"schema_version": 1,
|
||||
"timestamp": "2026-03-03T12:00:00Z",
|
||||
"system": {
|
||||
"chip": "Apple M4 Max",
|
||||
"machine": "Mac16,5",
|
||||
"macos_version": "26.2",
|
||||
"memory_gb": 128,
|
||||
"neural_engine_cores": "16"
|
||||
},
|
||||
"benchmarks": {
|
||||
"sram_probe": [
|
||||
{"channels": 256, "weight_mb": 0.1, "ms_per_eval": 0.378, "tflops": 0.02, "gflops_per_mb": 177.7},
|
||||
...
|
||||
],
|
||||
"inmem_peak": [
|
||||
{"depth": 128, "channels": 512, "spatial": 64, "weight_mb": 64.0, "gflops": 4.29, "ms_per_eval": 0.385, "tflops": 11.14},
|
||||
...
|
||||
],
|
||||
"training_cpu_classifier": {
|
||||
"ms_per_step": 72.4,
|
||||
"ane_tflops_sustained": 1.29,
|
||||
"ane_util_pct": 8.1,
|
||||
"compile_pct": 79.7
|
||||
},
|
||||
"training_ane_classifier": {
|
||||
"ms_per_step": 62.9,
|
||||
"ane_tflops_sustained": 1.68,
|
||||
"ane_util_pct": 10.6,
|
||||
"compile_pct": 84.5
|
||||
}
|
||||
},
|
||||
"summary": {
|
||||
"peak_tflops": 11.14,
|
||||
"sram_spill_start_channels": 4096,
|
||||
"training_ms_per_step_cpu": 72.4,
|
||||
"training_ms_per_step_ane": 62.9,
|
||||
"training_ane_tflops": 1.68,
|
||||
"training_ane_util_pct": 10.6
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## What We're Measuring
|
||||
|
||||
| Benchmark | What it tells us |
|
||||
|-----------|-----------------|
|
||||
| **sram_probe** | ANE SRAM capacity -- where weight spilling starts |
|
||||
| **inmem_peak** | Maximum achievable TFLOPS via programmatic MIL |
|
||||
| **training (CPU cls)** | End-to-end training perf with CPU classifier |
|
||||
| **training (ANE cls)** | End-to-end training perf with ANE-offloaded classifier |
|
||||
|
||||
Key metrics to compare across chips:
|
||||
- **Peak TFLOPS**: raw ANE compute capability
|
||||
- **SRAM spill point**: determines max efficient kernel size
|
||||
- **Training ms/step**: real-world training performance
|
||||
- **ANE utilization %**: how much of peak we actually use
|
||||
|
|
@ -0,0 +1,67 @@
|
|||
{
|
||||
"schema_version": 1,
|
||||
"timestamp": "2026-03-03T11:46:08Z",
|
||||
"system": {
|
||||
"chip": "Apple M4 Max",
|
||||
"machine": "Mac16,5",
|
||||
"macos_version": "26.2",
|
||||
"macos_build": "25C56",
|
||||
"cpu_cores": 16,
|
||||
"memory_gb": 128,
|
||||
"neural_engine_cores": "16"
|
||||
},
|
||||
"benchmarks": {
|
||||
"sram_probe": [
|
||||
{"channels": 256, "weight_mb": 0.1, "ms_per_eval": 0.378, "tflops": 0.02, "gflops_per_mb": 177.7},
|
||||
{"channels": 512, "weight_mb": 0.5, "ms_per_eval": 0.431, "tflops": 0.08, "gflops_per_mb": 155.6},
|
||||
{"channels": 1024, "weight_mb": 2.0, "ms_per_eval": 0.411, "tflops": 0.33, "gflops_per_mb": 163.5},
|
||||
{"channels": 1536, "weight_mb": 4.5, "ms_per_eval": 0.493, "tflops": 0.61, "gflops_per_mb": 136.1},
|
||||
{"channels": 2048, "weight_mb": 8.0, "ms_per_eval": 0.410, "tflops": 1.31, "gflops_per_mb": 163.9},
|
||||
{"channels": 2560, "weight_mb": 12.5, "ms_per_eval": 0.237, "tflops": 3.53, "gflops_per_mb": 282.6},
|
||||
{"channels": 3072, "weight_mb": 18.0, "ms_per_eval": 0.335, "tflops": 3.60, "gflops_per_mb": 200.1},
|
||||
{"channels": 3584, "weight_mb": 24.5, "ms_per_eval": 0.414, "tflops": 3.97, "gflops_per_mb": 162.1},
|
||||
{"channels": 4096, "weight_mb": 32.0, "ms_per_eval": 1.134, "tflops": 1.89, "gflops_per_mb": 59.2},
|
||||
{"channels": 4608, "weight_mb": 40.5, "ms_per_eval": 0.563, "tflops": 4.83, "gflops_per_mb": 119.2},
|
||||
{"channels": 5120, "weight_mb": 50.0, "ms_per_eval": 0.659, "tflops": 5.09, "gflops_per_mb": 101.8},
|
||||
{"channels": 6144, "weight_mb": 72.0, "ms_per_eval": 0.844, "tflops": 5.73, "gflops_per_mb": 79.5},
|
||||
{"channels": 8192, "weight_mb": 128.0, "ms_per_eval": 4.203, "tflops": 1.02, "gflops_per_mb": 8.0}
|
||||
],
|
||||
"inmem_peak": [
|
||||
{"depth": 32, "channels": 512, "spatial": 64, "weight_mb": 16.0, "gflops": 1.07, "ms_per_eval": 0.408, "tflops": 2.63},
|
||||
{"depth": 48, "channels": 512, "spatial": 64, "weight_mb": 24.0, "gflops": 1.61, "ms_per_eval": 0.262, "tflops": 6.15},
|
||||
{"depth": 64, "channels": 512, "spatial": 64, "weight_mb": 32.0, "gflops": 2.15, "ms_per_eval": 0.244, "tflops": 8.80},
|
||||
{"depth": 96, "channels": 512, "spatial": 64, "weight_mb": 48.0, "gflops": 3.22, "ms_per_eval": 0.326, "tflops": 9.89},
|
||||
{"depth": 128, "channels": 512, "spatial": 64, "weight_mb": 64.0, "gflops": 4.29, "ms_per_eval": 0.385, "tflops": 11.14},
|
||||
{"depth": 64, "channels": 256, "spatial": 64, "weight_mb": 8.0, "gflops": 0.54, "ms_per_eval": 0.365, "tflops": 1.47},
|
||||
{"depth": 128, "channels": 256, "spatial": 64, "weight_mb": 16.0, "gflops": 1.07, "ms_per_eval": 0.454, "tflops": 2.37},
|
||||
{"depth": 256, "channels": 256, "spatial": 64, "weight_mb": 32.0, "gflops": 2.15, "ms_per_eval": 0.351, "tflops": 6.11},
|
||||
{"depth": 64, "channels": 384, "spatial": 64, "weight_mb": 18.0, "gflops": 1.21, "ms_per_eval": 0.429, "tflops": 2.82},
|
||||
{"depth": 128, "channels": 384, "spatial": 64, "weight_mb": 36.0, "gflops": 2.42, "ms_per_eval": 0.354, "tflops": 6.82}
|
||||
],
|
||||
"training_cpu_classifier": {
|
||||
"ms_per_step": 72.4,
|
||||
"ane_tflops_sustained": 1.29,
|
||||
"total_tflops": 2.41,
|
||||
"ane_util_pct": 8.1,
|
||||
"compile_pct": 79.7,
|
||||
"train_pct": 16.4
|
||||
},
|
||||
"training_ane_classifier": {
|
||||
"ms_per_step": 62.9,
|
||||
"ane_tflops_sustained": 1.68,
|
||||
"total_tflops": 2.77,
|
||||
"ane_util_pct": 10.6,
|
||||
"compile_pct": 84.5,
|
||||
"train_pct": 12.5
|
||||
}
|
||||
},
|
||||
"summary": {
|
||||
"peak_tflops": 11.14,
|
||||
"sram_peak_efficiency_gflops_per_mb": 282.6,
|
||||
"sram_spill_start_channels": 4096,
|
||||
"training_ms_per_step_cpu": 72.4,
|
||||
"training_ms_per_step_ane": 62.9,
|
||||
"training_ane_tflops": 1.68,
|
||||
"training_ane_util_pct": 10.6
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,153 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Aggregate community benchmark JSON files into summary tables.
|
||||
|
||||
Usage:
|
||||
python3 scripts/aggregate_benchmarks.py [community_benchmarks/]
|
||||
|
||||
Reads all .json files from the given directory (default: community_benchmarks/)
|
||||
and produces:
|
||||
1. A markdown summary table to stdout
|
||||
2. A combined JSON file at <directory>/SUMMARY.json (community_benchmarks/SUMMARY.json by default)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
def load_submissions(directory):
    """Load every valid schema-v1 benchmark submission from ``directory``.

    Files are visited in sorted filename order. ``SUMMARY.json`` (the file this
    script itself writes) is ignored. A file is skipped, with a note on stderr,
    when it cannot be read, is not valid JSON, does not contain a JSON object
    at the top level, or has a ``schema_version`` other than 1.

    Args:
        directory: Path of the directory holding the ``*.json`` submissions.

    Returns:
        A list of submission dicts. Each dict gains a ``"_filename"`` key with
        the name of the file it was loaded from.
    """
    submissions = []
    for path in sorted(Path(directory).glob("*.json")):
        if path.name == "SUMMARY.json":
            continue

        try:
            with open(path, encoding="utf-8") as fh:
                data = json.load(fh)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f" SKIP {path.name}: {e}", file=sys.stderr)
            continue

        # A community-contributed file may hold any JSON value; only objects
        # can carry the fields read below.
        if not isinstance(data, dict):
            print(f" SKIP {path.name}: top-level JSON value is not an object", file=sys.stderr)
            continue

        if data.get("schema_version") != 1:
            print(f" SKIP {path.name}: unknown schema_version", file=sys.stderr)
            continue

        data["_filename"] = path.name
        submissions.append(data)
    return submissions
|
||||
|
||||
def format_table(submissions):
    """Render the loaded submissions as a markdown report.

    The report has a title, a submission count, one summary row per
    submission, and (when there is at least one submission) an
    "SRAM Probe Comparison" section with one row per probed channel count and
    one column per submission.

    Args:
        submissions: List of submission dicts as returned by
            ``load_submissions``.

    Returns:
        The report as a single newline-joined string.
    """

    def fmt(v, suffix=""):
        # Missing metrics render as "-"; every numeric metric (int or float)
        # is shown with two decimals so that e.g. 10 and 10.6 format alike.
        if v is None:
            return "-"
        if isinstance(v, (int, float)) and not isinstance(v, bool):
            return f"{v:.2f}{suffix}"
        return str(v)

    lines = []
    lines.append("# ANE Community Benchmark Results\n")
    lines.append(f"Total submissions: {len(submissions)}\n")

    header = (
        "| Chip | Machine | macOS | Memory | "
        "Peak TFLOPS | SRAM Spill (ch) | "
        "Train ms/step (CPU) | Train ms/step (ANE) | "
        "ANE TFLOPS | ANE Util % | Date |"
    )
    sep = "|" + "|".join(["---"] * 11) + "|"
    lines.append(header)
    lines.append(sep)

    for s in submissions:
        sys_info = s.get("system") or {}
        summary = s.get("summary") or {}
        # A null or missing timestamp must not crash the slice below.
        timestamp = s.get("timestamp") or "?"

        row = "| {} | {} | {} | {} GB | {} | {} | {} | {} | {} | {} | {} |".format(
            sys_info.get("chip", "?"),
            sys_info.get("machine", "?"),
            sys_info.get("macos_version", "?"),
            sys_info.get("memory_gb", "?"),
            fmt(summary.get("peak_tflops")),
            # 0 / missing both mean "no spill point recorded".
            summary.get("sram_spill_start_channels") or "-",
            fmt(summary.get("training_ms_per_step_cpu")),
            fmt(summary.get("training_ms_per_step_ane")),
            fmt(summary.get("training_ane_tflops")),
            fmt(summary.get("training_ane_util_pct"), "%"),
            str(timestamp)[:10],
        )
        lines.append(row)

    lines.append("")

    if submissions:
        lines.append("## SRAM Probe Comparison\n")

        # Index each submission's probe results by channel count once, rather
        # than rebuilding the mapping for every table row.
        probes_by_submission = [
            {p["channels"]: p for p in ((s.get("benchmarks") or {}).get("sram_probe") or [])}
            for s in submissions
        ]
        all_channels = sorted(set().union(*probes_by_submission))

        if all_channels:
            header_cols = ["Channels (W MB)"] + [
                (s.get("system") or {}).get("chip", "?").replace("Apple ", "")
                for s in submissions
            ]
            lines.append("| " + " | ".join(header_cols) + " |")
            lines.append("|" + "|".join(["---"] * len(header_cols)) + "|")

            for ch in all_channels:
                row_parts = []
                weight_mb = None
                for probes in probes_by_submission:
                    p = probes.get(ch)
                    if p is None:
                        row_parts.append("-")
                        continue
                    # The row label uses the weight size from the first
                    # submission that measured this channel count.
                    if weight_mb is None:
                        weight_mb = p["weight_mb"]
                    row_parts.append(f"{p['tflops']:.2f} TFLOPS ({p['ms_per_eval']:.3f} ms)")

                ch_label = f"{ch} ({weight_mb:.1f} MB)" if weight_mb else str(ch)
                lines.append("| " + ch_label + " | " + " | ".join(row_parts) + " |")
            lines.append("")

    return "\n".join(lines)
|
||||
|
||||
def main():
    """Command-line entry point.

    Takes the submissions directory from the first command-line argument
    (falling back to ``community_benchmarks``), prints the markdown report to
    stdout, and writes ``SUMMARY.json`` into that directory. Exits with
    status 1 when the directory is missing or holds no valid submission.
    """
    directory = "community_benchmarks"
    if len(sys.argv) > 1:
        directory = sys.argv[1]

    if not os.path.isdir(directory):
        for message in (
            f"Directory not found: {directory}",
            "Run the community benchmark first:",
            " bash scripts/run_community_benchmark.sh",
        ):
            print(message, file=sys.stderr)
        sys.exit(1)

    submissions = load_submissions(directory)
    if not submissions:
        print("No valid benchmark submissions found.", file=sys.stderr)
        sys.exit(1)

    print(format_table(submissions))

    # One condensed record per submission; key order matches the file layout
    # consumers already see.
    entries = []
    for s in submissions:
        system = s.get("system", {})
        entries.append({
            "chip": system.get("chip"),
            "machine": system.get("machine"),
            "macos_version": system.get("macos_version"),
            "memory_gb": system.get("memory_gb"),
            "summary": s.get("summary", {}),
            "timestamp": s.get("timestamp"),
            "filename": s.get("_filename"),
        })

    combined = {
        # Taken from the first submission in sorted filename order.
        "generated": submissions[0].get("timestamp", ""),
        "count": len(submissions),
        "submissions": entries,
    }

    summary_path = os.path.join(directory, "SUMMARY.json")
    with open(summary_path, "w") as f:
        f.write(json.dumps(combined, indent=2))
        f.write("\n")
    print(f"\nSummary JSON written to: {summary_path}", file=sys.stderr)


if __name__ == "__main__":
    main()
|
||||
|
|
@ -0,0 +1,375 @@
|
|||
#!/bin/bash
|
||||
# run_community_benchmark.sh -- Standardized ANE benchmark for community submissions
|
||||
#
|
||||
# Runs a focused set of benchmarks and outputs a single JSON file that can be
|
||||
# submitted to the community_benchmarks/ directory via PR or GitHub issue.
|
||||
#
|
||||
# Usage:
|
||||
# bash scripts/run_community_benchmark.sh [--steps N] [--skip-training]
|
||||
#
|
||||
# Output:
|
||||
# community_benchmarks/<chip>_<date>.json
|
||||
|
||||
set -euo pipefail

# Resolve the repository layout relative to this script so the benchmark can
# be launched from any working directory.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
TRAINING_DIR="$ROOT_DIR/training"

# Defaults; both can be overridden on the command line.
STEPS=20
SKIP_TRAINING=false

# Print the command-line help text.
usage() {
    echo "Usage: bash scripts/run_community_benchmark.sh [--steps N] [--skip-training]"
    echo " --steps N Training steps (default: 20)"
    echo " --skip-training Skip training benchmarks (useful if no training data)"
}

while [[ $# -gt 0 ]]; do
    case "$1" in
        --steps)
            # "${2:-}" keeps `set -u` from aborting with an opaque
            # "unbound variable" when the value is missing, and the regex
            # rejects anything that is not a positive integer.
            if [[ ! "${2:-}" =~ ^[1-9][0-9]*$ ]]; then
                echo "ERROR: --steps requires a positive integer" >&2
                usage >&2
                exit 1
            fi
            STEPS="$2"
            shift 2
            ;;
        --skip-training)
            SKIP_TRAINING=true
            shift
            ;;
        --help|-h)
            usage
            exit 0
            ;;
        *)
            echo "Unknown option: $1" >&2
            exit 1
            ;;
    esac
done
|
||||
|
||||
# ── Collect system info ──

# Print the value of sysctl key $1, or the fallback $2 if it cannot be read.
sysctl_value() {
    sysctl -n "$1" 2>/dev/null || echo "$2"
}

# Hardware and OS identification recorded in the result file.
CHIP=$(sysctl_value machdep.cpu.brand_string "unknown")
MACHINE=$(sysctl_value hw.model "unknown")
NCPU=$(sysctl_value hw.ncpu "0")
MEM_BYTES=$(sysctl_value hw.memsize "0")
NEURAL_CORES=$(sysctl_value hw.optional.ane.num_cores "unknown")
MACOS_VER=$(sw_vers -productVersion 2>/dev/null || echo "unknown")
MACOS_BUILD=$(sw_vers -buildVersion 2>/dev/null || echo "unknown")

# Whole gibibytes of RAM (integer division by 2^30).
MEM_GB=$(echo "scale=0; $MEM_BYTES / 1073741824" | bc 2>/dev/null || echo "0")

# UTC timestamp stored in the JSON; local calendar date used in the filename.
DATE_ISO=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
DATE_SHORT=$(date +"%Y%m%d")

# Filename-friendly chip name: spaces become underscores, parentheses are
# dropped, and everything is lower-cased.
CHIP_SLUG=$(echo "$CHIP" | tr ' ' '_' | tr -d '()' | tr '[:upper:]' '[:lower:]')

printf '%s\n' \
    "=== ANE Community Benchmark ===" \
    "Chip: $CHIP" \
    "Machine: $MACHINE" \
    "macOS: $MACOS_VER ($MACOS_BUILD)" \
    "Memory: ${MEM_GB} GB" \
    "CPUs: $NCPU" \
    "ANE cores: $NEURAL_CORES" \
    ""
|
||||
|
||||
# ── Prerequisites ──

# Print the message in $1 and stop the script with exit status 1.
abort() {
    echo "$1"
    exit 1
}

# The benchmarks need macOS, an arm64 CPU, and a working clang from the
# Xcode command line tools.
[[ "$(uname)" == "Darwin" ]] || abort "ERROR: macOS required"
sysctl -n hw.optional.arm64 2>/dev/null | grep -q 1 || abort "ERROR: Apple Silicon required"
xcrun --find clang >/dev/null 2>&1 || abort "ERROR: Xcode CLI tools required. Run: xcode-select --install"

# Compiler command and flags shared by the probe binaries built below.
CFLAGS="-O2 -fobjc-arc -fstack-protector-strong -framework Foundation -framework CoreML -framework IOSurface -ldl"
CC="xcrun clang"
|
||||
|
||||
# ── Ask for GitHub username (optional) ──

echo "Enter your GitHub username (optional, press Enter to skip):"

# `read` returns non-zero at end-of-input (stdin closed, piped, or run from
# CI). Under `set -e` that would abort the whole benchmark, so a failed read
# is treated the same as pressing Enter.
GH_USERNAME=""
read -r GH_USERNAME || true

# Strip whitespace, keep only [a-zA-Z0-9_-], and cap the length at 39
# characters before the value is embedded in the result JSON.
GH_USERNAME=$(echo "$GH_USERNAME" | tr -d '[:space:]' | sed 's/[^a-zA-Z0-9_-]//g' | cut -c1-39)

if [[ -n "$GH_USERNAME" ]]; then
    echo "Username: $GH_USERNAME"
else
    echo "Submitting anonymously"
fi
echo ""
|
||||
|
||||
# ── Temp file for collecting JSON fragments ──

# BSD/macOS mktemp only randomises X's at the very end of the template; with
# a ".json" suffix after them the name is used literally, so it is predictable
# and concurrent runs collide. Keep the X's last.
TMPJSON=$(mktemp /tmp/ane_bench.XXXXXX)
# Single quotes defer expansion until the trap fires; the inner double quotes
# keep the path intact as one argument.
trap 'rm -f "$TMPJSON"' EXIT

# Escape backslashes and double quotes so $1 can sit inside a JSON string.
json_escape() {
    printf '%s' "$1" | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g'
}

# These two are written as bare JSON numbers; fall back to 0 if the probes
# returned anything that is not a plain non-negative integer.
[[ "$NCPU" =~ ^[0-9]+$ ]] || NCPU=0
[[ "$MEM_GB" =~ ^[0-9]+$ ]] || MEM_GB=0

# Start building the JSON result. The optional username line is left empty
# for anonymous runs (GH_USERNAME is already restricted to [a-zA-Z0-9_-]).
USERNAME_LINE=""
if [[ -n "$GH_USERNAME" ]]; then
    USERNAME_LINE="\"username\": \"$GH_USERNAME\","
fi

# The fragment deliberately ends with a trailing comma and no closing brace:
# the final assembly step completes the object before parsing it.
cat > "$TMPJSON" << HEADER
{
  "schema_version": 1,
  $USERNAME_LINE
  "timestamp": "$DATE_ISO",
  "system": {
    "chip": "$(json_escape "$CHIP")",
    "machine": "$(json_escape "$MACHINE")",
    "macos_version": "$(json_escape "$MACOS_VER")",
    "macos_build": "$(json_escape "$MACOS_BUILD")",
    "cpu_cores": $NCPU,
    "memory_gb": $MEM_GB,
    "neural_engine_cores": "$(json_escape "$NEURAL_CORES")"
  },
HEADER
|
||||
|
||||
# ── 1. SRAM Probe ──

echo "--- Running sram_probe ---"
# Stays an empty list if the probe is skipped or produces no parsable rows.
SRAM_JSON="[]"

# Generate mlpackage models if needed
if ! ls /tmp/ane_sram_*ch_*sp.mlpackage >/dev/null 2>&1; then
    echo " Generating mlpackage models..."
    VENV_PYTHON=""
    if [[ -x /tmp/ane_venv/bin/python3 ]]; then
        # Reuse the venv left behind by a previous run.
        VENV_PYTHON="/tmp/ane_venv/bin/python3"
    else
        for pyver in 3.12 3.13 3.11; do
            PY="/opt/homebrew/opt/python@${pyver}/bin/python${pyver}"
            [[ -x "$PY" ]] || continue
            # Tested inside `if` so that a failed venv creation or pip
            # install produces a warning instead of silently killing the
            # script through `set -e`.
            if "$PY" -m venv /tmp/ane_venv \
                && /tmp/ane_venv/bin/pip install -q coremltools numpy 2>/dev/null; then
                VENV_PYTHON="/tmp/ane_venv/bin/python3"
                break
            fi
            echo " WARNING: could not prepare /tmp/ane_venv with $PY" >&2
        done
    fi

    if [[ -n "$VENV_PYTHON" ]]; then
        if "$VENV_PYTHON" "$SCRIPT_DIR/gen_mlpackages.py" 2>/dev/null; then
            echo " mlpackage models generated"
        else
            echo " WARNING: mlpackage generation failed"
        fi
    fi
fi

if ls /tmp/ane_sram_*ch_*sp.mlpackage >/dev/null 2>&1; then
    cd "$ROOT_DIR"
    # Compiler diagnostics are suppressed, so report a failed build
    # explicitly and skip the probe rather than exiting without a message.
    if $CC $CFLAGS -o sram_probe sram_probe.m 2>/dev/null; then
        SRAM_OUTPUT=$(./sram_probe 2>&1) || true
        echo " sram_probe complete"

        # Turn each result row of the probe output into a JSON object; lines
        # that do not match the row layout are ignored.
        SRAM_JSON=$(echo "$SRAM_OUTPUT" | python3 -c '
import sys, json, re

ROW = re.compile(r"\s*(\d+)\s+ch\s+([\d.]+)\s+([\d.]+)\s+ms\s+([\d.]+)\s+([\d.]+)")
results = []
for line in sys.stdin:
    m = ROW.match(line.strip())
    if m:
        results.append({
            "channels": int(m.group(1)),
            "weight_mb": float(m.group(2)),
            "ms_per_eval": float(m.group(3)),
            "tflops": float(m.group(4)),
            "gflops_per_mb": float(m.group(5)),
        })
print(json.dumps(results))
' 2>/dev/null || echo "[]")
    else
        echo " WARNING: failed to compile sram_probe.m; skipping sram_probe" >&2
    fi
else
    echo " SKIPPED: no mlpackage models"
fi
|
||||
|
||||
# ── 2. InMem Peak ──

echo "--- Running inmem_peak ---"
# Stays an empty list if the build fails or no result row can be parsed.
PEAK_JSON="[]"

cd "$ROOT_DIR"
# Compiler diagnostics are suppressed, so report a failed build explicitly
# and continue with an empty result instead of letting `set -e` end the run
# without any message.
if $CC $CFLAGS -o inmem_peak inmem_peak.m 2>/dev/null; then
    PEAK_OUTPUT=$(./inmem_peak 2>&1) || true
    echo " inmem_peak complete"

    # Turn each result row of the benchmark output into a JSON object; lines
    # that do not match the row layout are ignored.
    PEAK_JSON=$(echo "$PEAK_OUTPUT" | python3 -c '
import sys, json, re

ROW = re.compile(
    r"(\d+)x\s+conv\s+(\d+)ch\s+sp(\d+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+ms\s+([\d.]+)"
)
results = []
for line in sys.stdin:
    m = ROW.match(line.strip())
    if m:
        results.append({
            "depth": int(m.group(1)),
            "channels": int(m.group(2)),
            "spatial": int(m.group(3)),
            "weight_mb": float(m.group(4)),
            "gflops": float(m.group(5)),
            "ms_per_eval": float(m.group(6)),
            "tflops": float(m.group(7)),
        })
print(json.dumps(results))
' 2>/dev/null || echo "[]")
else
    echo " WARNING: failed to compile inmem_peak.m; skipping inmem_peak" >&2
fi
|
||||
|
||||
# ── 3. Training (optional) ──

echo "--- Running training benchmark ($STEPS steps) ---"
# Stay empty objects when training is skipped or a binary is unavailable.
TRAIN_CPU_JSON="{}"
TRAIN_ANE_JSON="{}"

# Read training program output on stdin and print one JSON object with the
# metrics found in it. With "--perf-json" as $1, lines starting with
# {"type":"perf" are additionally decoded for ane_tflops / ane_util_pct.
# A line that cannot be decoded is ignored, so one bad line no longer
# discards every other metric. Prints "{}" if the parser itself fails.
parse_training_output() {
    python3 -c '
import sys, json, re

PATTERNS = (
    ("ms_per_step", r"Avg train:\s+([\d.]+)\s+ms/step"),
    ("ane_tflops_sustained", r"ANE TFLOPS:\s+([\d.]+)"),
    ("total_tflops", r"Total TFLOPS:\s+([\d.]+)"),
    ("ane_util_pct", r"ANE utilization:\s+([\d.]+)%"),
    ("compile_pct", r"Compile time:\s+\d+\s+ms\s+\(([\d.]+)%\)"),
    ("train_pct", r"Train time:\s+\d+\s+ms\s+\(([\d.]+)%\)"),
)
want_perf = len(sys.argv) > 1 and sys.argv[1] == "--perf-json"

result = {}
for line in sys.stdin:
    line = line.strip()
    if want_perf and line.startswith("{\"type\":\"perf\""):
        try:
            d = json.loads(line)
        except ValueError:
            d = None
        if isinstance(d, dict):
            result["ane_tflops"] = d.get("ane_tflops")
            result["ane_util_pct"] = d.get("ane_util_pct")
    for key, pattern in PATTERNS:
        m = re.match(pattern, line)
        if m:
            try:
                result[key] = float(m.group(1))
            except ValueError:
                pass  # e.g. a lone "." matched by [\d.]+
print(json.dumps(result))
' "${1:-}" 2>/dev/null || echo "{}"
}

if [[ "$SKIP_TRAINING" != true ]]; then
    cd "$TRAINING_DIR"

    # Build training binaries; a failed build just leaves the corresponding
    # result empty because the -x checks below will not pass.
    make train_large train_large_ane 2>/dev/null || true

    if [[ -x ./train_large ]]; then
        TRAIN_OUTPUT=$(./train_large --steps "$STEPS" 2>&1) || true
        echo " train_large complete"
        TRAIN_CPU_JSON=$(echo "$TRAIN_OUTPUT" | parse_training_output --perf-json)
    fi

    if [[ -x ./train_large_ane ]]; then
        TRAIN_ANE_OUTPUT=$(./train_large_ane --steps "$STEPS" 2>&1) || true
        echo " train_large_ane complete"
        TRAIN_ANE_JSON=$(echo "$TRAIN_ANE_OUTPUT" | parse_training_output)
    fi
else
    echo " SKIPPED (--skip-training)"
fi
|
||||
|
||||
# ── Assemble final JSON ──

OUTDIR="$ROOT_DIR/community_benchmarks"
mkdir -p "$OUTDIR"
OUTFILE="$OUTDIR/${CHIP_SLUG}_${DATE_SHORT}.json"
# Never overwrite an earlier result from the same chip and day: append _2,
# _3, ... until an unused name is found.
if [[ -f "$OUTFILE" ]]; then
    i=2
    while [[ -f "${OUTFILE%.json}_${i}.json" ]]; do i=$((i+1)); done
    OUTFILE="${OUTFILE%.json}_${i}.json"
fi

# The JSON fragments and file paths are handed to Python through the
# environment instead of being spliced into the program text, so quotes or
# backslashes in them cannot break (or inject into) the Python source.
TMPJSON="$TMPJSON" OUTFILE="$OUTFILE" \
SRAM_JSON="$SRAM_JSON" PEAK_JSON="$PEAK_JSON" \
TRAIN_CPU_JSON="$TRAIN_CPU_JSON" TRAIN_ANE_JSON="$TRAIN_ANE_JSON" \
python3 - <<'PY'
import json
import os

env = os.environ

with open(env["TMPJSON"]) as f:
    partial = f.read()

sram = json.loads(env["SRAM_JSON"])
peak = json.loads(env["PEAK_JSON"])
train_cpu = json.loads(env["TRAIN_CPU_JSON"])
train_ane = json.loads(env["TRAIN_ANE_JSON"])


def first_present(*values):
    """Return the first value that is not None (0.0 is a valid measurement)."""
    for value in values:
        if value is not None:
            return value
    return None


peak_tflops = max((r["tflops"] for r in peak), default=0)
sram_peak_eff = max((r["gflops_per_mb"] for r in sram), default=0)

# Spill point: the first channel count (ascending) whose throughput falls
# below 60% of the best throughput seen at any smaller channel count.
# 0 means no such drop was observed.
sram_spill_ch = 0
prev_tflops = 0
for r in sorted(sram, key=lambda x: x["channels"]):
    if prev_tflops > 0 and r["tflops"] < prev_tflops * 0.6:
        sram_spill_ch = r["channels"]
        break
    prev_tflops = max(prev_tflops, r["tflops"])

# The header fragment ends with a trailing comma and no closing brace; add a
# throwaway member to make it a complete object, then drop that member.
result = json.loads(partial + '"_": 0}')
del result["_"]

result["benchmarks"] = {
    "sram_probe": sram,
    "inmem_peak": peak,
    "training_cpu_classifier": train_cpu,
    "training_ane_classifier": train_ane,
}

result["summary"] = {
    "peak_tflops": round(peak_tflops, 2),
    "sram_peak_efficiency_gflops_per_mb": round(sram_peak_eff, 1),
    "sram_spill_start_channels": sram_spill_ch,
    "training_ms_per_step_cpu": train_cpu.get("ms_per_step"),
    "training_ms_per_step_ane": train_ane.get("ms_per_step"),
    # Prefer the ANE-classifier run; fall back to the CPU-classifier run.
    "training_ane_tflops": first_present(
        train_ane.get("ane_tflops_sustained"), train_cpu.get("ane_tflops_sustained")
    ),
    "training_ane_util_pct": first_present(
        train_ane.get("ane_util_pct"), train_cpu.get("ane_util_pct")
    ),
}

with open(env["OUTFILE"], "w") as f:
    json.dump(result, f, indent=2)
    f.write("\n")

print(json.dumps(result["summary"], indent=2))
PY

echo ""
echo "=== Benchmark complete ==="
echo "Results saved to: $OUTFILE"
echo ""
|
||||
|
||||
echo ""
|
||||
echo "=== Benchmark complete ==="
|
||||
echo "Results saved to: $OUTFILE"
|
||||
echo ""
|
||||
|
||||
# ── Optional: submit to community database ──

# The dashboard base URL can be overridden through ANE_DASHBOARD_URL.
DASHBOARD_URL="${ANE_DASHBOARD_URL:-https://web-lac-sigma-61.vercel.app}"
SUBMIT_URL="$DASHBOARD_URL/api/submit"

echo "Would you like to submit your results to the ANE community benchmark database? (y/N)"

# `read` returns non-zero at end-of-input; without the `|| true` a
# non-interactive run would be killed here by `set -e`. No answer means "no".
SUBMIT_ANSWER=""
read -r SUBMIT_ANSWER || true

if [[ "$SUBMIT_ANSWER" =~ ^[Yy]$ ]]; then
    echo "Submitting to $SUBMIT_URL ..."

    # -w appends the HTTP status code on its own final line so body and
    # status can be separated below. --max-time keeps an unresponsive server
    # from hanging the script indefinitely.
    HTTP_RESPONSE=$(curl -s --max-time 30 -w "\n%{http_code}" \
        -X POST "$SUBMIT_URL" \
        -H "Content-Type: application/json" \
        -d @"$OUTFILE" 2>/dev/null) || true

    HTTP_BODY=$(echo "$HTTP_RESPONSE" | sed '$d')
    HTTP_CODE=$(echo "$HTTP_RESPONSE" | tail -1)

    case "$HTTP_CODE" in
        201)
            SUBMIT_ID=$(echo "$HTTP_BODY" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo "")
            echo "Submitted successfully! (ID: $SUBMIT_ID)"
            echo "View results at: $DASHBOARD_URL"
            ;;
        409)
            echo "Already submitted (duplicate detected within the last hour)."
            echo "View results at: $DASHBOARD_URL"
            ;;
        429)
            echo "Rate limited -- too many submissions. Try again later."
            echo "You can also submit via GitHub PR instead (see below)."
            ;;
        *)
            echo "Submission failed (HTTP $HTTP_CODE). You can submit manually instead."
            ;;
    esac
    echo ""
fi

echo "Alternative submission methods:"
echo " 1. Fork https://github.com/maderix/ANE"
echo " 2. Add $OUTFILE to your fork"
echo " 3. Open a Pull Request"
echo ""
echo "Or paste the contents of $OUTFILE in a GitHub issue."
|
||||
Loading…
Reference in New Issue