mirror of https://github.com/maderix/ANE.git
394 lines
15 KiB
Bash
Executable File
394 lines
15 KiB
Bash
Executable File
#!/bin/bash
# ANE inference benchmark for the qwen_ane binary.
#
# Starts the binary as a local HTTP server, times a fixed prompt suite,
# measures a single-shot cold start, repeats one prompt for consistency, and
# optionally compares against an LM Studio server on localhost.
#
# Optional environment (may also be provided by a .env file next to this
# script): MODEL_DIR, LMS_API_KEY, LMS_PORT, LMS_MODEL.
set -euo pipefail

# Directory containing this script; bundled artifacts are resolved from it.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

# Load .env if present (LMS_API_KEY, LMS_PORT, LMS_MODEL).
# set -a exports every variable assigned while the file is being sourced.
if [ -f "$SCRIPT_DIR/.env" ]; then
    set -a
    # shellcheck source=/dev/null
    source "$SCRIPT_DIR/.env"
    set +a
fi

BINARY="$SCRIPT_DIR/qwen_ane"          # inference executable
WEIGHTS="$SCRIPT_DIR/qwen05b.bin"      # weights file passed as first argument
MODEL_DIR="${MODEL_DIR:-$HOME/models/Qwen2.5-0.5B-Instruct}"  # passed via --model-dir
SOCK="/tmp/qwen_ane_bench.sock"        # only referenced by cleanup (removed on exit)
HTTP_PORT=8877                         # port the benchmark server listens on
RESULTS_JSON="$SCRIPT_DIR/benchmark_results.json"

# --- Prompt suite ---
# Three parallel arrays indexed together: label, prompt text, and the
# max_tokens value sent with that prompt.
PROMPT_NAMES=(
    "tiny"
    "short"
    "medium"
    "long"
    "stress"
)
PROMPTS=(
    "Hi"
    "What is 2+2?"
    "Explain how neural networks work in 3 sentences."
    "Write a short story about a robot learning to paint. Include dialogue."
    "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog."
)
MAX_TOKENS=(
    10
    20
    100
    200
    50
)

# Print $1 wrapped in the ANSI SGR sequence 1;34 (bold, blue), then reset.
info() { printf "\033[1;34m%s\033[0m\n" "$1"; }

# Print $1 wrapped in the ANSI SGR sequence 2 (faint), then reset.
dim() { printf "\033[2m%s\033[0m\n" "$1"; }

# Extract a numeric or string value from flat JSON. No python needed.
# Usage: json_val '{"key":123}' "key"  → 123
#
# Arguments: $1 - JSON text, $2 - key name (inserted into the sed pattern
#            unescaped, so it must not contain regex metacharacters).
# Outputs:   the value with any surrounding quotes removed; nothing when the
#            key is absent. The value ends at the first comma, quote, closing
#            brace or backslash, so this only suits simple scalar values.
json_val() {
    local json="$1" key="$2"
    # printf rather than echo so a payload such as "-n" is never taken as an
    # option; head keeps only the first matching line of multi-line input.
    printf '%s\n' "$json" \
        | sed -n "s/.*\"$key\"[[:space:]]*:[[:space:]]*\"\{0,1\}\([^,\"}\]*\)\"\{0,1\}.*/\1/p" \
        | head -n 1
}

# Extract the "text" field, which may contain escaped chars and commas.
# Captures everything between the opening quote of "text" and the quote that
# immediately precedes ,"prompt_tokens" — so it relies on "prompt_tokens"
# being the key that follows "text" in the response.
#
# Arguments: $1 - JSON text
# Outputs:   the captured text with each \n escape turned into a space and
#            each \" escape removed; nothing when the pattern does not match.
json_text() {
    local json="$1"
    # Optional whitespace around ':' and ',' is tolerated, matching json_val.
    printf '%s\n' "$json" \
        | sed -n 's/.*"text"[[:space:]]*:[[:space:]]*"\(.*\)"[[:space:]]*,[[:space:]]*"prompt_tokens".*/\1/p' \
        | sed 's/\\n/ /g; s/\\"//g'
}

# Truncate a float string to integer: "317.2" → "317"
# Drops everything from the first '.' onward. When nothing is left (empty
# input or a value such as ".5") it prints 0, so the result is always safe to
# use inside $(( )) arithmetic and as a JSON number.
trunc() {
    local whole="${1-}"
    whole="${whole%%.*}"
    printf '%s\n' "${whole:-0}"
}

# Average a list of numbers using awk. Handles both ints and floats.
# Usage: shell_avg "1.5" "2.3" "3.1"  → 2.3
#
# Outputs the mean with one decimal place, or "0" when there is nothing to
# average. Blank arguments are skipped (NF is 0 for an empty line) so that a
# missing measurement, or the single empty line printf emits for an empty
# argument list, is not counted as a zero sample.
# LC_ALL=C pins the decimal separator to '.' regardless of the user's locale.
shell_avg() {
    printf '%s\n' "$@" \
        | LC_ALL=C awk 'NF {s+=$1; n++} END {if(n>0) printf "%.1f", s/n; else print "0"}'
}

# Average a list of numbers and print the mean rounded to a whole number
# (awk "%.0f"), or "0" when there is nothing to average.
# Blank arguments are skipped (NF is 0 for an empty line) so that a missing
# measurement, or the single empty line printf emits for an empty argument
# list, is not counted as a zero sample.
shell_avg_int() {
    printf '%s\n' "$@" \
        | LC_ALL=C awk 'NF {s+=$1; n++} END {if(n>0) printf "%.0f", s/n; else print "0"}'
}

# --- Preflight ---
# Refuse to run without the inference binary and the weights file. The
# messages point the user at setup.sh; diagnostics go to stderr so they are
# not mixed into captured benchmark output.
if [ ! -f "$BINARY" ]; then
    echo "Binary not found: $BINARY" >&2
    echo "Run setup.sh first: $SCRIPT_DIR/setup.sh" >&2
    exit 1
fi
if [ ! -f "$WEIGHTS" ]; then
    echo "Weights not found: $WEIGHTS" >&2
    echo "Run setup.sh first: $SCRIPT_DIR/setup.sh" >&2
    exit 1
fi

# Detect hardware
# Each probe falls back to a placeholder when the tool is missing or fails,
# so the banner still prints under set -e.
CHIP=$(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "Unknown")
MACOS=$(sw_vers -productVersion 2>/dev/null || echo "Unknown")
MEM_BYTES=$(sysctl -n hw.memsize 2>/dev/null || echo "0")
MEM_GB=$((MEM_BYTES / 1073741824))   # 1073741824 = 1024^3 bytes per GiB

echo ""
info "=== ANE Inference Benchmark (qwen_ane) ==="
echo "Hardware: $CHIP"
echo "macOS: $MACOS"
echo "Memory: ${MEM_GB} GB"
echo "Model: Qwen2.5-0.5B-Instruct (BF16, 494M params)"
echo ""

# --- Phase 1: Server mode benchmark (HTTP API) ---
info "Phase 1: Server mode (persistent ANE kernels via HTTP API)"
dim "Starting server on port $HTTP_PORT..."

# Start HTTP server in background. Its stdout and stderr go to a log file
# that the readiness check later greps for "READY".
"$BINARY" "$WEIGHTS" --http "$HTTP_PORT" --model-dir "$MODEL_DIR" > /tmp/qwen_bench_server.log 2>&1 &
SERVER_PID=$!

# Exit handler: stop the background server (if one was started) and remove
# the socket path and the server log.
# Globals: SERVER_PID (read), SOCK (read)
cleanup() {
    # Guarded so the handler cannot trip set -u when no server PID exists.
    if [ -n "${SERVER_PID:-}" ]; then
        # The server may already be gone; that is not an error here.
        kill "$SERVER_PID" 2>/dev/null || true
    fi
    rm -f "$SOCK" /tmp/qwen_bench_server.log
}
trap cleanup EXIT

# Wait for READY
# Poll the server log once per second, for at most 30 attempts, until the
# server has written "READY".
for attempt in $(seq 1 30); do
    if grep -q "READY" /tmp/qwen_bench_server.log 2>/dev/null; then
        break
    fi
    # No point waiting out the full timeout if the server already exited.
    if ! kill -0 "$SERVER_PID" 2>/dev/null; then
        break
    fi
    sleep 1
done

if ! grep -q "READY" /tmp/qwen_bench_server.log 2>/dev/null; then
    echo "Server failed to start. Log:" >&2
    # The log may not exist if the launch itself failed; still exit cleanly.
    cat /tmp/qwen_bench_server.log >&2 || true
    exit 1
fi
dim "Server ready (PID $SERVER_PID)"
echo ""

# Warmup: first request primes any remaining caches
# The response is discarded. A failed warmup is reported but does not abort
# the run; the timed requests that follow will surface a persistent problem.
dim "Warmup run (discarded)..."
curl -s "http://127.0.0.1:$HTTP_PORT/v1/completions" \
    -H "Content-Type: application/json" \
    -d '{"prompt":"warmup","max_tokens":5}' > /dev/null 2>&1 \
    || echo "Warning: warmup request failed" >&2
echo ""

# Print table header
# Two header rows (column names, then units) followed by an 85-character
# rule. The column widths match the per-prompt rows printed afterwards.
printf "%-10s %5s %5s %10s %10s %10s %10s %10s %10s\n" \
    "Prompt" "In" "Out" "Prefill" "Decode" "TTFT" "Infer" "Rndtrip" "Overhead"
printf "%-10s %5s %5s %10s %10s %10s %10s %10s %10s\n" \
    "" "tok" "tok" "(t/s)" "(t/s)" "(ms)" "(ms)" "(ms)" "(ms)"
# '%.0s' prints nothing for each of the 85 arguments, leaving only the '─'.
printf '%.0s─' {1..85}; echo ""

# Arrays for averages
# One entry per prompt; OVERHEAD_ARR backs the Overhead cell of the
# "Average" row so that it is the mean of the Overhead column above it.
declare -a P_TPS_ARR D_TPS_ARR INF_MS_ARR TTFT_MS_ARR RT_MS_ARR OVERHEAD_ARR

JSON_ENTRIES=""
NUM_PROMPTS=${#PROMPTS[@]}

for i in $(seq 0 $((NUM_PROMPTS - 1))); do
    NAME="${PROMPT_NAMES[$i]}"
    PROMPT="${PROMPTS[$i]}"
    MAXTOK="${MAX_TOKENS[$i]}"

    # Wall-clock round trip measured around the whole curl invocation.
    # NOTE: PROMPT is inserted into the JSON body unescaped; the suite's
    # prompts contain no quotes or backslashes.
    RT_T0=$(perl -MTime::HiRes=time -e 'printf "%.3f", time')
    RESP=$(curl -s "http://127.0.0.1:$HTTP_PORT/v1/completions" \
        -H "Content-Type: application/json" \
        -d "{\"prompt\": \"$PROMPT\", \"max_tokens\": $MAXTOK}" 2>&1) || {
        # Keep going so one failed request does not discard the whole run.
        echo "Warning: request for prompt '$NAME' failed" >&2
        RESP=""
    }
    RT_T1=$(perl -MTime::HiRes=time -e 'printf "%.3f", time')
    RT_MS=$(echo "$RT_T0 $RT_T1" | awk '{printf "%.0f", ($2 - $1) * 1000}')

    # Parse server JSON with pure shell -- no python
    P_TOKENS=$(json_val "$RESP" "prompt_tokens")
    G_TOKENS=$(json_val "$RESP" "gen_tokens")
    P_TPS=$(json_val "$RESP" "prefill_tps")
    D_TPS=$(json_val "$RESP" "decode_tps")
    TTFT_MS=$(trunc "$(json_val "$RESP" "ttft_ms")")
    INF_MS=$(trunc "$(json_val "$RESP" "inference_ms")")
    TOTAL_MS=$(trunc "$(json_val "$RESP" "total_ms")")
    TEXT=$(json_text "$RESP")

    # A field missing from the response would otherwise break the arithmetic
    # below and produce invalid JSON; fall back to 0 for every numeric field.
    : "${P_TOKENS:=0}" "${G_TOKENS:=0}" "${P_TPS:=0}" "${D_TPS:=0}"
    : "${TTFT_MS:=0}" "${INF_MS:=0}" "${TOTAL_MS:=0}"

    # Overhead = client-observed round trip minus the server-reported total.
    OVERHEAD=$((RT_MS - TOTAL_MS))

    printf "%-10s %5s %5s %10s %10s %10s %10s %10s %10s\n" \
        "$NAME" "$P_TOKENS" "$G_TOKENS" "$P_TPS" "$D_TPS" "$TTFT_MS" "$INF_MS" "$RT_MS" "$OVERHEAD"

    P_TPS_ARR+=("$P_TPS")
    D_TPS_ARR+=("$D_TPS")
    INF_MS_ARR+=("$INF_MS")
    TTFT_MS_ARR+=("$TTFT_MS")
    RT_MS_ARR+=("$RT_MS")
    OVERHEAD_ARR+=("$OVERHEAD")

    # Build JSON entry (trailing comma is stripped when the file is written)
    JSON_ENTRIES="$JSON_ENTRIES{\"name\":\"$NAME\",\"prompt_tokens\":$P_TOKENS,\"gen_tokens\":$G_TOKENS,\"prefill_tps\":$P_TPS,\"decode_tps\":$D_TPS,\"ttft_ms\":$TTFT_MS,\"inference_ms\":$INF_MS,\"roundtrip_ms\":$RT_MS},"

    # Print response text indented below
    echo " → $TEXT"
    echo ""
done

printf '%.0s─' {1..85}; echo ""

# Averages (pure shell, no python)
AVG_P=$(shell_avg "${P_TPS_ARR[@]}")
AVG_D=$(shell_avg "${D_TPS_ARR[@]}")
AVG_INF=$(shell_avg_int "${INF_MS_ARR[@]}")
AVG_TTFT=$(shell_avg_int "${TTFT_MS_ARR[@]}")
AVG_RT=$(shell_avg_int "${RT_MS_ARR[@]}")
# Mean of the per-row Overhead values (round trip minus server total_ms), so
# the Average row is computed the same way as the rows it summarizes.
AVG_OVERHEAD=$(shell_avg_int "${OVERHEAD_ARR[@]}")
printf "%-10s %5s %5s %10s %10s %10s %10s %10s %10s\n" "Average" "" "" "$AVG_P" "$AVG_D" "$AVG_TTFT" "$AVG_INF" "$AVG_RT" "$AVG_OVERHEAD"
echo ""
info "Infer = server-reported (pure processing). Rndtrip = wall-clock (what clients see)."
echo ""

# --- Phase 2: Cold start measurement ---
info "Phase 2: Cold start (single-shot, recompiles ANE kernels)"

# Kill server, run single-shot
kill "$SERVER_PID" 2>/dev/null || true
sleep 1

# Use perl for sub-second timing (available on all macOS, no python)
# NOTE(review): the quoted argument looks like a pre-tokenized prompt (token
# IDs) and the trailing 10 like a generation limit — confirm against the
# qwen_ane command-line interface.
COLD_T0=$(perl -MTime::HiRes=time -e 'printf "%.3f", time')
COLD_STATUS=0
COLD_OUT=$("$BINARY" "$WEIGHTS" "151644 8948 198 2610 525 264 10950 17847 13 151645 198 151644 872 198 13048 151645 198 151644 77091 198" 10 2>&1) || COLD_STATUS=$?
COLD_T1=$(perl -MTime::HiRes=time -e 'printf "%.3f", time')
COLD_MS=$(echo "$COLD_T0 $COLD_T1" | awk '{printf "%.0f", ($2 - $1) * 1000}')

# A failed run still yields a number, but it does not measure a real cold
# start; say so instead of silently reporting it.
if [ "$COLD_STATUS" -ne 0 ]; then
    echo "Warning: cold-start run exited with status $COLD_STATUS; the latency below is unreliable." >&2
    printf '%s\n' "$COLD_OUT" >&2
fi

echo "Cold start latency: ${COLD_MS}ms (includes ANE kernel compilation)"
echo ""

# Re-start server for any additional tests
# Empty the old log first so a READY line left by the previous server cannot
# be mistaken for the new server's readiness.
: > /tmp/qwen_bench_server.log
"$BINARY" "$WEIGHTS" --http "$HTTP_PORT" --model-dir "$MODEL_DIR" > /tmp/qwen_bench_server.log 2>&1 &
SERVER_PID=$!

# --- Phase 3: Repeated prompt (consistency check) ---
info "Phase 3: Decode speed consistency (5x same prompt)"

# Wait up to 15 attempts, one second apart, for the restarted server.
for retry in $(seq 1 15); do
    if grep -q "READY" /tmp/qwen_bench_server.log 2>/dev/null; then break; fi
    sleep 1
done

# Without a ready server every request below would fail; stop with the log.
if ! grep -q "READY" /tmp/qwen_bench_server.log 2>/dev/null; then
    echo "Server failed to restart. Log:" >&2
    cat /tmp/qwen_bench_server.log >&2 || true
    exit 1
fi

printf "%-6s %10s %10s %10s\n" "Run" "Prefill" "Decode" "Infer(ms)"
printf '%.0s─' {1..40}; echo ""

for run in $(seq 1 5); do
    RESP=$(curl -s "http://127.0.0.1:$HTTP_PORT/v1/completions" \
        -H "Content-Type: application/json" \
        -d '{"prompt": "Count from 1 to 10", "max_tokens": 50}' 2>&1) || {
        # Report the failed run and continue with the remaining ones.
        echo "Warning: consistency run #$run failed" >&2
        RESP=""
    }
    P=$(json_val "$RESP" "prefill_tps")
    D=$(json_val "$RESP" "decode_tps")
    IM=$(trunc "$(json_val "$RESP" "inference_ms")")
    printf "%-6s %10s %10s %10s\n" "#$run" "$P" "$D" "$IM"
done
echo ""

# --- Save JSON results ---
# The file must end with a line containing only "}": the LM Studio phase
# strips that final brace before appending its own object.
# String values are inserted unescaped.
cat > "$RESULTS_JSON" <<EOF
{
  "hardware": "$CHIP",
  "macos": "$MACOS",
  "memory_gb": $MEM_GB,
  "model": "Qwen2.5-0.5B-Instruct",
  "mode": "http_server",
  "cold_start_ms": $COLD_MS,
  "avg_prefill_tps": $AVG_P,
  "avg_decode_tps": $AVG_D,
  "avg_inference_ms": $AVG_INF,
  "avg_roundtrip_ms": $AVG_RT,
  "avg_ttft_ms": $AVG_TTFT,
  "results": [${JSON_ENTRIES%,}]
}
EOF
dim "Results saved to $RESULTS_JSON"
echo ""

# --- Phase 4: LM Studio comparison (if running) ---
LMS_PORT="${LMS_PORT:-1234}"
LMS_MODEL="${LMS_MODEL:-qwen2.5-0.5b-instruct}"
LMS_API_KEY="${LMS_API_KEY:-}"

# Check if LM Studio is running
# curl exits 0 for any HTTP response, so this only tests that something is
# answering on the port within 2 seconds.
LMS_REACHABLE=0
if curl -s --max-time 2 "http://localhost:$LMS_PORT/api/v1/chat" -H "Content-Type: application/json" -d '{}' >/dev/null 2>&1; then
    LMS_REACHABLE=1
fi

if [ "$LMS_REACHABLE" -eq 1 ]; then
    info "Phase 4: LM Studio comparison (localhost:$LMS_PORT)"

    # If no API key, prompt for it
    if [ -z "$LMS_API_KEY" ]; then
        echo ""
        echo " LM Studio requires an API key."
        echo " Find it in LM Studio > Developer tab > API key"
        echo " Or set LMS_API_KEY env var before running."
        echo ""
        printf " Enter LM Studio API key (or press Enter to skip): "
        # read returns non-zero at end of input (e.g. stdin is not a
        # terminal); treat that like pressing Enter instead of letting
        # set -e end the script.
        read -r LMS_API_KEY || true
        if [ -z "$LMS_API_KEY" ]; then
            dim "Skipping LM Studio benchmark."
            LMS_REACHABLE=0
        fi
    fi
fi

# Run the prompt suite against LM Studio when it is reachable and a key is
# available; otherwise print instructions for enabling the comparison.
if [ "$LMS_REACHABLE" -eq 1 ] && [ -n "$LMS_API_KEY" ]; then
    echo ""
    printf "%-10s %5s %5s %10s %10s %10s\n" \
        "Prompt" "In" "Out" "Decode" "TTFT" "Rndtrip"
    printf "%-10s %5s %5s %10s %10s %10s\n" \
        "" "tok" "tok" "(t/s)" "(ms)" "(ms)"
    printf '%.0s─' {1..55}; echo ""

    declare -a LMS_LATENCIES LMS_TPS_ARR LMS_TTFT_ARR
    LMS_JSON_ENTRIES=""

    # Python helper: reads the LM Studio response on stdin and prints shell
    # assignments for eval. The response text is model output, so it is
    # emitted through shlex.quote and the numeric fields are coerced to
    # numbers; nothing from the response can be executed by eval.
    LMS_PARSE_PY='
import json, shlex, sys
try:
    r = json.load(sys.stdin)
    text = r.get("output", [{}])[0].get("content", "").replace(chr(10), " ").replace(chr(34), "")
    s = r.get("stats", {})
    tps = "%.1f" % float(s.get("tokens_per_second", 0))
    ttft = int(s.get("time_to_first_token_seconds", 0) * 1000)
    in_tok = int(s.get("input_tokens", 0))
    out_tok = int(s.get("total_output_tokens", 0))
except Exception:
    text, tps, ttft, in_tok, out_tok = "(parse error)", "0", 0, 0, 0
print("LMS_TEXT=" + shlex.quote(text))
print("LMS_TPS=" + tps)
print("LMS_TTFT=%d" % ttft)
print("LMS_IN=%d" % in_tok)
print("LMS_OUT=%d" % out_tok)
'

    for i in $(seq 0 $((NUM_PROMPTS - 1))); do
        NAME="${PROMPT_NAMES[$i]}"
        PROMPT="${PROMPTS[$i]}"

        # NOTE: PROMPT and LMS_MODEL are inserted into the JSON body
        # unescaped; the suite's prompts contain no quotes or backslashes.
        T0=$(perl -MTime::HiRes=time -e 'printf "%.3f", time')
        LMS_RESP=$(curl -s --max-time 120 "http://localhost:$LMS_PORT/api/v1/chat" \
            -H "Content-Type: application/json" \
            -H "Authorization: Bearer $LMS_API_KEY" \
            -d "{\"model\":\"$LMS_MODEL\",\"system_prompt\":\"You are a helpful assistant. Be concise.\",\"input\":\"$PROMPT\"}" 2>&1) || LMS_RESP=""
        T1=$(perl -MTime::HiRes=time -e 'printf "%.3f", time')
        LMS_MS=$(echo "$T0 $T1" | awk '{printf "%.0f", ($2 - $1) * 1000}')

        # Defaults in case python3 is unavailable and prints nothing; without
        # them set -u would abort on the first use below.
        LMS_TEXT="(parse error)"
        LMS_TPS=0
        LMS_TTFT=0
        LMS_IN=0
        LMS_OUT=0
        eval "$(printf '%s' "$LMS_RESP" | python3 -c "$LMS_PARSE_PY" 2>/dev/null || true)"

        printf "%-10s %5s %5s %10s %10s %10s\n" "$NAME" "$LMS_IN" "$LMS_OUT" "$LMS_TPS" "$LMS_TTFT" "$LMS_MS"
        echo " → $LMS_TEXT"
        echo ""
        LMS_LATENCIES+=("$LMS_MS")
        LMS_TPS_ARR+=("$LMS_TPS")
        LMS_TTFT_ARR+=("$LMS_TTFT")
        LMS_JSON_ENTRIES="$LMS_JSON_ENTRIES{\"name\":\"$NAME\",\"latency_ms\":$LMS_MS,\"tps\":$LMS_TPS,\"ttft_ms\":$LMS_TTFT,\"input_tokens\":$LMS_IN,\"output_tokens\":$LMS_OUT},"
    done

    printf '%.0s─' {1..55}; echo ""

    # Averages (awk, no python)
    LMS_AVG_LAT=$(shell_avg_int "${LMS_LATENCIES[@]}")
    LMS_AVG_TPS=$(shell_avg "${LMS_TPS_ARR[@]}")
    LMS_AVG_TTFT=$(shell_avg_int "${LMS_TTFT_ARR[@]}")
    printf "%-10s %5s %5s %10s %10s %10s\n" "Average" "" "" "$LMS_AVG_TPS" "$LMS_AVG_TTFT" "$LMS_AVG_LAT"
    echo ""

    # Side-by-side comparison
    info "=== Side-by-Side Comparison ==="
    dim "(Round-trip = wall-clock from client, apples-to-apples)"
    echo ""
    printf "%-24s %15s %15s\n" "" "ANE (qwen_ane)" "LM Studio"
    printf '%.0s─' {1..56}; echo ""
    printf "%-24s %12s t/s %12s t/s\n" "Decode speed" "$AVG_D" "$LMS_AVG_TPS"
    printf "%-24s %12s t/s %12s\n" "Prefill speed" "$AVG_P" "N/A"
    printf "%-24s %12s ms %12s ms\n" "TTFT" "$AVG_TTFT" "$LMS_AVG_TTFT"
    printf "%-24s %12s ms %12s ms\n" "Avg round-trip" "$AVG_RT" "$LMS_AVG_LAT"
    printf "%-24s %12s ms %12s ms\n" " (server-only)" "$AVG_INF" "N/A"
    printf "%-24s %12s ms %12s\n" "Cold start" "$COLD_MS" "N/A"
    printf "%-24s %15s %15s\n" "Precision" "F32 (from BF16)" "GGUF quantized"
    printf "%-24s %15s %15s\n" "Accelerator" "Neural Engine" "CPU/GPU"
    printf "%-24s %15s %15s\n" "Timing method" "Wall-clock" "Wall-clock"
    echo ""

    # Append LM Studio block to JSON results (pure shell, no python)
    # Remove the closing "}" from the last line of the results file, then
    # append the lm_studio object followed by a new closing brace.
    # 'sed -i ""' is the BSD/macOS in-place form.
    sed -i '' '$ s/}$//' "$RESULTS_JSON"
    cat >> "$RESULTS_JSON" <<EOF
,
  "lm_studio": {
    "port": $LMS_PORT,
    "model": "$LMS_MODEL",
    "avg_latency_ms": $LMS_AVG_LAT,
    "avg_tps": $LMS_AVG_TPS,
    "avg_ttft_ms": $LMS_AVG_TTFT,
    "results": [${LMS_JSON_ENTRIES%,}]
  }
}
EOF
    dim "LM Studio results added to $RESULTS_JSON"
else
    info "=== LM Studio Comparison ==="
    echo ""
    if [ "$LMS_REACHABLE" -eq 0 ]; then
        echo " LM Studio server not detected on localhost:$LMS_PORT"
        echo ""
        echo " To enable automatic comparison:"
        echo " 1. Open LM Studio, download Qwen2.5-0.5B-Instruct (GGUF)"
        echo " 2. Load the model, go to Developer tab > Start Server"
        echo " 3. Re-run this benchmark"
        echo ""
        echo " Or set env vars: LMS_PORT=1234 LMS_API_KEY=your-key ./benchmark.sh"
    fi
    echo ""
    # The hint uses the configured port and model so it stays correct when
    # LMS_PORT or LMS_MODEL are overridden.
    echo " Manual test:"
    echo " curl http://localhost:$LMS_PORT/api/v1/chat \\"
    echo " -H 'Content-Type: application/json' \\"
    echo " -H 'Authorization: Bearer YOUR_API_KEY' \\"
    echo " -d '{\"model\":\"$LMS_MODEL\",\"system_prompt\":\"You are a helpful assistant.\",\"input\":\"What is 2+2?\"}'"
    echo ""
    echo " ANE (this benchmark): prefill=${AVG_P} t/s, decode=${AVG_D} t/s, inference=${AVG_INF}ms"
    echo ""
    echo " Note: LM Studio uses quantized GGUF (CPU/GPU) while we use"
    echo " BF16 weights (full precision) running on the Neural Engine."
fi
echo ""