#!/bin/bash set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" # Load .env if present (LMS_API_KEY, LMS_PORT, LMS_MODEL) if [ -f "$SCRIPT_DIR/.env" ]; then set -a source "$SCRIPT_DIR/.env" set +a fi BINARY="$SCRIPT_DIR/qwen_ane" WEIGHTS="$SCRIPT_DIR/qwen05b.bin" MODEL_DIR="${MODEL_DIR:-$HOME/models/Qwen2.5-0.5B-Instruct}" SOCK="/tmp/qwen_ane_bench.sock" HTTP_PORT=8877 RESULTS_JSON="$SCRIPT_DIR/benchmark_results.json" # --- Prompt suite --- PROMPT_NAMES=( "tiny" "short" "medium" "long" "stress") PROMPTS=( "Hi" "What is 2+2?" "Explain how neural networks work in 3 sentences." "Write a short story about a robot learning to paint. Include dialogue." "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.") MAX_TOKENS=( 10 20 100 200 50) info() { printf "\033[1;34m%s\033[0m\n" "$1"; } dim() { printf "\033[2m%s\033[0m\n" "$1"; } # Extract a numeric or string value from flat JSON. No python needed. # Usage: json_val '{"key":123}' "key" → 123 json_val() { local json="$1" key="$2" echo "$json" | sed -n "s/.*\"$key\"[[:space:]]*:[[:space:]]*\"\{0,1\}\([^,\"}\]*\)\"\{0,1\}.*/\1/p" | head -1 } # Extract the "text" field which may contain escaped chars and commas. # Grabs everything between "text":" and the next unescaped quote. json_text() { local json="$1" echo "$json" | sed -n 's/.*"text":"\(.*\)","prompt_tokens".*/\1/p' | sed 's/\\n/ /g; s/\\"//g' } # Truncate a float string to integer: "317.2" → "317" trunc() { echo "${1%%.*}"; } # Average an array of numbers using awk. Handles both ints and floats. # Usage: shell_avg "1.5" "2.3" "3.1" → 2.3 shell_avg() { printf '%s\n' "$@" | awk '{s+=$1; n++} END {if(n>0) printf "%.1f", s/n; else print "0"}'; } shell_avg_int() { printf '%s\n' "$@" | awk '{s+=$1; n++} END {if(n>0) printf "%.0f", s/n; else print "0"}'; } # --- Preflight --- if [ ! -f "$BINARY" ]; then echo "Binary not found: $BINARY" echo "Run setup.sh first: $SCRIPT_DIR/setup.sh" exit 1 fi if [ ! -f "$WEIGHTS" ]; then echo "Weights not found: $WEIGHTS" echo "Run setup.sh first: $SCRIPT_DIR/setup.sh" exit 1 fi # Detect hardware CHIP=$(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "Unknown") MACOS=$(sw_vers -productVersion 2>/dev/null || echo "Unknown") MEM_BYTES=$(sysctl -n hw.memsize 2>/dev/null || echo "0") MEM_GB=$((MEM_BYTES / 1073741824)) echo "" info "=== ANE Inference Benchmark (qwen_ane) ===" echo "Hardware: $CHIP" echo "macOS: $MACOS" echo "Memory: ${MEM_GB} GB" echo "Model: Qwen2.5-0.5B-Instruct (BF16, 494M params)" echo "" # --- Phase 1: Server mode benchmark (HTTP API) --- info "Phase 1: Server mode (persistent ANE kernels via HTTP API)" dim "Starting server on port $HTTP_PORT..." # Start HTTP server in background "$BINARY" "$WEIGHTS" --http "$HTTP_PORT" --model-dir "$MODEL_DIR" > /tmp/qwen_bench_server.log 2>&1 & SERVER_PID=$! cleanup() { kill "$SERVER_PID" 2>/dev/null || true rm -f "$SOCK" /tmp/qwen_bench_server.log } trap cleanup EXIT # Wait for READY for i in $(seq 1 30); do if grep -q "READY" /tmp/qwen_bench_server.log 2>/dev/null; then break fi sleep 1 done if ! grep -q "READY" /tmp/qwen_bench_server.log 2>/dev/null; then echo "Server failed to start. Log:" cat /tmp/qwen_bench_server.log exit 1 fi dim "Server ready (PID $SERVER_PID)" echo "" # Warmup: first request primes any remaining caches dim "Warmup run (discarded)..." curl -s "http://127.0.0.1:$HTTP_PORT/v1/completions" \ -H "Content-Type: application/json" \ -d '{"prompt":"warmup","max_tokens":5}' > /dev/null 2>&1 echo "" # Print table header printf "%-10s %5s %5s %10s %10s %10s %10s %10s %10s\n" \ "Prompt" "In" "Out" "Prefill" "Decode" "TTFT" "Infer" "Rndtrip" "Overhead" printf "%-10s %5s %5s %10s %10s %10s %10s %10s %10s\n" \ "" "tok" "tok" "(t/s)" "(t/s)" "(ms)" "(ms)" "(ms)" "(ms)" printf '%.0s─' {1..85}; echo "" # Arrays for averages declare -a P_TPS_ARR D_TPS_ARR INF_MS_ARR TTFT_MS_ARR RT_MS_ARR JSON_ENTRIES="" NUM_PROMPTS=${#PROMPTS[@]} for i in $(seq 0 $((NUM_PROMPTS - 1))); do NAME="${PROMPT_NAMES[$i]}" PROMPT="${PROMPTS[$i]}" MAXTOK="${MAX_TOKENS[$i]}" RT_T0=$(perl -MTime::HiRes=time -e 'printf "%.3f", time') RESP=$(curl -s "http://127.0.0.1:$HTTP_PORT/v1/completions" \ -H "Content-Type: application/json" \ -d "{\"prompt\": \"$PROMPT\", \"max_tokens\": $MAXTOK}" 2>&1) RT_T1=$(perl -MTime::HiRes=time -e 'printf "%.3f", time') RT_MS=$(echo "$RT_T0 $RT_T1" | awk '{printf "%.0f", ($2 - $1) * 1000}') # Parse server JSON with pure shell -- no python P_TOKENS=$(json_val "$RESP" "prompt_tokens") G_TOKENS=$(json_val "$RESP" "gen_tokens") P_TPS=$(json_val "$RESP" "prefill_tps") D_TPS=$(json_val "$RESP" "decode_tps") TTFT_MS=$(trunc "$(json_val "$RESP" "ttft_ms")") INF_MS=$(trunc "$(json_val "$RESP" "inference_ms")") TOTAL_MS=$(trunc "$(json_val "$RESP" "total_ms")") TEXT=$(json_text "$RESP") OVERHEAD=$((RT_MS - TOTAL_MS)) printf "%-10s %5s %5s %10s %10s %10s %10s %10s %10s\n" \ "$NAME" "$P_TOKENS" "$G_TOKENS" "$P_TPS" "$D_TPS" "$TTFT_MS" "$INF_MS" "$RT_MS" "$OVERHEAD" P_TPS_ARR+=("$P_TPS") D_TPS_ARR+=("$D_TPS") INF_MS_ARR+=("$INF_MS") TTFT_MS_ARR+=("$TTFT_MS") RT_MS_ARR+=("$RT_MS") # Build JSON entry JSON_ENTRIES="$JSON_ENTRIES{\"name\":\"$NAME\",\"prompt_tokens\":$P_TOKENS,\"gen_tokens\":$G_TOKENS,\"prefill_tps\":$P_TPS,\"decode_tps\":$D_TPS,\"ttft_ms\":$TTFT_MS,\"inference_ms\":$INF_MS,\"roundtrip_ms\":$RT_MS}," # Print response text indented below echo " → $TEXT" echo "" done printf '%.0s─' {1..85}; echo "" # Averages (pure shell, no python) AVG_P=$(shell_avg "${P_TPS_ARR[@]}") AVG_D=$(shell_avg "${D_TPS_ARR[@]}") AVG_INF=$(shell_avg_int "${INF_MS_ARR[@]}") AVG_TTFT=$(shell_avg_int "${TTFT_MS_ARR[@]}") AVG_RT=$(shell_avg_int "${RT_MS_ARR[@]}") AVG_OVERHEAD=$((AVG_RT - AVG_INF)) printf "%-10s %5s %5s %10s %10s %10s %10s %10s %10s\n" "Average" "" "" "$AVG_P" "$AVG_D" "$AVG_TTFT" "$AVG_INF" "$AVG_RT" "$AVG_OVERHEAD" echo "" info "Infer = server-reported (pure processing). Rndtrip = wall-clock (what clients see)." echo "" # --- Phase 2: Cold start measurement --- info "Phase 2: Cold start (single-shot, recompiles ANE kernels)" # Kill server, run single-shot kill "$SERVER_PID" 2>/dev/null || true sleep 1 # Use perl for sub-second timing (available on all macOS, no python) COLD_T0=$(perl -MTime::HiRes=time -e 'printf "%.3f", time') COLD_OUT=$("$BINARY" "$WEIGHTS" "151644 8948 198 2610 525 264 10950 17847 13 151645 198 151644 872 198 13048 151645 198 151644 77091 198" 10 2>&1 || true) COLD_T1=$(perl -MTime::HiRes=time -e 'printf "%.3f", time') COLD_MS=$(echo "$COLD_T0 $COLD_T1" | awk '{printf "%.0f", ($2 - $1) * 1000}') echo "Cold start latency: ${COLD_MS}ms (includes ANE kernel compilation)" echo "" # Re-start server for any additional tests "$BINARY" "$WEIGHTS" --http "$HTTP_PORT" --model-dir "$MODEL_DIR" > /tmp/qwen_bench_server.log 2>&1 & SERVER_PID=$! # --- Phase 3: Repeated prompt (consistency check) --- info "Phase 3: Decode speed consistency (5x same prompt)" for retry in $(seq 1 15); do if grep -q "READY" /tmp/qwen_bench_server.log 2>/dev/null; then break; fi sleep 1 done printf "%-6s %10s %10s %10s\n" "Run" "Prefill" "Decode" "Infer(ms)" printf '%.0s─' {1..40}; echo "" for run in $(seq 1 5); do RESP=$(curl -s "http://127.0.0.1:$HTTP_PORT/v1/completions" \ -H "Content-Type: application/json" \ -d '{"prompt": "Count from 1 to 10", "max_tokens": 50}' 2>&1) P=$(json_val "$RESP" "prefill_tps") D=$(json_val "$RESP" "decode_tps") IM=$(trunc "$(json_val "$RESP" "inference_ms")") printf "%-6s %10s %10s %10s\n" "#$run" "$P" "$D" "$IM" done echo "" # --- Save JSON results --- JSON="{ \"hardware\": \"$CHIP\", \"macos\": \"$MACOS\", \"memory_gb\": $MEM_GB, \"model\": \"Qwen2.5-0.5B-Instruct\", \"mode\": \"http_server\", \"cold_start_ms\": $COLD_MS, \"avg_prefill_tps\": $AVG_P, \"avg_decode_tps\": $AVG_D, \"avg_inference_ms\": $AVG_INF, \"avg_roundtrip_ms\": $AVG_RT, \"avg_ttft_ms\": $AVG_TTFT, \"results\": [${JSON_ENTRIES%,}] }" echo "$JSON" > "$RESULTS_JSON" dim "Results saved to $RESULTS_JSON" echo "" # --- Phase 4: LM Studio comparison (if running) --- LMS_PORT="${LMS_PORT:-1234}" LMS_MODEL="${LMS_MODEL:-qwen2.5-0.5b-instruct}" LMS_API_KEY="${LMS_API_KEY:-}" # Check if LM Studio is running LMS_REACHABLE=0 if curl -s --max-time 2 "http://localhost:$LMS_PORT/api/v1/chat" -H "Content-Type: application/json" -d '{}' >/dev/null 2>&1; then LMS_REACHABLE=1 fi if [ "$LMS_REACHABLE" -eq 1 ]; then info "Phase 4: LM Studio comparison (localhost:$LMS_PORT)" # If no API key, prompt for it if [ -z "$LMS_API_KEY" ]; then echo "" echo " LM Studio requires an API key." echo " Find it in LM Studio > Developer tab > API key" echo " Or set LMS_API_KEY env var before running." echo "" printf " Enter LM Studio API key (or press Enter to skip): " read -r LMS_API_KEY if [ -z "$LMS_API_KEY" ]; then dim "Skipping LM Studio benchmark." LMS_REACHABLE=0 fi fi fi if [ "$LMS_REACHABLE" -eq 1 ] && [ -n "$LMS_API_KEY" ]; then echo "" printf "%-10s %5s %5s %10s %10s %10s\n" \ "Prompt" "In" "Out" "Decode" "TTFT" "Rndtrip" printf "%-10s %5s %5s %10s %10s %10s\n" \ "" "tok" "tok" "(t/s)" "(ms)" "(ms)" printf '%.0s─' {1..55}; echo "" declare -a LMS_LATENCIES LMS_TPS_ARR LMS_TTFT_ARR LMS_JSON_ENTRIES="" for i in $(seq 0 $((NUM_PROMPTS - 1))); do NAME="${PROMPT_NAMES[$i]}" PROMPT="${PROMPTS[$i]}" T0=$(perl -MTime::HiRes=time -e 'printf "%.3f", time') LMS_RESP=$(curl -s --max-time 120 "http://localhost:$LMS_PORT/api/v1/chat" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer $LMS_API_KEY" \ -d "{\"model\":\"$LMS_MODEL\",\"system_prompt\":\"You are a helpful assistant. Be concise.\",\"input\":\"$PROMPT\"}" 2>&1) T1=$(perl -MTime::HiRes=time -e 'printf "%.3f", time') LMS_MS=$(echo "$T0 $T1" | awk '{printf "%.0f", ($2 - $1) * 1000}') eval "$(echo "$LMS_RESP" | python3 -c " import sys, json try: r = json.load(sys.stdin) text = r.get('output', [{}])[0].get('content', '').replace(chr(10),' ').replace('\"', '') s = r.get('stats', {}) tps = s.get('tokens_per_second', 0) ttft = int(s.get('time_to_first_token_seconds', 0) * 1000) in_tok = s.get('input_tokens', 0) out_tok = s.get('total_output_tokens', 0) print(f'LMS_TEXT=\"{text}\"') print(f'LMS_TPS={tps:.1f}') print(f'LMS_TTFT={ttft}') print(f'LMS_IN={in_tok}') print(f'LMS_OUT={out_tok}') except Exception as e: print(f'LMS_TEXT=\"(parse error)\"') print('LMS_TPS=0') print('LMS_TTFT=0') print('LMS_IN=0') print('LMS_OUT=0') " 2>/dev/null)" printf "%-10s %5s %5s %10s %10s %10s\n" "$NAME" "$LMS_IN" "$LMS_OUT" "$LMS_TPS" "$LMS_TTFT" "$LMS_MS" echo " → $LMS_TEXT" echo "" LMS_LATENCIES+=("$LMS_MS") LMS_TPS_ARR+=("$LMS_TPS") LMS_TTFT_ARR+=("$LMS_TTFT") LMS_JSON_ENTRIES="$LMS_JSON_ENTRIES{\"name\":\"$NAME\",\"latency_ms\":$LMS_MS,\"tps\":$LMS_TPS,\"ttft_ms\":$LMS_TTFT,\"input_tokens\":$LMS_IN,\"output_tokens\":$LMS_OUT}," done printf '%.0s─' {1..55}; echo "" # Averages (awk, no python) LMS_AVG_LAT=$(shell_avg_int "${LMS_LATENCIES[@]}") LMS_AVG_TPS=$(shell_avg "${LMS_TPS_ARR[@]}") LMS_AVG_TTFT=$(shell_avg_int "${LMS_TTFT_ARR[@]}") printf "%-10s %5s %5s %10s %10s %10s\n" "Average" "" "" "$LMS_AVG_TPS" "$LMS_AVG_TTFT" "$LMS_AVG_LAT" echo "" # Side-by-side comparison info "=== Side-by-Side Comparison ===" dim "(Round-trip = wall-clock from client, apples-to-apples)" echo "" printf "%-24s %15s %15s\n" "" "ANE (qwen_ane)" "LM Studio" printf '%.0s─' {1..56}; echo "" printf "%-24s %12s t/s %12s t/s\n" "Decode speed" "$AVG_D" "$LMS_AVG_TPS" printf "%-24s %12s t/s %12s\n" "Prefill speed" "$AVG_P" "N/A" printf "%-24s %12s ms %12s ms\n" "TTFT" "$AVG_TTFT" "$LMS_AVG_TTFT" printf "%-24s %12s ms %12s ms\n" "Avg round-trip" "$AVG_RT" "$LMS_AVG_LAT" printf "%-24s %12s ms %12s ms\n" " (server-only)" "$AVG_INF" "N/A" printf "%-24s %12s ms %12s\n" "Cold start" "$COLD_MS" "N/A" printf "%-24s %15s %15s\n" "Precision" "F32 (from BF16)" "GGUF quantized" printf "%-24s %15s %15s\n" "Accelerator" "Neural Engine" "CPU/GPU" printf "%-24s %15s %15s\n" "Timing method" "Wall-clock" "Wall-clock" echo "" # Append LM Studio block to JSON results (pure shell, no python) # Remove trailing "}" and newline, append lm_studio object LMS_JSON_BLOCK=", \"lm_studio\": { \"port\": $LMS_PORT, \"model\": \"$LMS_MODEL\", \"avg_latency_ms\": $LMS_AVG_LAT, \"avg_tps\": $LMS_AVG_TPS, \"avg_ttft_ms\": $LMS_AVG_TTFT, \"results\": [${LMS_JSON_ENTRIES%,}] } }" # Replace the final "}" with the LM Studio block sed -i '' '$ s/}$//' "$RESULTS_JSON" printf '%s\n' "$LMS_JSON_BLOCK" >> "$RESULTS_JSON" dim "LM Studio results added to $RESULTS_JSON" else info "=== LM Studio Comparison ===" echo "" if [ "$LMS_REACHABLE" -eq 0 ]; then echo " LM Studio server not detected on localhost:$LMS_PORT" echo "" echo " To enable automatic comparison:" echo " 1. Open LM Studio, download Qwen2.5-0.5B-Instruct (GGUF)" echo " 2. Load the model, go to Developer tab > Start Server" echo " 3. Re-run this benchmark" echo "" echo " Or set env vars: LMS_PORT=1234 LMS_API_KEY=your-key ./benchmark.sh" fi echo "" echo " Manual test:" echo " curl http://localhost:1234/api/v1/chat \\" echo " -H 'Content-Type: application/json' \\" echo " -H 'Authorization: Bearer YOUR_API_KEY' \\" echo " -d '{\"model\":\"qwen2.5-0.5b-instruct\",\"system_prompt\":\"You are a helpful assistant.\",\"input\":\"What is 2+2?\"}'" echo "" echo " ANE (this benchmark): prefill=${AVG_P} t/s, decode=${AVG_D} t/s, inference=${AVG_INF}ms" echo "" echo " Note: LM Studio uses quantized GGUF (CPU/GPU) while we use" echo " BF16 weights (full precision) running on the Neural Engine." fi echo ""