#!/bin/bash set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" # Load .env if present (LMS_API_KEY, LMS_PORT, LMS_MODEL) if [ -f "$SCRIPT_DIR/.env" ]; then set -a source "$SCRIPT_DIR/.env" set +a fi BINARY="$SCRIPT_DIR/qwen_ane" WEIGHTS="$SCRIPT_DIR/qwen05b.bin" MODEL_DIR="${MODEL_DIR:-$HOME/models/Qwen2.5-0.5B-Instruct}" SOCK="/tmp/qwen_ane_bench.sock" HTTP_PORT=8877 RESULTS_JSON="$SCRIPT_DIR/benchmark_results.json" # --- Prompt suite --- PROMPT_NAMES=( "tiny" "short" "medium" "long" "stress") PROMPTS=( "Hi" "What is 2+2?" "Explain how neural networks work in 3 sentences." "Write a short story about a robot learning to paint. Include dialogue." "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.") MAX_TOKENS=( 10 20 100 200 50) info() { printf "\033[1;34m%s\033[0m\n" "$1"; } dim() { printf "\033[2m%s\033[0m\n" "$1"; } # Extract a numeric or string value from flat JSON. No python needed. # Usage: json_val '{"key":123}' "key" → 123 json_val() { local json="$1" key="$2" echo "$json" | sed -n "s/.*\"$key\"[[:space:]]*:[[:space:]]*\"\{0,1\}\([^,\"}\]*\)\"\{0,1\}.*/\1/p" | head -1 } # Extract the "text" field which may contain escaped chars and commas. # Grabs everything between "text":" and the next unescaped quote. json_text() { local json="$1" echo "$json" | sed -n 's/.*"text":"\(.*\)","prompt_tokens".*/\1/p' | sed 's/\\n/ /g; s/\\"//g' } # Truncate a float string to integer: "317.2" → "317" trunc() { echo "${1%%.*}"; } # Average an array of numbers using awk. Handles both ints and floats. # Usage: shell_avg "1.5" "2.3" "3.1" → 2.3 shell_avg() { printf '%s\n' "$@" | awk '{s+=$1; n++} END {if(n>0) printf "%.1f", s/n; else print "0"}'; } shell_avg_int() { printf '%s\n' "$@" | awk '{s+=$1; n++} END {if(n>0) printf "%.0f", s/n; else print "0"}'; } # --- Preflight --- if [ ! -f "$BINARY" ]; then echo "Binary not found: $BINARY" echo "Run setup.sh first: $SCRIPT_DIR/setup.sh" exit 1 fi if [ ! -f "$WEIGHTS" ]; then echo "Weights not found: $WEIGHTS" echo "Run setup.sh first: $SCRIPT_DIR/setup.sh" exit 1 fi # Detect hardware CHIP=$(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "Unknown") MACOS=$(sw_vers -productVersion 2>/dev/null || echo "Unknown") MEM_BYTES=$(sysctl -n hw.memsize 2>/dev/null || echo "0") MEM_GB=$((MEM_BYTES / 1073741824)) echo "" info "=== ANE Multi-Format Inference Benchmark ===" echo "Hardware: $CHIP" echo "macOS: $MACOS" echo "Memory: ${MEM_GB} GB" echo "Model: Qwen2.5-0.5B-Instruct (494M params)" echo "" # --- Phase 0: Prepare weight files (F16 + Q8) --- WEIGHTS_F16="$SCRIPT_DIR/qwen05b.bin" WEIGHTS_Q8="$SCRIPT_DIR/qwen05b_q8.bin" WEIGHTS_Q4="$SCRIPT_DIR/qwen05b_q4.bin" CONVERT="$SCRIPT_DIR/convert_weights.py" VENV_DIR="$SCRIPT_DIR/.venv" info "Phase 0: Preparing weight files" if [ ! -f "$WEIGHTS_Q8" ]; then if [ ! -f "$CONVERT" ]; then echo " convert_weights.py not found, skipping Q8 generation." WEIGHTS_Q8="" else dim "Generating Q8 weights (one-time)..." if [ -d "$VENV_DIR" ]; then source "$VENV_DIR/bin/activate" fi python3 "$CONVERT" "$MODEL_DIR" "$WEIGHTS_Q8" --q8 dim "Q8 weights ready: $(du -h "$WEIGHTS_Q8" | cut -f1)" fi else dim "Q8 weights already exist: $(du -h "$WEIGHTS_Q8" | cut -f1)" fi if [ ! -f "$WEIGHTS_Q4" ]; then if [ ! -f "$CONVERT" ]; then echo " convert_weights.py not found, skipping Q4 generation." WEIGHTS_Q4="" else dim "Generating Q4 weights (one-time)..." if [ -d "$VENV_DIR" ]; then source "$VENV_DIR/bin/activate" fi python3 "$CONVERT" "$MODEL_DIR" "$WEIGHTS_Q4" --q4 dim "Q4 weights ready: $(du -h "$WEIGHTS_Q4" | cut -f1)" fi else dim "Q4 weights already exist: $(du -h "$WEIGHTS_Q4" | cut -f1)" fi dim "F16 weights: $(du -h "$WEIGHTS_F16" | cut -f1)" echo "" # ANE weight formats to benchmark # GPU flag: empty for CPU formats, "--gpu" for Metal GPU formats ANE_FMT_NAMES=("F16") ANE_FMT_WEIGHTS=("$WEIGHTS_F16") ANE_FMT_LABELS=("F16→F32 (AMX)") ANE_FMT_GPU=("") if [ -n "$WEIGHTS_Q8" ] && [ -f "$WEIGHTS_Q8" ]; then ANE_FMT_NAMES+=("Q8") ANE_FMT_WEIGHTS+=("$WEIGHTS_Q8") ANE_FMT_LABELS+=("Q8 (NEON dequant)") ANE_FMT_GPU+=("") fi if [ -n "$WEIGHTS_Q4" ] && [ -f "$WEIGHTS_Q4" ]; then ANE_FMT_NAMES+=("Q4_Metal") ANE_FMT_WEIGHTS+=("$WEIGHTS_Q4") ANE_FMT_LABELS+=("Q4 SIMD (Metal GPU)") ANE_FMT_GPU+=("--gpu") ANE_FMT_NAMES+=("Q4_AMX") ANE_FMT_WEIGHTS+=("$WEIGHTS_Q4") ANE_FMT_LABELS+=("Q4→F32 (AMX dequant)") ANE_FMT_GPU+=("") fi NUM_ANE_FMTS=${#ANE_FMT_NAMES[@]} NUM_PROMPTS=${#PROMPTS[@]} # Global cleanup SERVER_PID="" cleanup() { [ -n "$SERVER_PID" ] && kill "$SERVER_PID" 2>/dev/null || true rm -f "$SOCK" /tmp/qwen_bench_server.log } trap cleanup EXIT # Helper: start server with given weight file and optional extra flags, wait for READY start_server() { local wfile="$1" shift local extra_flags="$*" [ -n "$SERVER_PID" ] && kill "$SERVER_PID" 2>/dev/null || true sleep 1 rm -f /tmp/qwen_bench_server.log "$BINARY" "$wfile" --http "$HTTP_PORT" --model-dir "$MODEL_DIR" $extra_flags > /tmp/qwen_bench_server.log 2>&1 & SERVER_PID=$! for _i in $(seq 1 30); do if grep -q "READY" /tmp/qwen_bench_server.log 2>/dev/null; then return 0; fi sleep 1 done echo "Server failed to start with $wfile. Log:" cat /tmp/qwen_bench_server.log return 1 } # --- Phase 1: Multi-format ANE benchmarks --- # Per-format result tracking (indexed by format number) declare -a ALL_AVG_P ALL_AVG_D ALL_AVG_INF ALL_AVG_TTFT ALL_AVG_RT ANE_JSON_BLOCKS="" for fmt_idx in $(seq 0 $((NUM_ANE_FMTS - 1))); do FMT_NAME="${ANE_FMT_NAMES[$fmt_idx]}" FMT_WEIGHTS="${ANE_FMT_WEIGHTS[$fmt_idx]}" FMT_LABEL="${ANE_FMT_LABELS[$fmt_idx]}" FMT_GPU="${ANE_FMT_GPU[$fmt_idx]}" echo "" info "Phase 1.$((fmt_idx+1)): ANE $FMT_NAME benchmark ($FMT_LABEL)" dim "Weights: $(du -h "$FMT_WEIGHTS" | cut -f1) — Starting server..." if ! start_server "$FMT_WEIGHTS" $FMT_GPU; then echo "Skipping $FMT_NAME format." ALL_AVG_P+=("0"); ALL_AVG_D+=("0"); ALL_AVG_INF+=("0") ALL_AVG_TTFT+=("0"); ALL_AVG_RT+=("0") continue fi dim "Warmup run (discarded)..." curl -s "http://127.0.0.1:$HTTP_PORT/v1/completions" \ -H "Content-Type: application/json" \ -d '{"prompt":"warmup","max_tokens":5}' > /dev/null 2>&1 echo "" printf "%-10s %5s %5s %10s %10s %10s %10s %10s %10s\n" \ "Prompt" "In" "Out" "Prefill" "Decode" "TTFT" "Infer" "Rndtrip" "Overhead" printf "%-10s %5s %5s %10s %10s %10s %10s %10s %10s\n" \ "" "tok" "tok" "(t/s)" "(t/s)" "(ms)" "(ms)" "(ms)" "(ms)" printf '%.0s─' {1..85}; echo "" declare -a P_TPS_ARR=() D_TPS_ARR=() INF_MS_ARR=() TTFT_MS_ARR=() RT_MS_ARR=() FMT_JSON_ENTRIES="" for i in $(seq 0 $((NUM_PROMPTS - 1))); do NAME="${PROMPT_NAMES[$i]}" PROMPT="${PROMPTS[$i]}" MAXTOK="${MAX_TOKENS[$i]}" RT_T0=$(perl -MTime::HiRes=time -e 'printf "%.3f", time') RESP=$(curl -s "http://127.0.0.1:$HTTP_PORT/v1/completions" \ -H "Content-Type: application/json" \ -d "{\"prompt\": \"$PROMPT\", \"max_tokens\": $MAXTOK}" 2>&1) RT_T1=$(perl -MTime::HiRes=time -e 'printf "%.3f", time') RT_MS=$(echo "$RT_T0 $RT_T1" | awk '{printf "%.0f", ($2 - $1) * 1000}') P_TOKENS=$(json_val "$RESP" "prompt_tokens") G_TOKENS=$(json_val "$RESP" "gen_tokens") P_TPS=$(json_val "$RESP" "prefill_tps") D_TPS=$(json_val "$RESP" "decode_tps") TTFT_MS=$(trunc "$(json_val "$RESP" "ttft_ms")") INF_MS=$(trunc "$(json_val "$RESP" "inference_ms")") TOTAL_MS=$(trunc "$(json_val "$RESP" "total_ms")") TEXT=$(json_text "$RESP") OVERHEAD=$((RT_MS - TOTAL_MS)) printf "%-10s %5s %5s %10s %10s %10s %10s %10s %10s\n" \ "$NAME" "$P_TOKENS" "$G_TOKENS" "$P_TPS" "$D_TPS" "$TTFT_MS" "$INF_MS" "$RT_MS" "$OVERHEAD" P_TPS_ARR+=("$P_TPS") D_TPS_ARR+=("$D_TPS") INF_MS_ARR+=("$INF_MS") TTFT_MS_ARR+=("$TTFT_MS") RT_MS_ARR+=("$RT_MS") FMT_JSON_ENTRIES="$FMT_JSON_ENTRIES{\"name\":\"$NAME\",\"prompt_tokens\":$P_TOKENS,\"gen_tokens\":$G_TOKENS,\"prefill_tps\":$P_TPS,\"decode_tps\":$D_TPS,\"ttft_ms\":$TTFT_MS,\"inference_ms\":$INF_MS,\"roundtrip_ms\":$RT_MS}," echo " → $TEXT" echo "" done printf '%.0s─' {1..85}; echo "" F_AVG_P=$(shell_avg "${P_TPS_ARR[@]}") F_AVG_D=$(shell_avg "${D_TPS_ARR[@]}") F_AVG_INF=$(shell_avg_int "${INF_MS_ARR[@]}") F_AVG_TTFT=$(shell_avg_int "${TTFT_MS_ARR[@]}") F_AVG_RT=$(shell_avg_int "${RT_MS_ARR[@]}") F_AVG_OVERHEAD=$((F_AVG_RT - F_AVG_INF)) printf "%-10s %5s %5s %10s %10s %10s %10s %10s %10s\n" "Average" "" "" "$F_AVG_P" "$F_AVG_D" "$F_AVG_TTFT" "$F_AVG_INF" "$F_AVG_RT" "$F_AVG_OVERHEAD" echo "" ALL_AVG_P+=("$F_AVG_P") ALL_AVG_D+=("$F_AVG_D") ALL_AVG_INF+=("$F_AVG_INF") ALL_AVG_TTFT+=("$F_AVG_TTFT") ALL_AVG_RT+=("$F_AVG_RT") ANE_JSON_BLOCKS="$ANE_JSON_BLOCKS \"$FMT_NAME\": { \"format\": \"$FMT_NAME\", \"label\": \"$FMT_LABEL\", \"weight_size_mb\": $(du -m "$FMT_WEIGHTS" | cut -f1), \"avg_prefill_tps\": $F_AVG_P, \"avg_decode_tps\": $F_AVG_D, \"avg_inference_ms\": $F_AVG_INF, \"avg_roundtrip_ms\": $F_AVG_RT, \"avg_ttft_ms\": $F_AVG_TTFT, \"results\": [${FMT_JSON_ENTRIES%,}] }," done # Use F16 results as the primary ANE numbers (first format) AVG_P="${ALL_AVG_P[0]}" AVG_D="${ALL_AVG_D[0]}" AVG_INF="${ALL_AVG_INF[0]}" AVG_TTFT="${ALL_AVG_TTFT[0]}" AVG_RT="${ALL_AVG_RT[0]}" info "Infer = server-reported (pure processing). Rndtrip = wall-clock (what clients see)." echo "" # --- Phase 2: Cold start measurement --- info "Phase 2: Cold start (single-shot, recompiles ANE kernels)" kill "$SERVER_PID" 2>/dev/null || true SERVER_PID="" sleep 1 COLD_T0=$(perl -MTime::HiRes=time -e 'printf "%.3f", time') COLD_OUT=$("$BINARY" "$WEIGHTS" "151644 8948 198 2610 525 264 10950 17847 13 151645 198 151644 872 198 13048 151645 198 151644 77091 198" 10 2>&1 || true) COLD_T1=$(perl -MTime::HiRes=time -e 'printf "%.3f", time') COLD_MS=$(echo "$COLD_T0 $COLD_T1" | awk '{printf "%.0f", ($2 - $1) * 1000}') echo "Cold start latency: ${COLD_MS}ms (includes ANE kernel compilation)" echo "" # Re-start server (F16) for consistency check start_server "$WEIGHTS_F16" # --- Phase 3: Repeated prompt (consistency check) --- info "Phase 3: Decode speed consistency (5x same prompt, F16)" printf "%-6s %10s %10s %10s\n" "Run" "Prefill" "Decode" "Infer(ms)" printf '%.0s─' {1..40}; echo "" for run in $(seq 1 5); do RESP=$(curl -s "http://127.0.0.1:$HTTP_PORT/v1/completions" \ -H "Content-Type: application/json" \ -d '{"prompt": "Count from 1 to 10", "max_tokens": 50}' 2>&1) P=$(json_val "$RESP" "prefill_tps") D=$(json_val "$RESP" "decode_tps") IM=$(trunc "$(json_val "$RESP" "inference_ms")") printf "%-6s %10s %10s %10s\n" "#$run" "$P" "$D" "$IM" done echo "" # --- Save JSON results --- JSON="{ \"hardware\": \"$CHIP\", \"macos\": \"$MACOS\", \"memory_gb\": $MEM_GB, \"model\": \"Qwen2.5-0.5B-Instruct\", \"mode\": \"http_server\", \"cold_start_ms\": $COLD_MS, \"ane_formats\": {$( echo "$ANE_JSON_BLOCKS" | sed '$ s/,$//' ) } }" echo "$JSON" > "$RESULTS_JSON" dim "Results saved to $RESULTS_JSON" echo "" # --- Phase 4: LM Studio comparison (if running) --- LMS_PORT="${LMS_PORT:-1234}" LMS_API_KEY="${LMS_API_KEY:-}" # Models to benchmark (override via LMS_MODELS env var, comma-separated) LMS_MODELS_DEFAULT="qwen2.5-0.5b-instruct,qwen2.5-0.5b-instruct-mlx@8bit,qwen2.5-0.5b-instruct-mlx@4bit" IFS=',' read -ra LMS_MODEL_LIST <<< "${LMS_MODELS:-$LMS_MODELS_DEFAULT}" # Check if LM Studio is running LMS_REACHABLE=0 if curl -s --max-time 2 "http://localhost:$LMS_PORT/api/v1/chat" -H "Content-Type: application/json" -d '{}' >/dev/null 2>&1; then LMS_REACHABLE=1 fi if [ "$LMS_REACHABLE" -eq 1 ]; then info "Phase 4: LM Studio comparison (localhost:$LMS_PORT)" dim "Models: ${LMS_MODEL_LIST[*]}" if [ -z "$LMS_API_KEY" ]; then echo "" echo " LM Studio requires an API key." echo " Find it in LM Studio > Developer tab > API key" echo " Or set LMS_API_KEY env var before running." echo "" printf " Enter LM Studio API key (or press Enter to skip): " read -r LMS_API_KEY if [ -z "$LMS_API_KEY" ]; then dim "Skipping LM Studio benchmark." LMS_REACHABLE=0 fi fi fi LMS_ALL_JSON="" if [ "$LMS_REACHABLE" -eq 1 ] && [ -n "$LMS_API_KEY" ]; then # Track the best model for the final comparison table BEST_LMS_MODEL="" BEST_LMS_TPS="0" BEST_LMS_LAT="99999" BEST_LMS_TTFT="0" for LMS_MODEL in "${LMS_MODEL_LIST[@]}"; do echo "" info "── $LMS_MODEL ──" # Test if this model is available TEST_RESP=$(curl -s --max-time 10 "http://localhost:$LMS_PORT/api/v1/chat" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer $LMS_API_KEY" \ -d "{\"model\":\"$LMS_MODEL\",\"system_prompt\":\"test\",\"input\":\"hi\"}" 2>&1) if echo "$TEST_RESP" | grep -qi "error\|not found\|not loaded\|no model"; then dim " Model '$LMS_MODEL' not available, skipping." continue fi printf "%-10s %5s %5s %10s %10s %10s\n" \ "Prompt" "In" "Out" "Decode" "TTFT" "Rndtrip" printf "%-10s %5s %5s %10s %10s %10s\n" \ "" "tok" "tok" "(t/s)" "(ms)" "(ms)" printf '%.0s─' {1..55}; echo "" declare -a LMS_LATENCIES=() LMS_TPS_ARR=() LMS_TTFT_ARR=() LMS_JSON_ENTRIES="" for i in $(seq 0 $((NUM_PROMPTS - 1))); do NAME="${PROMPT_NAMES[$i]}" PROMPT="${PROMPTS[$i]}" T0=$(perl -MTime::HiRes=time -e 'printf "%.3f", time') LMS_RESP=$(curl -s --max-time 120 "http://localhost:$LMS_PORT/api/v1/chat" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer $LMS_API_KEY" \ -d "{\"model\":\"$LMS_MODEL\",\"system_prompt\":\"You are a helpful assistant. Be concise.\",\"input\":\"$PROMPT\"}" 2>&1) T1=$(perl -MTime::HiRes=time -e 'printf "%.3f", time') LMS_MS=$(echo "$T0 $T1" | awk '{printf "%.0f", ($2 - $1) * 1000}') eval "$(echo "$LMS_RESP" | python3 -c " import sys, json try: r = json.load(sys.stdin) text = r.get('output', [{}])[0].get('content', '').replace(chr(10),' ').replace('\"', '') s = r.get('stats', {}) tps = s.get('tokens_per_second', 0) ttft = int(s.get('time_to_first_token_seconds', 0) * 1000) in_tok = s.get('input_tokens', 0) out_tok = s.get('total_output_tokens', 0) print(f'LMS_TEXT=\"{text}\"') print(f'LMS_TPS={tps:.1f}') print(f'LMS_TTFT={ttft}') print(f'LMS_IN={in_tok}') print(f'LMS_OUT={out_tok}') except Exception as e: print(f'LMS_TEXT=\"(parse error)\"') print('LMS_TPS=0') print('LMS_TTFT=0') print('LMS_IN=0') print('LMS_OUT=0') " 2>/dev/null)" printf "%-10s %5s %5s %10s %10s %10s\n" "$NAME" "$LMS_IN" "$LMS_OUT" "$LMS_TPS" "$LMS_TTFT" "$LMS_MS" LMS_LATENCIES+=("$LMS_MS") LMS_TPS_ARR+=("$LMS_TPS") LMS_TTFT_ARR+=("$LMS_TTFT") LMS_JSON_ENTRIES="$LMS_JSON_ENTRIES{\"name\":\"$NAME\",\"latency_ms\":$LMS_MS,\"tps\":$LMS_TPS,\"ttft_ms\":$LMS_TTFT,\"input_tokens\":$LMS_IN,\"output_tokens\":$LMS_OUT}," done printf '%.0s─' {1..55}; echo "" M_AVG_LAT=$(shell_avg_int "${LMS_LATENCIES[@]}") M_AVG_TPS=$(shell_avg "${LMS_TPS_ARR[@]}") M_AVG_TTFT=$(shell_avg_int "${LMS_TTFT_ARR[@]}") printf "%-10s %5s %5s %10s %10s %10s\n" "Average" "" "" "$M_AVG_TPS" "$M_AVG_TTFT" "$M_AVG_LAT" # Track the best model by decode t/s if awk "BEGIN {exit !($M_AVG_TPS > $BEST_LMS_TPS)}" 2>/dev/null; then BEST_LMS_MODEL="$LMS_MODEL" BEST_LMS_TPS="$M_AVG_TPS" BEST_LMS_LAT="$M_AVG_LAT" BEST_LMS_TTFT="$M_AVG_TTFT" fi LMS_ALL_JSON="$LMS_ALL_JSON \"$(echo "$LMS_MODEL" | sed 's/[^a-zA-Z0-9._-]/_/g')\": { \"model\": \"$LMS_MODEL\", \"avg_latency_ms\": $M_AVG_LAT, \"avg_tps\": $M_AVG_TPS, \"avg_ttft_ms\": $M_AVG_TTFT, \"results\": [${LMS_JSON_ENTRIES%,}] }," done echo "" # --- Final Comparison Table: all ANE formats + all LM Studio models --- info "=== Multi-Format Comparison ===" dim "(All times are wall-clock round-trip, apples-to-apples)" echo "" # Collect all column names and data declare -a COL_NAMES=() COL_DECODE=() COL_PREFILL=() COL_TTFT=() COL_RT=() COL_PREC=() COL_ACCEL=() for fi2 in $(seq 0 $((NUM_ANE_FMTS - 1))); do COL_NAMES+=("ANE ${ANE_FMT_NAMES[$fi2]}") COL_DECODE+=("${ALL_AVG_D[$fi2]}") COL_PREFILL+=("${ALL_AVG_P[$fi2]}") COL_TTFT+=("${ALL_AVG_TTFT[$fi2]}") COL_RT+=("${ALL_AVG_RT[$fi2]}") COL_PREC+=("${ANE_FMT_LABELS[$fi2]}") if [ -n "${ANE_FMT_GPU[$fi2]}" ]; then COL_ACCEL+=("Metal GPU") else COL_ACCEL+=("CPU (AMX)") fi done # Add each tested LM Studio model as a column declare -a LMS_TESTED_NAMES=() LMS_TESTED_TPS=() LMS_TESTED_TTFT=() LMS_TESTED_LAT=() for LMS_MODEL in "${LMS_MODEL_LIST[@]}"; do # Check if this model was actually tested (has data in LMS_ALL_JSON) SAFE_KEY=$(echo "$LMS_MODEL" | sed 's/[^a-zA-Z0-9._-]/_/g') if echo "$LMS_ALL_JSON" | grep -q "\"$SAFE_KEY\""; then M_TPS=$(echo "$LMS_ALL_JSON" | sed -n "/\"$SAFE_KEY\"/,/}/p" | sed -n 's/.*"avg_tps":[[:space:]]*\([0-9.]*\).*/\1/p' | head -1) M_TTFT=$(echo "$LMS_ALL_JSON" | sed -n "/\"$SAFE_KEY\"/,/}/p" | sed -n 's/.*"avg_ttft_ms":[[:space:]]*\([0-9]*\).*/\1/p' | head -1) M_LAT=$(echo "$LMS_ALL_JSON" | sed -n "/\"$SAFE_KEY\"/,/}/p" | sed -n 's/.*"avg_latency_ms":[[:space:]]*\([0-9]*\).*/\1/p' | head -1) SHORT_NAME=$(echo "$LMS_MODEL" | sed 's/qwen2.5-0.5b-instruct/q0.5b/; s/-mlx/mlx/') COL_NAMES+=("LMS $SHORT_NAME") COL_DECODE+=("${M_TPS:-0}") COL_PREFILL+=("N/A") COL_TTFT+=("${M_TTFT:-0}") COL_RT+=("${M_LAT:-0}") PREC_TAG="GGUF" echo "$LMS_MODEL" | grep -q "8bit" && PREC_TAG="MLX 8-bit" echo "$LMS_MODEL" | grep -q "4bit" && PREC_TAG="MLX 4-bit" COL_PREC+=("$PREC_TAG") COL_ACCEL+=("CPU/GPU") LMS_TESTED_NAMES+=("$LMS_MODEL") LMS_TESTED_TPS+=("${M_TPS:-0}") LMS_TESTED_TTFT+=("${M_TTFT:-0}") LMS_TESTED_LAT+=("${M_LAT:-0}") fi done NUM_COLS=${#COL_NAMES[@]} COL_W=16 # Print header row printf "%-20s" "" for c in $(seq 0 $((NUM_COLS - 1))); do printf "%${COL_W}s" "${COL_NAMES[$c]}"; done echo "" printf '%.0s─' $(seq 1 $((20 + NUM_COLS * COL_W))); echo "" # Data rows printf "%-20s" "Decode (t/s)" for c in $(seq 0 $((NUM_COLS - 1))); do printf "%${COL_W}s" "${COL_DECODE[$c]}"; done echo "" printf "%-20s" "Prefill (t/s)" for c in $(seq 0 $((NUM_COLS - 1))); do printf "%${COL_W}s" "${COL_PREFILL[$c]}"; done echo "" printf "%-20s" "TTFT (ms)" for c in $(seq 0 $((NUM_COLS - 1))); do printf "%${COL_W}s" "${COL_TTFT[$c]}"; done echo "" printf "%-20s" "Round-trip (ms)" for c in $(seq 0 $((NUM_COLS - 1))); do printf "%${COL_W}s" "${COL_RT[$c]}"; done echo "" printf "%-20s" "Cold start (ms)" printf "%${COL_W}s" "$COLD_MS" for c in $(seq 1 $((NUM_COLS - 1))); do printf "%${COL_W}s" "N/A"; done echo "" printf '%.0s─' $(seq 1 $((20 + NUM_COLS * COL_W))); echo "" printf "%-20s" "Precision" for c in $(seq 0 $((NUM_COLS - 1))); do printf "%${COL_W}s" "${COL_PREC[$c]}"; done echo "" printf "%-20s" "Accelerator" for c in $(seq 0 $((NUM_COLS - 1))); do printf "%${COL_W}s" "${COL_ACCEL[$c]}"; done echo "" printf "%-20s" "Timing" for c in $(seq 0 $((NUM_COLS - 1))); do printf "%${COL_W}s" "Wall-clock"; done echo "" echo "" # Append LM Studio results to JSON LMS_JSON_BLOCK=", \"lm_studio\": { \"port\": $LMS_PORT, \"models_tested\": [$(printf '"%s",' "${LMS_MODEL_LIST[@]}" | sed 's/,$//')],$( echo "$LMS_ALL_JSON" | sed '$ s/,$//' ) } }" sed -i '' '$ s/}$//' "$RESULTS_JSON" printf '%s\n' "$LMS_JSON_BLOCK" >> "$RESULTS_JSON" dim "LM Studio results added to $RESULTS_JSON" else # No LM Studio -- print ANE-only comparison if we have multiple formats if [ "$NUM_ANE_FMTS" -gt 1 ]; then info "=== ANE Format Comparison ===" echo "" printf "%-20s" "" for fi2 in $(seq 0 $((NUM_ANE_FMTS - 1))); do printf "%16s" "ANE ${ANE_FMT_NAMES[$fi2]}"; done echo "" printf '%.0s─' $(seq 1 $((20 + NUM_ANE_FMTS * 16))); echo "" printf "%-20s" "Decode (t/s)" for fi2 in $(seq 0 $((NUM_ANE_FMTS - 1))); do printf "%16s" "${ALL_AVG_D[$fi2]}"; done echo "" printf "%-20s" "Prefill (t/s)" for fi2 in $(seq 0 $((NUM_ANE_FMTS - 1))); do printf "%16s" "${ALL_AVG_P[$fi2]}"; done echo "" printf "%-20s" "TTFT (ms)" for fi2 in $(seq 0 $((NUM_ANE_FMTS - 1))); do printf "%16s" "${ALL_AVG_TTFT[$fi2]}"; done echo "" printf "%-20s" "Round-trip (ms)" for fi2 in $(seq 0 $((NUM_ANE_FMTS - 1))); do printf "%16s" "${ALL_AVG_RT[$fi2]}"; done echo "" printf '%.0s─' $(seq 1 $((20 + NUM_ANE_FMTS * 16))); echo "" echo "" fi info "=== LM Studio Comparison ===" echo "" if [ "$LMS_REACHABLE" -eq 0 ]; then echo " LM Studio server not detected on localhost:$LMS_PORT" echo "" echo " To enable automatic comparison:" echo " 1. Open LM Studio, download Qwen2.5-0.5B-Instruct (GGUF + MLX variants)" echo " 2. Load the model, go to Developer tab > Start Server" echo " 3. Re-run this benchmark" echo "" echo " Or set env vars: LMS_PORT=1234 LMS_API_KEY=your-key ./benchmark.sh" echo "" echo " Models benchmarked by default:" echo " - qwen2.5-0.5b-instruct (GGUF)" echo " - qwen2.5-0.5b-instruct-mlx@8bit (MLX 8-bit)" echo " - qwen2.5-0.5b-instruct-mlx@4bit (MLX 4-bit)" echo "" echo " Override with: LMS_MODELS='model1,model2' ./benchmark.sh" fi echo "" echo " Manual test:" echo " curl http://localhost:1234/api/v1/chat \\" echo " -H 'Content-Type: application/json' \\" echo " -H 'Authorization: Bearer YOUR_API_KEY' \\" echo " -d '{\"model\":\"qwen2.5-0.5b-instruct\",\"system_prompt\":\"You are a helpful assistant.\",\"input\":\"What is 2+2?\"}'" echo "" echo " ANE F16: prefill=${AVG_P} t/s, decode=${AVG_D} t/s, inference=${AVG_INF}ms" echo "" echo " Note: LM Studio uses quantized GGUF/MLX (CPU/GPU) while we use" echo " F16/Q8 weights running on CPU AMX / NEON." fi echo ""