ANE/inference/setup.sh

162 lines
5.1 KiB
Bash
Executable File

#!/bin/bash
# Strict mode: stop on a failing command, an unset variable, or a failing
# pipeline stage.
set -euo pipefail

# Absolute path of the directory holding this script; build artifacts and the
# virtual environment are placed next to it.
SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)

# HuggingFace repository id, and the local directory its files are stored in
# (named after the last path component of the repository id).
MODEL_ID="Qwen/Qwen2.5-0.5B-Instruct"
MODEL_DIR="${HOME}/models/${MODEL_ID##*/}"

# Artifacts produced by this script.
WEIGHTS_BIN="${SCRIPT_DIR}/qwen05b.bin"
BINARY="${SCRIPT_DIR}/qwen_ane"
VENV_DIR="${SCRIPT_DIR}/.venv"

# File sizes in bytes that an existing weights file is compared against to
# decide whether it can be reused.
EXPECTED_WEIGHT_SIZE_F32=1976131100
EXPECTED_WEIGHT_SIZE_F16=988082236
# Status-line helpers. Each prints its first argument wrapped in a bold ANSI
# color sequence followed by a reset.
#   info  blue   "==> msg"   stdout
#   ok    green  " ✓ msg"    stdout
#   warn  yellow " ! msg"    stderr
#   fail  red    " ✗ msg"    stderr, then exits the script with status 1
# Warnings and errors go to stderr so that captured or piped stdout carries
# only regular progress output.
info() { printf '\033[1;34m==> %s\033[0m\n' "$1"; }
ok() { printf '\033[1;32m ✓ %s\033[0m\n' "$1"; }
warn() { printf '\033[1;33m ! %s\033[0m\n' "$1" >&2; }
fail() {
  printf '\033[1;31m ✗ %s\033[0m\n' "$1" >&2
  exit 1
}
info "ANE Inference Setup"
echo "Model: $MODEL_ID"
echo "Target: $SCRIPT_DIR"
echo ""
# --- Step 1: Prerequisites ---
info "Checking prerequisites..."

# xcrun is what the build step uses to invoke clang.
command -v xcrun &>/dev/null \
  || fail "Xcode Command Line Tools not found. Install with: xcode-select --install"
ok "xcrun clang available"

command -v python3 &>/dev/null || fail "Python 3 not found"

# Ask the interpreter for "major.minor", then split on the dot.
PY_VER=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')
PY_MAJOR=${PY_VER%%.*}
PY_MINOR=${PY_VER#*.}

# Reject anything older than 3.11.
if (( PY_MAJOR < 3 || (PY_MAJOR == 3 && PY_MINOR < 11) )); then
  fail "Python 3.11+ required (found $PY_VER). coremltools needs 3.11-3.13."
fi
ok "Python $PY_VER"
# --- Step 2: Virtual environment ---
info "Setting up Python environment..."
# Key on the activate script rather than on the directory: an interrupted
# earlier run can leave the directory behind without a usable environment,
# and the `source` below would then fail. Re-running `python3 -m venv` on an
# existing directory fills in whatever is missing.
if [ ! -f "$VENV_DIR/bin/activate" ]; then
  python3 -m venv "$VENV_DIR"
  ok "Created venv at $VENV_DIR"
else
  ok "Venv already exists"
fi
source "$VENV_DIR/bin/activate"
# Run pip as a module of the now-active interpreter (the invocation pip's own
# documentation recommends) so the packages land in this environment.
python3 -m pip install --quiet --upgrade pip
python3 -m pip install --quiet safetensors torch transformers huggingface-hub
ok "Python dependencies installed"
# --- Step 3: Download model ---
info "Downloading model from HuggingFace..."
# Files that must be present in MODEL_DIR. The same list decides whether a
# download is needed and is verified afterwards, so a partial download is
# repaired instead of being reported as "already downloaded" and then failing.
REQUIRED_MODEL_FILES=(model.safetensors tokenizer.json vocab.json merges.txt config.json)

MODEL_COMPLETE=1
for f in "${REQUIRED_MODEL_FILES[@]}"; do
  if [ ! -f "$MODEL_DIR/$f" ]; then
    MODEL_COMPLETE=0
    break
  fi
done

if [ "$MODEL_COMPLETE" -eq 1 ]; then
  ok "Model already downloaded at $MODEL_DIR"
else
  mkdir -p "$MODEL_DIR"
  if command -v huggingface-cli &>/dev/null; then
    huggingface-cli download "$MODEL_ID" --local-dir "$MODEL_DIR"
  else
    # Hand the repo id and target directory to Python as arguments instead of
    # splicing them into the source text, so quotes or backslashes in the
    # path cannot break or alter the Python code.
    python3 -c '
import sys
from huggingface_hub import snapshot_download
snapshot_download(sys.argv[1], local_dir=sys.argv[2])
' "$MODEL_ID" "$MODEL_DIR"
  fi
  ok "Model downloaded to $MODEL_DIR"
fi

# Verify key files exist
for f in "${REQUIRED_MODEL_FILES[@]}"; do
  if [ ! -f "$MODEL_DIR/$f" ]; then
    fail "Missing $f in $MODEL_DIR"
  fi
done
ok "All model files present"
# --- Step 4: Convert weights ---
info "Converting weights to binary format..."
# Set to 1 only when an existing weights file has one of the expected sizes.
WEIGHTS_READY=0
if [ -f "$WEIGHTS_BIN" ]; then
  # Try the BSD stat syntax first, then the GNU one.
  ACTUAL_SIZE=$(stat -f%z "$WEIGHTS_BIN" 2>/dev/null || stat -c%s "$WEIGHTS_BIN" 2>/dev/null)
  if [ "$ACTUAL_SIZE" -eq "$EXPECTED_WEIGHT_SIZE_F16" ] || [ "$ACTUAL_SIZE" -eq "$EXPECTED_WEIGHT_SIZE_F32" ]; then
    WEIGHTS_READY=1
    ok "Weights already converted ($((ACTUAL_SIZE / 1024 / 1024)) MB)"
  else
    warn "Weight file exists but unexpected size ($ACTUAL_SIZE), reconverting as F16"
  fi
fi

# A missing file and a file of unexpected size both end in the same F16
# conversion.
if [ "$WEIGHTS_READY" -eq 0 ]; then
  python3 "$SCRIPT_DIR/convert_weights.py" "$MODEL_DIR" "$WEIGHTS_BIN" --f16
  ok "Weights converted (F16)"
fi
# --- Step 5: Build binary ---
info "Building qwen_ane binary..."
NEEDS_BUILD=0
if [ ! -f "$BINARY" ]; then
  NEEDS_BUILD=1
else
  # Rebuild when any of these sources is newer than the existing binary.
  for src in main.m qwen_ane_infer.h tokenizer.h http_server.h; do
    if [ "$SCRIPT_DIR/$src" -nt "$BINARY" ]; then
      NEEDS_BUILD=1
      warn "Source files newer than binary, rebuilding"
      break
    fi
  done
fi

if [ "$NEEDS_BUILD" -ne 1 ]; then
  ok "Binary up to date"
else
  # Only main.m is passed to the compiler.
  CLANG_ARGS=(
    -O3 -ffast-math -mcpu=apple-m4 -flto
    -framework Foundation -framework IOSurface
    -framework CoreML -framework Accelerate -framework Metal
    -ldl -lobjc -fobjc-arc
    -o "$BINARY" "$SCRIPT_DIR/main.m"
  )
  xcrun clang "${CLANG_ARGS[@]}"
  ok "Binary built: $BINARY"
fi
# --- Step 6: Smoke test ---
info "Running smoke test..."
# Quick single-shot run with a short, fixed list of prompt token IDs.
# NOTE(review): only three IDs are passed, so this is at most the beginning of
# the "system\nYou are a helpful assistant." prompt; the trailing 3 is
# presumably the number of tokens to generate (confirm against main.m).
# stderr is merged into the capture and a failing run is tolerated, so a crash
# surfaces as the warning below instead of aborting setup.
TEST_OUTPUT=$("$BINARY" "$WEIGHTS_BIN" "151644 8948 198" 3 2>&1 || true)
# Match inside the shell rather than with `echo ... | grep -q`: under
# `set -o pipefail`, grep -q exiting at its first match can terminate the
# still-writing echo with SIGPIPE, which makes the pipeline report failure
# even though the marker was found.
if [[ "$TEST_OUTPUT" == *"OUT:"* ]]; then
  ok "Smoke test passed (model generates output)"
else
  warn "Smoke test: no output tokens detected (this may be OK on first run)"
  echo " Output was: $(printf '%s\n' "$TEST_OUTPUT" | tail -n 3)"
fi
# --- Done ---
echo ""
info "Setup complete!"
# Summary of the produced artifacts followed by usage examples, printed as one
# here-document. Variables and the du call are expanded when it is printed.
cat <<EOF

 Binary: $BINARY
 Weights: $WEIGHTS_BIN ($(du -h "$WEIGHTS_BIN" | cut -f1) )
 Model: $MODEL_DIR

Quick start:
 # Single prompt (slow, compiles every time)
 python3 $SCRIPT_DIR/run.py "What is 2+2?"

 # Server mode (fast, compile once)
 $BINARY $WEIGHTS_BIN --server /tmp/qwen_ane.sock &
 python3 $SCRIPT_DIR/run.py "What is 2+2?"

 # HTTP API (fast, no Python needed for queries)
 $BINARY $WEIGHTS_BIN --http 8000 --model-dir $MODEL_DIR
 curl http://localhost:8000/v1/completions -d '{"prompt":"Hi","max_tokens":20}'

 # Run throughput benchmark
 $SCRIPT_DIR/benchmark.sh
EOF