ANE/inference/setup.sh

#!/bin/bash
# Strict mode: abort on command failure, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -e -u -o pipefail

# Absolute path of the directory that contains this script; the artifacts
# below are placed next to it.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

# Model repository id and the local directory the model is downloaded into
# (the directory name is the part of the id after the slash).
MODEL_ID="Qwen/Qwen2.5-0.5B-Instruct"
MODEL_DIR="${HOME}/models/${MODEL_ID##*/}"

# Paths of the converted weights file, the compiled binary and the Python
# virtual environment.
WEIGHTS_BIN="${SCRIPT_DIR}/qwen05b.bin"
BINARY="${SCRIPT_DIR}/qwen_ane"
VENV_DIR="${SCRIPT_DIR}/.venv"

# Byte sizes of a weights file that the conversion step accepts as already
# converted (judging by the names: 32-bit and 16-bit float variants).
EXPECTED_WEIGHT_SIZE_F32=1976131100
EXPECTED_WEIGHT_SIZE_F16=988082236

# Status-line helpers. Each one prints its single argument wrapped in a bold
# ANSI colour sequence followed by a reset and a newline.

# Section heading: "==> message" in blue.
info() {
    local msg=$1
    printf '\033[1;34m==> %s\033[0m\n' "$msg"
}

# Success line: "  ✓ message" in green.
ok() {
    local msg=$1
    printf '\033[1;32m  ✓ %s\033[0m\n' "$msg"
}

# Warning line: "  ! message" in yellow; execution continues.
warn() {
    local msg=$1
    printf '\033[1;33m  ! %s\033[0m\n' "$msg"
}

# Error line: "  ✗ message" in red, then terminate the script with status 1.
fail() {
    local msg=$1
    printf '\033[1;31m  ✗ %s\033[0m\n' "$msg"
    exit 1
}

# Banner: announce the run, then show which model is being set up and where
# the artifacts will be placed, followed by a blank line.
info "ANE Inference Setup"
printf 'Model: %s\nTarget: %s\n\n' "$MODEL_ID" "$SCRIPT_DIR"

# --- Step 1: Prerequisites ---
# Confirm the required tools are on PATH before doing any real work.
info "Checking prerequisites..."

# xcrun is what the build step uses to invoke clang.
command -v xcrun &>/dev/null \
    || fail "Xcode Command Line Tools not found. Install with: xcode-select --install"
ok "xcrun clang available"

command -v python3 &>/dev/null || fail "Python 3 not found"

# Ask the interpreter for "MAJOR.MINOR", then split on the dot.
PY_VER=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')
PY_MAJOR=${PY_VER%%.*}
PY_MINOR=${PY_VER#*.}

# Reject anything older than 3.11 (only a lower bound is enforced here).
if [ "$PY_MAJOR" -lt 3 ] || { [ "$PY_MAJOR" -eq 3 ] && [ "$PY_MINOR" -lt 11 ]; }; then
    fail "Python 3.11+ required (found $PY_VER). coremltools needs 3.11-3.13."
fi
ok "Python $PY_VER"

# --- Step 2: Virtual environment ---
# Create (or reuse) the virtual environment next to this script, activate it,
# and install the Python packages used by the later steps.
info "Setting up Python environment..."

# Key the "already exists" check on the activate script rather than on the
# directory: an interrupted `python3 -m venv` can leave the directory behind
# without bin/activate, which would make the `source` below fail on every
# subsequent run. Re-running venv on an existing directory is safe.
if [ ! -f "$VENV_DIR/bin/activate" ]; then
    python3 -m venv "$VENV_DIR"
    ok "Created venv at $VENV_DIR"
else
    ok "Venv already exists"
fi

# shellcheck disable=SC1091 -- the activate script is generated at runtime
source "$VENV_DIR/bin/activate"

# Run pip as a module of the now-active interpreter so the packages are
# guaranteed to land in this venv, and so pip can upgrade itself cleanly.
python3 -m pip install --quiet --upgrade pip
python3 -m pip install --quiet safetensors torch transformers huggingface-hub
ok "Python dependencies installed"

# --- Step 3: Download model ---
# Fetch the model into MODEL_DIR unless every required file is already there,
# then verify that all required files exist.
info "Downloading model from HuggingFace..."

# Key files that must be present in MODEL_DIR. The same list drives both the
# "already downloaded?" check and the final verification, so a partial
# download triggers a re-download instead of a permanent "Missing ..." error.
MODEL_FILES=(model.safetensors tokenizer.json vocab.json merges.txt config.json)

MODEL_COMPLETE=1
for f in "${MODEL_FILES[@]}"; do
    if [ ! -f "$MODEL_DIR/$f" ]; then
        MODEL_COMPLETE=0
        break
    fi
done

if [ "$MODEL_COMPLETE" -eq 1 ]; then
    ok "Model already downloaded at $MODEL_DIR"
else
    mkdir -p "$MODEL_DIR"
    if command -v huggingface-cli &>/dev/null; then
        huggingface-cli download "$MODEL_ID" --local-dir "$MODEL_DIR"
    else
        # Pass the repo id and target directory as arguments instead of
        # splicing them into the Python source, so quotes or backslashes in
        # the path cannot break (or alter) the Python code.
        python3 -c '
import sys
from huggingface_hub import snapshot_download
snapshot_download(sys.argv[1], local_dir=sys.argv[2])
' "$MODEL_ID" "$MODEL_DIR"
    fi
    ok "Model downloaded to $MODEL_DIR"
fi

# Verify key files exist
for f in "${MODEL_FILES[@]}"; do
    if [ ! -f "$MODEL_DIR/$f" ]; then
        fail "Missing $f in $MODEL_DIR"
    fi
done
ok "All model files present"

# --- Step 4: Convert weights ---
# Run convert_weights.py (with --f16) unless a weights file of one of the two
# expected sizes is already in place.
info "Converting weights to binary format..."

NEEDS_CONVERT=1
if [ -f "$WEIGHTS_BIN" ]; then
    # File size in bytes: try the BSD form of stat first, then the GNU form.
    ACTUAL_SIZE=$(stat -f%z "$WEIGHTS_BIN" 2>/dev/null || stat -c%s "$WEIGHTS_BIN" 2>/dev/null)
    if [ "$ACTUAL_SIZE" -eq "$EXPECTED_WEIGHT_SIZE_F16" ] || [ "$ACTUAL_SIZE" -eq "$EXPECTED_WEIGHT_SIZE_F32" ]; then
        NEEDS_CONVERT=0
        ok "Weights already converted ($((ACTUAL_SIZE / 1024 / 1024)) MB)"
    else
        warn "Weight file exists but unexpected size ($ACTUAL_SIZE), reconverting as F16"
    fi
fi

if [ "$NEEDS_CONVERT" -eq 1 ]; then
    python3 "$SCRIPT_DIR/convert_weights.py" "$MODEL_DIR" "$WEIGHTS_BIN" --f16
    ok "Weights converted (F16)"
fi

# --- Step 5: Build binary ---
# Compile main.m into BINARY when the binary is missing or when main.m or one
# of the listed headers has a newer modification time than the binary.
info "Building qwen_ane binary..."

NEEDS_BUILD=0
if [ ! -f "$BINARY" ]; then
    NEEDS_BUILD=1
else
    # A dependency that does not exist simply never compares as newer.
    for dep in main.m qwen_ane_infer.h tokenizer.h http_server.h; do
        if [ "$SCRIPT_DIR/$dep" -nt "$BINARY" ]; then
            NEEDS_BUILD=1
            warn "Source files newer than binary, rebuilding"
            break
        fi
    done
fi

if [ "$NEEDS_BUILD" -eq 1 ]; then
    # Compiler arguments, kept in an array so every path stays one word.
    CLANG_ARGS=(
        -O3 -ffast-math -mcpu=apple-m4 -flto
        -framework Foundation -framework IOSurface
        -framework CoreML -framework Accelerate -framework Metal
        -ldl -lobjc -fobjc-arc
        -o "$BINARY" "$SCRIPT_DIR/main.m"
    )
    xcrun clang "${CLANG_ARGS[@]}"
    ok "Binary built: $BINARY"
else
    ok "Binary up to date"
fi

# --- Step 6: Smoke test ---
# Run the freshly built binary once and look for an "OUT:" marker in its
# combined stdout/stderr. A miss only produces a warning, never a failure.
info "Running smoke test..."

# Quick single-shot test: three fixed prompt token IDs followed by the
# argument 3 (presumably the number of tokens to generate -- confirm against
# main.m). The IDs look like the opening of the chat system prompt rather
# than the whole "You are a helpful assistant." text -- verify with the
# tokenizer. `|| true` keeps a non-zero exit from aborting under `set -e`.
TEST_OUTPUT=$("$BINARY" "$WEIGHTS_BIN" "151644 8948 198" 3 2>&1 || true)

# Feed the captured text with here-strings instead of `echo ... | grep -q`:
# under `set -o pipefail`, grep -q exiting at the first match can kill the
# writer with SIGPIPE on large output, which would make the pipeline report
# failure even though the marker was found.
if grep -q "OUT:" <<<"$TEST_OUTPUT"; then
    ok "Smoke test passed (model generates output)"
else
    warn "Smoke test: no output tokens detected (this may be OK on first run)"
    echo "  Output was: $(tail -3 <<<"$TEST_OUTPUT")"
fi

# --- Done ---
# Final summary: where the artifacts are, plus copy-paste example commands.
printf '\n'
info "Setup complete!"

# The whole summary is one here-document; variables and the `du` size lookup
# are expanded, everything else (quotes, '&', braces) is emitted literally.
cat <<EOF

  Binary:  $BINARY
  Weights: $WEIGHTS_BIN ($(du -h "$WEIGHTS_BIN" | cut -f1) )
  Model:   $MODEL_DIR

Quick start:
  # Single prompt (slow, compiles every time)
  python3 $SCRIPT_DIR/run.py "What is 2+2?"

  # Server mode (fast, compile once)
  $BINARY $WEIGHTS_BIN --server /tmp/qwen_ane.sock &
  python3 $SCRIPT_DIR/run.py "What is 2+2?"

  # HTTP API (fast, no Python needed for queries)
  $BINARY $WEIGHTS_BIN --http 8000 --model-dir $MODEL_DIR
  curl http://localhost:8000/v1/completions -d '{"prompt":"Hi","max_tokens":20}'

  # Run throughput benchmark
  $SCRIPT_DIR/benchmark.sh
EOF