mirror of https://github.com/maderix/ANE.git
162 lines
5.1 KiB
Bash
Executable File
162 lines
5.1 KiB
Bash
Executable File
#!/bin/bash
|
|
set -euo pipefail
|
|
|
|
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
|
MODEL_ID="Qwen/Qwen2.5-0.5B-Instruct"
|
|
MODEL_DIR="$HOME/models/Qwen2.5-0.5B-Instruct"
|
|
WEIGHTS_BIN="$SCRIPT_DIR/qwen05b.bin"
|
|
BINARY="$SCRIPT_DIR/qwen_ane"
|
|
VENV_DIR="$SCRIPT_DIR/.venv"
|
|
EXPECTED_WEIGHT_SIZE_F32=1976131100
|
|
EXPECTED_WEIGHT_SIZE_F16=988082236
|
|
|
|
info() { printf "\033[1;34m==> %s\033[0m\n" "$1"; }
|
|
ok() { printf "\033[1;32m ✓ %s\033[0m\n" "$1"; }
|
|
warn() { printf "\033[1;33m ! %s\033[0m\n" "$1"; }
|
|
fail() { printf "\033[1;31m ✗ %s\033[0m\n" "$1"; exit 1; }
|
|
|
|
info "ANE Inference Setup"
|
|
echo "Model: $MODEL_ID"
|
|
echo "Target: $SCRIPT_DIR"
|
|
echo ""
|
|
|
|
# --- Step 1: Prerequisites ---
|
|
info "Checking prerequisites..."
|
|
|
|
if ! command -v xcrun &>/dev/null; then
|
|
fail "Xcode Command Line Tools not found. Install with: xcode-select --install"
|
|
fi
|
|
ok "xcrun clang available"
|
|
|
|
if ! command -v python3 &>/dev/null; then
|
|
fail "Python 3 not found"
|
|
fi
|
|
|
|
PY_VER=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')
|
|
PY_MAJOR=$(echo "$PY_VER" | cut -d. -f1)
|
|
PY_MINOR=$(echo "$PY_VER" | cut -d. -f2)
|
|
if [ "$PY_MAJOR" -lt 3 ] || ([ "$PY_MAJOR" -eq 3 ] && [ "$PY_MINOR" -lt 11 ]); then
|
|
fail "Python 3.11+ required (found $PY_VER). coremltools needs 3.11-3.13."
|
|
fi
|
|
ok "Python $PY_VER"
|
|
|
|
# --- Step 2: Virtual environment ---
|
|
info "Setting up Python environment..."
|
|
|
|
if [ ! -d "$VENV_DIR" ]; then
|
|
python3 -m venv "$VENV_DIR"
|
|
ok "Created venv at $VENV_DIR"
|
|
else
|
|
ok "Venv already exists"
|
|
fi
|
|
|
|
source "$VENV_DIR/bin/activate"
|
|
|
|
pip install --quiet --upgrade pip
|
|
pip install --quiet safetensors torch transformers huggingface-hub
|
|
ok "Python dependencies installed"
|
|
|
|
# --- Step 3: Download model ---
|
|
info "Downloading model from HuggingFace..."
|
|
|
|
if [ -f "$MODEL_DIR/model.safetensors" ] && [ -f "$MODEL_DIR/tokenizer.json" ]; then
|
|
ok "Model already downloaded at $MODEL_DIR"
|
|
else
|
|
mkdir -p "$MODEL_DIR"
|
|
if command -v huggingface-cli &>/dev/null; then
|
|
huggingface-cli download "$MODEL_ID" --local-dir "$MODEL_DIR"
|
|
else
|
|
python3 -c "
|
|
from huggingface_hub import snapshot_download
|
|
snapshot_download('$MODEL_ID', local_dir='$MODEL_DIR')
|
|
"
|
|
fi
|
|
ok "Model downloaded to $MODEL_DIR"
|
|
fi
|
|
|
|
# Verify key files exist
|
|
for f in model.safetensors tokenizer.json vocab.json merges.txt config.json; do
|
|
if [ ! -f "$MODEL_DIR/$f" ]; then
|
|
fail "Missing $f in $MODEL_DIR"
|
|
fi
|
|
done
|
|
ok "All model files present"
|
|
|
|
# --- Step 4: Convert weights ---
|
|
info "Converting weights to binary format..."
|
|
|
|
if [ -f "$WEIGHTS_BIN" ]; then
|
|
ACTUAL_SIZE=$(stat -f%z "$WEIGHTS_BIN" 2>/dev/null || stat -c%s "$WEIGHTS_BIN" 2>/dev/null)
|
|
if [ "$ACTUAL_SIZE" -eq "$EXPECTED_WEIGHT_SIZE_F16" ] || [ "$ACTUAL_SIZE" -eq "$EXPECTED_WEIGHT_SIZE_F32" ]; then
|
|
ok "Weights already converted ($((ACTUAL_SIZE / 1024 / 1024)) MB)"
|
|
else
|
|
warn "Weight file exists but unexpected size ($ACTUAL_SIZE), reconverting as F16"
|
|
python3 "$SCRIPT_DIR/convert_weights.py" "$MODEL_DIR" "$WEIGHTS_BIN" --f16
|
|
ok "Weights converted (F16)"
|
|
fi
|
|
else
|
|
python3 "$SCRIPT_DIR/convert_weights.py" "$MODEL_DIR" "$WEIGHTS_BIN" --f16
|
|
ok "Weights converted (F16)"
|
|
fi
|
|
|
|
# --- Step 5: Build binary ---
|
|
info "Building qwen_ane binary..."
|
|
|
|
NEEDS_BUILD=0
|
|
if [ ! -f "$BINARY" ]; then
|
|
NEEDS_BUILD=1
|
|
elif [ "$SCRIPT_DIR/main.m" -nt "$BINARY" ] || \
|
|
[ "$SCRIPT_DIR/qwen_ane_infer.h" -nt "$BINARY" ] || \
|
|
[ "$SCRIPT_DIR/tokenizer.h" -nt "$BINARY" ] 2>/dev/null || \
|
|
[ "$SCRIPT_DIR/http_server.h" -nt "$BINARY" ] 2>/dev/null; then
|
|
NEEDS_BUILD=1
|
|
warn "Source files newer than binary, rebuilding"
|
|
fi
|
|
|
|
if [ "$NEEDS_BUILD" -eq 1 ]; then
|
|
xcrun clang -O3 -ffast-math -mcpu=apple-m4 -flto \
|
|
-framework Foundation -framework IOSurface \
|
|
-framework CoreML -framework Accelerate -framework Metal \
|
|
-ldl -lobjc -fobjc-arc \
|
|
-o "$BINARY" "$SCRIPT_DIR/main.m"
|
|
ok "Binary built: $BINARY"
|
|
else
|
|
ok "Binary up to date"
|
|
fi
|
|
|
|
# --- Step 6: Smoke test ---
|
|
info "Running smoke test..."
|
|
|
|
# Quick single-shot test with known token IDs for "system\nYou are a helpful assistant."
|
|
TEST_OUTPUT=$("$BINARY" "$WEIGHTS_BIN" "151644 8948 198" 3 2>&1 || true)
|
|
|
|
if echo "$TEST_OUTPUT" | grep -q "OUT:"; then
|
|
ok "Smoke test passed (model generates output)"
|
|
else
|
|
warn "Smoke test: no output tokens detected (this may be OK on first run)"
|
|
echo " Output was: $(echo "$TEST_OUTPUT" | tail -3)"
|
|
fi
|
|
|
|
# --- Done ---
|
|
echo ""
|
|
info "Setup complete!"
|
|
echo ""
|
|
echo " Binary: $BINARY"
|
|
echo " Weights: $WEIGHTS_BIN ($(du -h "$WEIGHTS_BIN" | cut -f1) )"
|
|
echo " Model: $MODEL_DIR"
|
|
echo ""
|
|
echo "Quick start:"
|
|
echo " # Single prompt (slow, compiles every time)"
|
|
echo " python3 $SCRIPT_DIR/run.py \"What is 2+2?\""
|
|
echo ""
|
|
echo " # Server mode (fast, compile once)"
|
|
echo " $BINARY $WEIGHTS_BIN --server /tmp/qwen_ane.sock &"
|
|
echo " python3 $SCRIPT_DIR/run.py \"What is 2+2?\""
|
|
echo ""
|
|
echo " # HTTP API (fast, no Python needed for queries)"
|
|
echo " $BINARY $WEIGHTS_BIN --http 8000 --model-dir $MODEL_DIR"
|
|
echo " curl http://localhost:8000/v1/completions -d '{\"prompt\":\"Hi\",\"max_tokens\":20}'"
|
|
echo ""
|
|
echo " # Run throughput benchmark"
|
|
echo " $SCRIPT_DIR/benchmark.sh"
|