201 lines
8.6 KiB
Bash
Executable File
201 lines
8.6 KiB
Bash
Executable File
#!/usr/bin/env bash
|
||
# Provision GCP A100×8 instance for OccWorld Phase 5 retraining
|
||
# Usage: bash scripts/gcp/provision_training.sh [--dry-run]
|
||
#
|
||
# Provisions an a2-highgpu-8g (8× A100 40GB) in us-central1-a (fallback us-east1-b).
|
||
# GCP project: cognitum-20260110
|
||
# Auth: ruv@ruv.net (gcloud must already be authenticated)
|
||
|
||
set -euo pipefail
|
||
|
||
# ── Constants ──────────────────────────────────────────────────────────────────
|
||
PROJECT="cognitum-20260110"
|
||
INSTANCE_NAME="occworld-train-$(date +%Y%m%d)"
|
||
MACHINE_TYPE="a2-highgpu-8g"
|
||
PRIMARY_ZONE="us-central1-a"
|
||
FALLBACK_ZONE="us-east1-b"
|
||
IMAGE_FAMILY="pytorch-latest-gpu"
|
||
IMAGE_PROJECT="deeplearning-platform-release"
|
||
DISK_SIZE="500GB"
|
||
DISK_TYPE="pd-ssd"
|
||
# Cost reference: a2-highgpu-8g ~$29.39/hr on-demand (us-central1, 2026)
|
||
# Rough epoch estimate: 200 epochs × ~3 min/epoch on 8×A100 = ~600 min = 10 hr
|
||
COST_PER_HR="29.39"
|
||
EPOCH_HOURS="10"
|
||
|
||
# ── Flags ─────────────────────────────────────────────────────────────────────
|
||
DRY_RUN=false
|
||
for arg in "$@"; do
|
||
case "$arg" in
|
||
--dry-run) DRY_RUN=true ;;
|
||
-h|--help)
|
||
echo "Usage: $0 [--dry-run]"
|
||
echo " --dry-run Echo gcloud commands without executing them"
|
||
exit 0
|
||
;;
|
||
*)
|
||
echo "Unknown argument: $arg" >&2
|
||
echo "Usage: $0 [--dry-run]" >&2
|
||
exit 1
|
||
;;
|
||
esac
|
||
done
|
||
|
||
# ── Helpers ───────────────────────────────────────────────────────────────────
|
||
run() {
|
||
if [[ "$DRY_RUN" == "true" ]]; then
|
||
echo "[DRY-RUN] $*"
|
||
else
|
||
"$@"
|
||
fi
|
||
}
|
||
|
||
log() { echo "[provision_training] $*"; }
|
||
|
||
# ── Startup script (embedded heredoc) ─────────────────────────────────────────
|
||
# Written to a temp file so gcloud can reference it via --metadata-from-file.
|
||
STARTUP_SCRIPT_FILE="$(mktemp /tmp/startup_training_XXXXXX.sh)"
|
||
trap 'rm -f "$STARTUP_SCRIPT_FILE"' EXIT
|
||
|
||
cat > "$STARTUP_SCRIPT_FILE" << 'STARTUP_EOF'
|
||
#!/usr/bin/env bash
|
||
set -euo pipefail
|
||
LOGFILE="/var/log/ruview-startup.log"
|
||
exec > >(tee -a "$LOGFILE") 2>&1
|
||
|
||
echo "[startup] $(date): beginning environment setup"
|
||
|
||
# ── 1. System packages ────────────────────────────────────────────────────────
|
||
apt-get update -qq
|
||
apt-get install -y -qq git rsync wget curl htop nvtop screen tmux
|
||
|
||
# ── 2. Conda (miniforge) ──────────────────────────────────────────────────────
|
||
if [[ ! -d /opt/conda ]]; then
|
||
echo "[startup] Installing miniforge ..."
|
||
MINI_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh"
|
||
wget -q "$MINI_URL" -O /tmp/miniforge.sh
|
||
bash /tmp/miniforge.sh -b -p /opt/conda
|
||
rm /tmp/miniforge.sh
|
||
fi
|
||
export PATH="/opt/conda/bin:$PATH"
|
||
conda init bash
|
||
|
||
# ── 3. OccWorld conda env ─────────────────────────────────────────────────────
|
||
if ! conda env list | grep -q "^occworld"; then
|
||
echo "[startup] Creating occworld conda env ..."
|
||
conda create -y -n occworld python=3.10
|
||
fi
|
||
|
||
# shellcheck source=/dev/null
|
||
source /opt/conda/etc/profile.d/conda.sh
|
||
conda activate occworld
|
||
|
||
# PyTorch 2.x + CUDA 12 (deeplearning image ships CUDA 12)
|
||
pip install -q --upgrade pip
|
||
pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
|
||
pip install -q \
|
||
numpy scipy einops timm mmcv-full \
|
||
tensorboard wandb tqdm pyyaml \
|
||
huggingface_hub accelerate
|
||
|
||
# ── 4. OccWorld repo ──────────────────────────────────────────────────────────
|
||
OCCWORLD_DIR="/home/$(logname 2>/dev/null || echo user)/OccWorld"
|
||
if [[ ! -d "$OCCWORLD_DIR" ]]; then
|
||
echo "[startup] Cloning OccWorld ..."
|
||
git clone --depth=1 https://github.com/OpenDriveLab/OccWorld.git "$OCCWORLD_DIR"
|
||
fi
|
||
cd "$OCCWORLD_DIR"
|
||
pip install -q -r requirements.txt 2>/dev/null || true
|
||
|
||
# ── 5. RuView repo sync placeholder ──────────────────────────────────────────
|
||
# Actual repo sync is done by run_training.sh via rsync before SSH commands.
|
||
mkdir -p ~/ruview-scripts ~/checkpoints/vqvae ~/checkpoints/transformer
|
||
|
||
echo "[startup] $(date): setup complete — instance ready for training"
|
||
STARTUP_EOF
|
||
|
||
# ── Zone availability check ────────────────────────────────────────────────────
|
||
ZONE="$PRIMARY_ZONE"
|
||
if [[ "$DRY_RUN" == "false" ]]; then
|
||
log "Checking A100 availability in $PRIMARY_ZONE ..."
|
||
AVAIL=$(gcloud compute accelerator-types list \
|
||
--project="$PROJECT" \
|
||
--filter="name=nvidia-tesla-a100 AND zone=$PRIMARY_ZONE" \
|
||
--format="value(name)" 2>/dev/null | head -1)
|
||
if [[ -z "$AVAIL" ]]; then
|
||
log "A100 not available in $PRIMARY_ZONE — falling back to $FALLBACK_ZONE"
|
||
ZONE="$FALLBACK_ZONE"
|
||
else
|
||
log "A100 confirmed available in $PRIMARY_ZONE"
|
||
fi
|
||
else
|
||
log "[DRY-RUN] Would check A100 availability in $PRIMARY_ZONE (fallback: $FALLBACK_ZONE)"
|
||
fi
|
||
|
||
# ── Cost estimate ──────────────────────────────────────────────────────────────
|
||
TOTAL_COST=$(awk "BEGIN {printf \"%.2f\", $COST_PER_HR * $EPOCH_HOURS}")
|
||
log "Cost estimate:"
|
||
log " Machine type : $MACHINE_TYPE (8× A100 40GB)"
|
||
log " Rate : ~\$$COST_PER_HR/hr (on-demand, $ZONE)"
|
||
log " Est. duration: ~${EPOCH_HOURS} hr (200 epochs, 8×A100)"
|
||
log " Est. total : ~\$$TOTAL_COST"
|
||
log " Tip: Use --preemptible to cut cost ~60% at the risk of interruptions"
|
||
|
||
# ── Provision instance ────────────────────────────────────────────────────────
|
||
log "Provisioning $INSTANCE_NAME in $ZONE ..."
|
||
|
||
run gcloud compute instances create "$INSTANCE_NAME" \
|
||
--project="$PROJECT" \
|
||
--zone="$ZONE" \
|
||
--machine-type="$MACHINE_TYPE" \
|
||
--accelerator="type=nvidia-tesla-a100,count=8" \
|
||
--image-family="$IMAGE_FAMILY" \
|
||
--image-project="$IMAGE_PROJECT" \
|
||
--boot-disk-size="$DISK_SIZE" \
|
||
--boot-disk-type="$DISK_TYPE" \
|
||
--boot-disk-device-name="${INSTANCE_NAME}-disk" \
|
||
--maintenance-policy=TERMINATE \
|
||
--restart-on-failure \
|
||
--metadata-from-file="startup-script=$STARTUP_SCRIPT_FILE" \
|
||
--scopes="cloud-platform" \
|
||
--format="value(name)"
|
||
|
||
if [[ "$DRY_RUN" == "true" ]]; then
|
||
log "[DRY-RUN] Skipping IP lookup and SSH command output"
|
||
exit 0
|
||
fi
|
||
|
||
# ── Wait for instance to be ready ─────────────────────────────────────────────
|
||
log "Waiting for instance to reach RUNNING state ..."
|
||
for i in $(seq 1 30); do
|
||
STATUS=$(gcloud compute instances describe "$INSTANCE_NAME" \
|
||
--project="$PROJECT" --zone="$ZONE" \
|
||
--format="value(status)" 2>/dev/null || echo "UNKNOWN")
|
||
if [[ "$STATUS" == "RUNNING" ]]; then
|
||
break
|
||
fi
|
||
sleep 10
|
||
if [[ $i -eq 30 ]]; then
|
||
log "ERROR: Instance did not reach RUNNING within 5 min" >&2
|
||
exit 1
|
||
fi
|
||
done
|
||
|
||
# ── Print connection info ─────────────────────────────────────────────────────
|
||
INSTANCE_IP=$(gcloud compute instances describe "$INSTANCE_NAME" \
|
||
--project="$PROJECT" --zone="$ZONE" \
|
||
--format="value(networkInterfaces[0].accessConfigs[0].natIP)")
|
||
|
||
log "Instance ready:"
|
||
log " Name : $INSTANCE_NAME"
|
||
log " Zone : $ZONE"
|
||
log " IP : $INSTANCE_IP"
|
||
log " SSH : gcloud compute ssh $INSTANCE_NAME --project=$PROJECT --zone=$ZONE"
|
||
log " SSH IP : ssh $(gcloud config get-value account 2>/dev/null)@$INSTANCE_IP"
|
||
log ""
|
||
log "Startup script is running in background (/var/log/ruview-startup.log)."
|
||
log "Wait 3-5 min for conda/deps before running run_training.sh."
|
||
log ""
|
||
log "Next step:"
|
||
log " bash scripts/gcp/run_training.sh $INSTANCE_IP <SNAPSHOT_DIR>"
|