#!/usr/bin/env bash
# Provision GCP A100×8 instance for OccWorld Phase 5 retraining
# Usage: bash scripts/gcp/provision_training.sh [--dry-run]
#
# Provisions an a2-highgpu-8g (8× A100 40GB) in us-central1-a (fallback us-east1-b).
# GCP project: cognitum-20260110
# Auth: ruv@ruv.net (gcloud must already be authenticated)
#
# Flow of this part of the script:
#   1. parse flags (--dry-run / --help)
#   2. write the VM startup script to a temp file (removed again on exit)
#   3. pick the zone (primary unless the A100 type cannot be confirmed there)
#   4. print a rough cost estimate

set -euo pipefail

# ── Constants ──────────────────────────────────────────────────────────────────
PROJECT="cognitum-20260110"
INSTANCE_NAME="occworld-train-$(date +%Y%m%d)"
MACHINE_TYPE="a2-highgpu-8g"
PRIMARY_ZONE="us-central1-a"
FALLBACK_ZONE="us-east1-b"
IMAGE_FAMILY="pytorch-latest-gpu"
IMAGE_PROJECT="deeplearning-platform-release"
DISK_SIZE="500GB"
DISK_TYPE="pd-ssd"

# Cost reference: a2-highgpu-8g ~$29.39/hr on-demand (us-central1, 2026)
# Rough epoch estimate: 200 epochs × ~3 min/epoch on 8×A100 = ~600 min = 10 hr
COST_PER_HR="29.39"
EPOCH_HOURS="10"

# Declared readonly in a separate statement so a failing $(date) above is not
# masked by the exit status of `readonly` itself.
readonly PROJECT INSTANCE_NAME MACHINE_TYPE PRIMARY_ZONE FALLBACK_ZONE
readonly IMAGE_FAMILY IMAGE_PROJECT DISK_SIZE DISK_TYPE COST_PER_HR EPOCH_HOURS

# ── Flags ─────────────────────────────────────────────────────────────────────
DRY_RUN=false
for arg in "$@"; do
  case "$arg" in
    --dry-run)
      DRY_RUN=true
      ;;
    -h|--help)
      echo "Usage: $0 [--dry-run]"
      echo " --dry-run Echo gcloud commands without executing them"
      exit 0
      ;;
    *)
      echo "Unknown argument: $arg" >&2
      echo "Usage: $0 [--dry-run]" >&2
      exit 1
      ;;
  esac
done

# ── Helpers ───────────────────────────────────────────────────────────────────
# run CMD [ARGS...]
#   Executes CMD with ARGS, or only echoes it when DRY_RUN is "true".
run() {
  if [[ "$DRY_RUN" == "true" ]]; then
    echo "[DRY-RUN] $*"
  else
    "$@"
  fi
}

# log MESSAGE...
#   Prints MESSAGE to stdout with the script prefix.
log() { echo "[provision_training] $*"; }

# ── Startup script (embedded heredoc) ─────────────────────────────────────────
# Written to a temp file so gcloud can reference it via --metadata-from-file.
# The template ends in X's: BSD/macOS mktemp only substitutes trailing X's, so a
# ".sh" suffix after them would produce a fixed, predictable file name there.
STARTUP_SCRIPT_FILE="$(mktemp "${TMPDIR:-/tmp}/startup_training.XXXXXX")"
trap 'rm -f -- "$STARTUP_SCRIPT_FILE"' EXIT

# Quoted delimiter: nothing below is expanded locally; it runs on the VM.
cat > "$STARTUP_SCRIPT_FILE" << 'STARTUP_EOF'
#!/usr/bin/env bash
set -euo pipefail

LOGFILE="/var/log/ruview-startup.log"
exec > >(tee -a "$LOGFILE") 2>&1

echo "[startup] $(date): beginning environment setup"

# ── 1. System packages ────────────────────────────────────────────────────────
# No terminal is attached while this runs; keep apt from waiting on prompts.
export DEBIAN_FRONTEND=noninteractive
apt-get update -qq
apt-get install -y -qq git rsync wget curl htop nvtop screen tmux

# ── 2. Conda (miniforge) ──────────────────────────────────────────────────────
if [[ ! -d /opt/conda ]]; then
  echo "[startup] Installing miniforge ..."
  MINI_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh"
  wget -q "$MINI_URL" -O /tmp/miniforge.sh
  bash /tmp/miniforge.sh -b -p /opt/conda
  rm /tmp/miniforge.sh
fi
export PATH="/opt/conda/bin:$PATH"
conda init bash

# ── 3. OccWorld conda env ─────────────────────────────────────────────────────
# Exact match on the env name (a "^occworld" prefix match would also accept
# e.g. "occworld2"). awk consumes all of conda's output, so conda cannot be
# killed by SIGPIPE and trip pipefail the way an early-exiting grep -q can.
if ! conda env list | awk '$1 == "occworld" { found = 1 } END { exit !found }'; then
  echo "[startup] Creating occworld conda env ..."
  conda create -y -n occworld python=3.10
fi
# conda's activation hooks may read variables that are unset in a
# non-interactive shell; relax nounset while they run.
set +u
# shellcheck source=/dev/null
source /opt/conda/etc/profile.d/conda.sh
conda activate occworld
set -u

# PyTorch 2.x + CUDA 12 (deeplearning image ships CUDA 12)
pip install -q --upgrade pip
pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
pip install -q \
  numpy scipy einops timm mmcv-full \
  tensorboard wandb tqdm pyyaml \
  huggingface_hub accelerate

# ── 4. OccWorld repo ──────────────────────────────────────────────────────────
# logname has no login session to report when this runs without one; the
# fallback directory owner is then the literal "user".
OCCWORLD_DIR="/home/$(logname 2>/dev/null || echo user)/OccWorld"
if [[ ! -d "$OCCWORLD_DIR" ]]; then
  echo "[startup] Cloning OccWorld ..."
  git clone --depth=1 https://github.com/OpenDriveLab/OccWorld.git "$OCCWORLD_DIR"
fi
cd "$OCCWORLD_DIR"
# Best effort on purpose: a missing or partially installable requirements.txt
# must not abort the rest of the setup.
pip install -q -r requirements.txt 2>/dev/null || true

# ── 5. RuView repo sync placeholder ──────────────────────────────────────────
# Actual repo sync is done by run_training.sh via rsync before SSH commands.
mkdir -p ~/ruview-scripts ~/checkpoints/vqvae ~/checkpoints/transformer

echo "[startup] $(date): setup complete — instance ready for training"
STARTUP_EOF

# ── Zone availability check ────────────────────────────────────────────────────
# This only confirms that the A100 accelerator type is listed for the primary
# zone; it does not reserve or guarantee capacity.
ZONE="$PRIMARY_ZONE"
if [[ "$DRY_RUN" == "false" ]]; then
  log "Checking A100 availability in $PRIMARY_ZONE ..."
  AVAIL=""
  # The query is best effort. Running it as an `if` condition keeps a gcloud
  # failure from terminating the script silently under `set -e` / pipefail
  # (stderr is discarded), and avoiding `| head` removes a SIGPIPE failure mode.
  if accel_listing="$(gcloud compute accelerator-types list \
      --project="$PROJECT" \
      --filter="name=nvidia-tesla-a100 AND zone=$PRIMARY_ZONE" \
      --format="value(name)" 2>/dev/null)"; then
    AVAIL="${accel_listing%%$'\n'*}"   # first line only
  else
    log "WARNING: could not query accelerator types in $PRIMARY_ZONE (gcloud failed); treating A100 as unavailable" >&2
  fi

  if [[ -z "$AVAIL" ]]; then
    log "A100 not available in $PRIMARY_ZONE — falling back to $FALLBACK_ZONE"
    ZONE="$FALLBACK_ZONE"
  else
    log "A100 confirmed available in $PRIMARY_ZONE"
  fi
else
  log "[DRY-RUN] Would check A100 availability in $PRIMARY_ZONE (fallback: $FALLBACK_ZONE)"
fi

# ── Cost estimate ──────────────────────────────────────────────────────────────
# Values are handed to awk with -v instead of being spliced into the program text.
TOTAL_COST="$(awk -v rate="$COST_PER_HR" -v hours="$EPOCH_HOURS" \
  'BEGIN { printf "%.2f", rate * hours }')"
log "Cost estimate:"
log " Machine type : $MACHINE_TYPE (8× A100 40GB)"
log " Rate : ~\$$COST_PER_HR/hr (on-demand, $ZONE)"
log " Est. duration: ~${EPOCH_HOURS} hr (200 epochs, 8×A100)"
log " Est. total : ~\$$TOTAL_COST"
log " Tip: Use --preemptible to cut cost ~60% at the risk of interruptions"

# ── Provision instance ────────────────────────────────────────────────────────
log "Provisioning $INSTANCE_NAME in $ZONE ..."
# Create the VM; in dry-run mode `run` only echoes the command.
# install-nvidia-driver=True is the metadata key Deep Learning VM images use to
# install the NVIDIA driver on first boot, so the GPUs are usable without an
# interactive login first.
run gcloud compute instances create "$INSTANCE_NAME" \
  --project="$PROJECT" \
  --zone="$ZONE" \
  --machine-type="$MACHINE_TYPE" \
  --accelerator="type=nvidia-tesla-a100,count=8" \
  --image-family="$IMAGE_FAMILY" \
  --image-project="$IMAGE_PROJECT" \
  --boot-disk-size="$DISK_SIZE" \
  --boot-disk-type="$DISK_TYPE" \
  --boot-disk-device-name="${INSTANCE_NAME}-disk" \
  --maintenance-policy=TERMINATE \
  --restart-on-failure \
  --metadata="install-nvidia-driver=True" \
  --metadata-from-file="startup-script=$STARTUP_SCRIPT_FILE" \
  --scopes="cloud-platform" \
  --format="value(name)"

if [[ "$DRY_RUN" == "true" ]]; then
  log "[DRY-RUN] Skipping IP lookup and SSH command output"
  exit 0
fi

# ── Wait for instance to be ready ─────────────────────────────────────────────
# Poll up to 30 times, 10 s apart (5 min total). A failing describe call is
# recorded as UNKNOWN and retried rather than aborting the script.
log "Waiting for instance to reach RUNNING state ..."
STATUS="UNKNOWN"
for ((attempt = 1; attempt <= 30; attempt++)); do
  STATUS="$(gcloud compute instances describe "$INSTANCE_NAME" \
    --project="$PROJECT" --zone="$ZONE" \
    --format="value(status)" 2>/dev/null || echo "UNKNOWN")"
  if [[ "$STATUS" == "RUNNING" ]]; then
    break
  fi
  sleep 10
done
if [[ "$STATUS" != "RUNNING" ]]; then
  log "ERROR: Instance did not reach RUNNING within 5 min (last status: $STATUS)" >&2
  exit 1
fi

# ── Print connection info ─────────────────────────────────────────────────────
INSTANCE_IP="$(gcloud compute instances describe "$INSTANCE_NAME" \
  --project="$PROJECT" --zone="$ZONE" \
  --format="value(networkInterfaces[0].accessConfigs[0].natIP)")"
if [[ -z "$INSTANCE_IP" ]]; then
  log "WARNING: no external IP reported for $INSTANCE_NAME; the IP-based commands below will not work" >&2
fi

# The gcloud account is an e-mail address, which is not a valid SSH login
# ("ssh name@domain@IP"). Without OS Login, `gcloud compute ssh` registers keys
# under the local login name, so that is what the hint uses.
# NOTE(review): with OS Login enabled the remote username differs — confirm.
SSH_USER="${USER:-$(id -un)}"

log "Instance ready:"
log " Name : $INSTANCE_NAME"
log " Zone : $ZONE"
log " IP : $INSTANCE_IP"
log " SSH : gcloud compute ssh $INSTANCE_NAME --project=$PROJECT --zone=$ZONE"
log " SSH IP : ssh ${SSH_USER}@$INSTANCE_IP"
log ""
log "Startup script is running in background (/var/log/ruview-startup.log)."
log "Wait 3-5 min for conda/deps before running run_training.sh."
log ""
log "Next step:"
log " bash scripts/gcp/run_training.sh $INSTANCE_IP "