#!/usr/bin/env bash
# Provision GCP L4 instance for ruview-swarm MARL training (ADR-148 M4).
#
# RIGHT-SIZING RATIONALE:
#   The MARL policy is a 64→128→64 MLP (~12K params). GPU matmul is NOT the
#   bottleneck — environment-rollout throughput (stepping the swarm sim) is.
#   An L4 + 16 vCPU (g2-standard-16, ~$1.40/hr) beats an 8× A100 box
#   (a2-highgpu-8g, ~$29/hr) for this workload at 1/20th the cost.
#   Reserve the A100×8 box (provision_training.sh) for OccWorld world-model
#   training, which actually saturates the GPUs.
#
# Usage: bash scripts/gcp/provision_marl.sh [--dry-run]
#
# Provisions a g2-standard-16 (1× L4 24GB, 16 vCPU) in us-central1-a
# (fallback us-east1-b).
# GCP project: cognitum-20260110
# Auth: ruv@ruv.net (gcloud must already be authenticated)

# Strict mode lives in `set` (not on the shebang line) so it also applies when
# the script is started as `bash provision_marl.sh`.
set -euo pipefail

# ── Constants ──────────────────────────────────────────────────────────────────
# All of these are fixed for the lifetime of the run, hence `readonly`.
readonly PROJECT="cognitum-20260110"

# One instance name per calendar day. Assignment is kept separate from
# `readonly` so a failing `date` is not masked by the builtin's exit status.
INSTANCE_NAME="ruview-marl-$(date +%Y%m%d)"
readonly INSTANCE_NAME

readonly MACHINE_TYPE="g2-standard-16"
readonly PRIMARY_ZONE="us-central1-a"
readonly FALLBACK_ZONE="us-east1-b"
readonly IMAGE_FAMILY="pytorch-latest-gpu"
readonly IMAGE_PROJECT="deeplearning-platform-release"
readonly DISK_SIZE="200GB"
readonly DISK_TYPE="pd-ssd"

# Cost reference: g2-standard-16 ~$1.40/hr on-demand (us-central1, 2026).
# Compare a2-highgpu-8g at ~$29.39/hr — a ~20× cost reduction. MARL is
# rollout-bound (CPU-stepped swarm sim), not matmul-bound, so the 16 vCPUs
# matter more than peak GPU FLOPs for this 12K-param policy.
readonly COST_PER_HR="1.40"
readonly A100_BOX_RATE="29.39"

# Rough estimate: 5000 episodes × 4 drones, rollout-bound on 16 vCPU ≈ 2–4 hr.
RUN_HOURS="3"

# ── Flags ─────────────────────────────────────────────────────────────────────
# Only --dry-run and -h/--help are recognised; anything else is a usage error.
DRY_RUN=false
for arg in "$@"; do
  case "$arg" in
    --dry-run)
      DRY_RUN=true
      ;;
    -h|--help)
      echo "Usage: $0 [--dry-run]"
      echo " --dry-run Echo gcloud commands without executing them"
      exit 0
      ;;
    *)
      echo "Unknown argument: $arg" >&2
      echo "Usage: $0 [--dry-run]" >&2
      exit 1
      ;;
  esac
done

# ── Helpers ───────────────────────────────────────────────────────────────────

#######################################
# Execute a command, or only print it when dry-run mode is on.
# Globals:   DRY_RUN (read)
# Arguments: the command and its arguments
# Outputs:   in dry-run mode, "[DRY-RUN] <command line>" on stdout
# Returns:   0 in dry-run mode, otherwise the command's exit status
#######################################
run() {
  if [[ "$DRY_RUN" == "true" ]]; then
    echo "[DRY-RUN] $*"
  else
    "$@"
  fi
}

#######################################
# Print a message on stdout prefixed with the script tag.
# Arguments: message words
#######################################
log() { echo "[provision_marl] $*"; }

# ── Startup script (embedded heredoc) ─────────────────────────────────────────
# Written to a temp file so gcloud can reference it via --metadata-from-file.
# For MARL the heavy lifting is a Rust/Candle binary, so we install the Rust
# toolchain rather than a conda Python env.
#
# The template ends in the X run: BSD/macOS mktemp only substitutes trailing
# X's, so a ".sh" suffix after them would yield a fixed, predictable filename.
STARTUP_SCRIPT_FILE="$(mktemp "${TMPDIR:-/tmp}/startup_marl_XXXXXX")"
trap 'rm -f -- "$STARTUP_SCRIPT_FILE"' EXIT

# Quoted delimiter: nothing below is expanded locally; it runs on the VM.
cat > "$STARTUP_SCRIPT_FILE" << 'STARTUP_EOF'
#!/usr/bin/env bash
set -euo pipefail

LOGFILE="/var/log/ruview-marl-startup.log"
exec > >(tee -a "$LOGFILE") 2>&1

echo "[startup] $(date): beginning MARL environment setup"

# ── 1. System packages ────────────────────────────────────────────────────────
apt-get update -qq
apt-get install -y -qq git rsync wget curl htop nvtop screen tmux \
  build-essential pkg-config libssl-dev

# ── 2. Rust toolchain (for cargo build of ruview-swarm) ────────────────────────
TARGET_USER="$(logname 2>/dev/null || echo user)"
# getent exits non-zero for an unknown user; with pipefail that would abort the
# script at the assignment without any message. Tolerate it here and report
# the problem explicitly instead.
TARGET_HOME="$(getent passwd "$TARGET_USER" | cut -d: -f6 || true)"
if [[ -z "$TARGET_HOME" ]]; then
  echo "[startup] ERROR: no passwd entry / home directory for user '$TARGET_USER'" >&2
  exit 1
fi

if [[ ! -d "$TARGET_HOME/.cargo" ]]; then
  echo "[startup] Installing Rust toolchain for $TARGET_USER ..."
  # -H sets HOME to the target user's home so rustup installs there
  # irrespective of the image's sudoers defaults.
  sudo -H -u "$TARGET_USER" bash -c \
    'curl --proto "=https" --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y'
fi

# ── 3. CUDA sanity (deeplearning image ships CUDA 12 + driver) ─────────────────
echo "[startup] CUDA check:"
nvidia-smi || echo "[startup] WARNING: nvidia-smi not available yet"

# ── 4. Checkpoint dirs + repo sync placeholder ─────────────────────────────────
# Actual crate sync is done by run_marl_train.sh via rsync before the build.
sudo -u "$TARGET_USER" mkdir -p "$TARGET_HOME/ruview-swarm" \
  "$TARGET_HOME/marl-checkpoints"

echo "[startup] $(date): setup complete — instance ready for MARL training"
STARTUP_EOF

# ── L4 availability check (with zone fallback) ─────────────────────────────────
ZONE="$PRIMARY_ZONE"
if [[ "$DRY_RUN" == "false" ]]; then
  log "Checking L4 availability in $PRIMARY_ZONE ..."
  # A failing probe must select the fallback zone, not kill the script: under
  # `set -e` + pipefail the former `gcloud … | head -1` aborted silently
  # (stderr is discarded). Capture the raw output and keep only line one.
  AVAIL="$(gcloud compute accelerator-types list \
    --project="$PROJECT" \
    --filter="name=nvidia-l4 AND zone=$PRIMARY_ZONE" \
    --format="value(name)" 2>/dev/null)" || AVAIL=""
  AVAIL="${AVAIL%%$'\n'*}"

  if [[ -z "$AVAIL" ]]; then
    log "L4 not available in $PRIMARY_ZONE — falling back to $FALLBACK_ZONE"
    ZONE="$FALLBACK_ZONE"
  else
    log "L4 confirmed available in $PRIMARY_ZONE"
  fi
else
  log "[DRY-RUN] Would check L4 availability in $PRIMARY_ZONE (fallback: $FALLBACK_ZONE)"
fi

# ── Cost estimate ──────────────────────────────────────────────────────────────
# Values are handed to awk with -v rather than spliced into the program text.
TOTAL_COST="$(awk -v rate="$COST_PER_HR" -v hours="$RUN_HOURS" \
  'BEGIN { printf "%.2f", rate * hours }')"
A100_COST="$(awk -v rate="$A100_BOX_RATE" -v hours="$RUN_HOURS" \
  'BEGIN { printf "%.2f", rate * hours }')"
# Despite the name, this is the A100-box / L4 hourly price ratio.
SAVINGS="$(awk -v big="$A100_BOX_RATE" -v small="$COST_PER_HR" \
  'BEGIN { printf "%.0f", big / small }')"

log "Cost estimate:"
log " Machine type : $MACHINE_TYPE (1× L4 24GB, 16 vCPU)"
log " Rate : ~\$$COST_PER_HR/hr (on-demand, $ZONE)"
log " Est. duration: ~${RUN_HOURS} hr (5000 episodes, rollout-bound)"
log " Est. total : ~\$$TOTAL_COST"
log " vs A100×8 : ~\$$A100_COST for the same wall time (~${SAVINGS}× more expensive)"
log " Why L4 : MARL policy is a 12K-param MLP — bottleneck is CPU env rollout, not GPU matmul"
log " Tip: Use --preemptible to cut cost further at the risk of interruptions"

# ── Provision instance ────────────────────────────────────────────────────────
log "Provisioning $INSTANCE_NAME in $ZONE ..."
# Create the VM (or just echo the command in dry-run mode). TERMINATE is the
# maintenance policy because the instance has a GPU attached.
run gcloud compute instances create "$INSTANCE_NAME" \
  --project="$PROJECT" \
  --zone="$ZONE" \
  --machine-type="$MACHINE_TYPE" \
  --accelerator="type=nvidia-l4,count=1" \
  --image-family="$IMAGE_FAMILY" \
  --image-project="$IMAGE_PROJECT" \
  --boot-disk-size="$DISK_SIZE" \
  --boot-disk-type="$DISK_TYPE" \
  --boot-disk-device-name="${INSTANCE_NAME}-disk" \
  --maintenance-policy=TERMINATE \
  --restart-on-failure \
  --metadata-from-file="startup-script=$STARTUP_SCRIPT_FILE" \
  --scopes="cloud-platform" \
  --format="value(name)"

# Nothing was created in dry-run mode, so there is nothing to poll or report.
if [[ "$DRY_RUN" == "true" ]]; then
  log "[DRY-RUN] Skipping IP lookup and SSH command output"
  exit 0
fi

# ── Wait for instance to be ready ─────────────────────────────────────────────
# Poll every 10 s, at most 30 times. The timeout is reported right after the
# last failed poll rather than after one more pointless sleep.
log "Waiting for instance to reach RUNNING state ..."
STATUS="UNKNOWN"
for (( attempt = 1; attempt <= 30; attempt++ )); do
  # A failing describe is treated as "not ready yet", not as a fatal error.
  STATUS="$(gcloud compute instances describe "$INSTANCE_NAME" \
    --project="$PROJECT" --zone="$ZONE" \
    --format="value(status)" 2>/dev/null || echo "UNKNOWN")"
  if [[ "$STATUS" == "RUNNING" ]]; then
    break
  fi
  if (( attempt == 30 )); then
    log "ERROR: Instance did not reach RUNNING within 5 min" >&2
    exit 1
  fi
  sleep 10
done

# ── Print connection info ─────────────────────────────────────────────────────
INSTANCE_IP="$(gcloud compute instances describe "$INSTANCE_NAME" \
  --project="$PROJECT" --zone="$ZONE" \
  --format="value(networkInterfaces[0].accessConfigs[0].natIP)")"

# The active gcloud account is an e-mail address; pasting it verbatim in front
# of "@<ip>" produced "ssh name@domain@<ip>". Keep only the local part.
# NOTE(review): the actual login name on the VM depends on how SSH keys are
# provisioned (OS Login vs. metadata keys) — confirm; the `gcloud compute ssh`
# line above is the authoritative way in.
GCLOUD_ACCOUNT="$(gcloud config get-value account 2>/dev/null || true)"
SSH_USER="${GCLOUD_ACCOUNT%%@*}"

log "Instance ready:"
log " Name : $INSTANCE_NAME"
log " Zone : $ZONE"
log " IP : $INSTANCE_IP"
log " SSH : gcloud compute ssh $INSTANCE_NAME --project=$PROJECT --zone=$ZONE"
log " SSH IP : ssh ${SSH_USER}@$INSTANCE_IP"
log ""
log "Startup script is running in background (/var/log/ruview-marl-startup.log)."
log "Wait 2-3 min for the Rust toolchain install before running run_marl_train.sh."
log ""
log "Next step:"
log " bash scripts/gcp/run_marl_train.sh $INSTANCE_IP"
log "Teardown when done:"
log " bash scripts/gcp/teardown.sh $INSTANCE_NAME"