wifi-densepose/scripts/gcp/provision_marl.sh

200 lines
8.7 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# Provision GCP L4 instance for ruview-swarm MARL training (ADR-148 M4).
#
# RIGHT-SIZING RATIONALE:
# The MARL policy is a 64→128→64 MLP (~12K params). GPU matmul is NOT the
# bottleneck — environment-rollout throughput (stepping the swarm sim) is.
# An L4 + 16 vCPU (g2-standard-16, ~$1.40/hr) beats an 8× A100 box
# (a2-highgpu-8g, ~$29/hr) for this workload at 1/20th the cost.
# Reserve the A100×8 box (provision_training.sh) for OccWorld world-model
# training, which actually saturates the GPUs.
#
# Usage: bash scripts/gcp/provision_marl.sh [--dry-run]
#
# Provisions a g2-standard-16 (1× L4 24GB, 16 vCPU) in us-central1-a
# (fallback us-east1-b).
# GCP project: cognitum-20260110
# Auth: ruv@ruv.net (gcloud must already be authenticated)
set -euo pipefail
# ── Constants ──────────────────────────────────────────────────────────────────
PROJECT="cognitum-20260110"
INSTANCE_NAME="ruview-marl-$(date +%Y%m%d)"
MACHINE_TYPE="g2-standard-16"
PRIMARY_ZONE="us-central1-a"
FALLBACK_ZONE="us-east1-b"
IMAGE_FAMILY="pytorch-latest-gpu"
IMAGE_PROJECT="deeplearning-platform-release"
DISK_SIZE="200GB"
DISK_TYPE="pd-ssd"
# Cost reference: g2-standard-16 ~$1.40/hr on-demand (us-central1, 2026).
# Compare a2-highgpu-8g at ~$29.39/hr — a ~20× cost reduction. MARL is
# rollout-bound (CPU-stepped swarm sim), not matmul-bound, so the 16 vCPUs
# matter more than peak GPU FLOPs for this 12K-param policy.
COST_PER_HR="1.40"
A100_BOX_RATE="29.39"
# Rough estimate: 5000 episodes × 4 drones, rollout-bound on 16 vCPU ≈ 24 hr.
RUN_HOURS="3"
# ── Flags ─────────────────────────────────────────────────────────────────────
DRY_RUN=false
for arg in "$@"; do
case "$arg" in
--dry-run) DRY_RUN=true ;;
-h|--help)
echo "Usage: $0 [--dry-run]"
echo " --dry-run Echo gcloud commands without executing them"
exit 0
;;
*)
echo "Unknown argument: $arg" >&2
echo "Usage: $0 [--dry-run]" >&2
exit 1
;;
esac
done
# ── Helpers ───────────────────────────────────────────────────────────────────
run() {
if [[ "$DRY_RUN" == "true" ]]; then
echo "[DRY-RUN] $*"
else
"$@"
fi
}
log() { echo "[provision_marl] $*"; }
# ── Startup script (embedded heredoc) ─────────────────────────────────────────
# Written to a temp file so gcloud can reference it via --metadata-from-file.
# For MARL the heavy lifting is a Rust/Candle binary, so we install the Rust
# toolchain rather than a conda Python env.
STARTUP_SCRIPT_FILE="$(mktemp /tmp/startup_marl_XXXXXX.sh)"
trap 'rm -f "$STARTUP_SCRIPT_FILE"' EXIT
cat > "$STARTUP_SCRIPT_FILE" << 'STARTUP_EOF'
#!/usr/bin/env bash
set -euo pipefail
LOGFILE="/var/log/ruview-marl-startup.log"
exec > >(tee -a "$LOGFILE") 2>&1
echo "[startup] $(date): beginning MARL environment setup"
# ── 1. System packages ────────────────────────────────────────────────────────
apt-get update -qq
apt-get install -y -qq git rsync wget curl htop nvtop screen tmux \
build-essential pkg-config libssl-dev
# ── 2. Rust toolchain (for cargo build of ruview-swarm) ────────────────────────
TARGET_USER="$(logname 2>/dev/null || echo user)"
TARGET_HOME="$(getent passwd "$TARGET_USER" | cut -d: -f6)"
if [[ ! -d "$TARGET_HOME/.cargo" ]]; then
echo "[startup] Installing Rust toolchain for $TARGET_USER ..."
sudo -u "$TARGET_USER" bash -c \
'curl --proto "=https" --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y'
fi
# ── 3. CUDA sanity (deeplearning image ships CUDA 12 + driver) ─────────────────
echo "[startup] CUDA check:"
nvidia-smi || echo "[startup] WARNING: nvidia-smi not available yet"
# ── 4. Checkpoint dirs + repo sync placeholder ─────────────────────────────────
# Actual crate sync is done by run_marl_train.sh via rsync before the build.
sudo -u "$TARGET_USER" mkdir -p "$TARGET_HOME/ruview-swarm" \
"$TARGET_HOME/marl-checkpoints"
echo "[startup] $(date): setup complete — instance ready for MARL training"
STARTUP_EOF
# ── L4 availability check (with zone fallback) ─────────────────────────────────
ZONE="$PRIMARY_ZONE"
if [[ "$DRY_RUN" == "false" ]]; then
log "Checking L4 availability in $PRIMARY_ZONE ..."
AVAIL=$(gcloud compute accelerator-types list \
--project="$PROJECT" \
--filter="name=nvidia-l4 AND zone=$PRIMARY_ZONE" \
--format="value(name)" 2>/dev/null | head -1)
if [[ -z "$AVAIL" ]]; then
log "L4 not available in $PRIMARY_ZONE — falling back to $FALLBACK_ZONE"
ZONE="$FALLBACK_ZONE"
else
log "L4 confirmed available in $PRIMARY_ZONE"
fi
else
log "[DRY-RUN] Would check L4 availability in $PRIMARY_ZONE (fallback: $FALLBACK_ZONE)"
fi
# ── Cost estimate ──────────────────────────────────────────────────────────────
TOTAL_COST=$(awk "BEGIN {printf \"%.2f\", $COST_PER_HR * $RUN_HOURS}")
A100_COST=$(awk "BEGIN {printf \"%.2f\", $A100_BOX_RATE * $RUN_HOURS}")
SAVINGS=$(awk "BEGIN {printf \"%.0f\", $A100_BOX_RATE / $COST_PER_HR}")
log "Cost estimate:"
log " Machine type : $MACHINE_TYPE (1× L4 24GB, 16 vCPU)"
log " Rate : ~\$$COST_PER_HR/hr (on-demand, $ZONE)"
log " Est. duration: ~${RUN_HOURS} hr (5000 episodes, rollout-bound)"
log " Est. total : ~\$$TOTAL_COST"
log " vs A100×8 : ~\$$A100_COST for the same wall time (~${SAVINGS}× more expensive)"
log " Why L4 : MARL policy is a 12K-param MLP — bottleneck is CPU env rollout, not GPU matmul"
log " Tip: Use --preemptible to cut cost further at the risk of interruptions"
# ── Provision instance ────────────────────────────────────────────────────────
log "Provisioning $INSTANCE_NAME in $ZONE ..."
run gcloud compute instances create "$INSTANCE_NAME" \
--project="$PROJECT" \
--zone="$ZONE" \
--machine-type="$MACHINE_TYPE" \
--accelerator="type=nvidia-l4,count=1" \
--image-family="$IMAGE_FAMILY" \
--image-project="$IMAGE_PROJECT" \
--boot-disk-size="$DISK_SIZE" \
--boot-disk-type="$DISK_TYPE" \
--boot-disk-device-name="${INSTANCE_NAME}-disk" \
--maintenance-policy=TERMINATE \
--restart-on-failure \
--metadata-from-file="startup-script=$STARTUP_SCRIPT_FILE" \
--scopes="cloud-platform" \
--format="value(name)"
if [[ "$DRY_RUN" == "true" ]]; then
log "[DRY-RUN] Skipping IP lookup and SSH command output"
exit 0
fi
# ── Wait for instance to be ready ─────────────────────────────────────────────
log "Waiting for instance to reach RUNNING state ..."
for i in $(seq 1 30); do
STATUS=$(gcloud compute instances describe "$INSTANCE_NAME" \
--project="$PROJECT" --zone="$ZONE" \
--format="value(status)" 2>/dev/null || echo "UNKNOWN")
if [[ "$STATUS" == "RUNNING" ]]; then
break
fi
sleep 10
if [[ $i -eq 30 ]]; then
log "ERROR: Instance did not reach RUNNING within 5 min" >&2
exit 1
fi
done
# ── Print connection info ─────────────────────────────────────────────────────
INSTANCE_IP=$(gcloud compute instances describe "$INSTANCE_NAME" \
--project="$PROJECT" --zone="$ZONE" \
--format="value(networkInterfaces[0].accessConfigs[0].natIP)")
log "Instance ready:"
log " Name : $INSTANCE_NAME"
log " Zone : $ZONE"
log " IP : $INSTANCE_IP"
log " SSH : gcloud compute ssh $INSTANCE_NAME --project=$PROJECT --zone=$ZONE"
log " SSH IP : ssh $(gcloud config get-value account 2>/dev/null)@$INSTANCE_IP"
log ""
log "Startup script is running in background (/var/log/ruview-marl-startup.log)."
log "Wait 2-3 min for the Rust toolchain install before running run_marl_train.sh."
log ""
log "Next step:"
log " bash scripts/gcp/run_marl_train.sh $INSTANCE_IP"
log "Teardown when done:"
log " bash scripts/gcp/teardown.sh $INSTANCE_NAME"