wifi-densepose/scripts/gcp/run_marl_train.sh

142 lines
6.4 KiB
Bash
Executable File

#!/usr/bin/env bash
# Run ruview-swarm MARL training on a GCP L4 instance (ADR-148 M4).
# Usage: bash scripts/gcp/run_marl_train.sh <INSTANCE_IP> [EPISODES] [DRONES] [PROFILE]
#
# Rsyncs the v2/ Rust workspace to the instance, then runs the Candle PPO
# MARL trainer:
# cargo run --release -p ruview-swarm --features train,cuda --bin train_marl
# Downloads the trained checkpoints back on completion.
#
# NOTE: the `--bin train_marl` target is added by the companion MARL trainer
# work (Candle PPO trainer). This script calls it; it is expected to
# exist once that work lands.
# Strict mode: abort on errors, on unset variables and on failures anywhere in
# a pipeline.
set -euo pipefail
# ── Usage ─────────────────────────────────────────────────────────────────────
# INSTANCE_IP is the only mandatory argument. When it is missing, print the
# help text and exit 1. The whole message is sent to stderr (not just its first
# line) so a caller capturing stdout never receives a fragment of usage text.
if [[ $# -lt 1 ]]; then
  {
    echo "Usage: $0 <INSTANCE_IP> [EPISODES] [DRONES] [PROFILE]"
    echo ""
    echo " INSTANCE_IP External IP of the GCP L4 MARL training instance"
    echo " EPISODES Training episodes (default: 5000)"
    echo " DRONES Swarm size (default: 4)"
    echo " PROFILE Mission profile (default: sar)"
    echo ""
    echo "Example:"
    echo " $0 34.123.45.67"
    echo " $0 34.123.45.67 10000 6 sar"
  } >&2
  exit 1
fi
# ── Configuration ─────────────────────────────────────────────────────────────
# Positional arguments; everything except the IP has a default.
INSTANCE_IP="$1"
EPISODES="${2:-5000}"
DRONES="${3:-4}"
PROFILE="${4:-sar}"
# SSH login name: honour an exported GCP_USER, otherwise fall back to the local
# part (text before "@") of the active gcloud account. The lookup is allowed to
# fail (`|| true`) so that a missing or unconfigured gcloud does not make the
# script die silently under `set -e` + `pipefail`; the empty result is reported
# explicitly right below instead.
if [[ -z "${GCP_USER:-}" ]]; then
  GCP_USER="$(gcloud config get-value account 2>/dev/null | cut -d@ -f1 || true)"
fi
if [[ -z "$GCP_USER" ]]; then
  echo "ERROR: Cannot determine the SSH user name." >&2
  echo " Set GCP_USER, or configure an account with: gcloud auth login" >&2
  exit 1
fi
REMOTE="${GCP_USER}@${INSTANCE_IP}"
# The v2 workspace sits two directories above this script's own directory.
LOCAL_V2_DIR="$(cd "$(dirname "$0")/../.." && pwd)/v2"
# Local destination for downloaded checkpoints (relative to the caller's cwd).
OUTPUT_DIR="./out/gcp-checkpoints/marl"
# Remote paths. The "~" is intentionally kept literal here: these values are
# only ever handed to the remote side (ssh command / rsync remote path), which
# is where the tilde gets expanded.
# shellcheck disable=SC2088
REMOTE_CRATE="~/ruview-swarm"
# shellcheck disable=SC2088
REMOTE_CHECKPOINTS="~/ruview-swarm/marl-checkpoints"
# Print one progress line on stdout, prefixed with the script tag.
# Arguments: $* - message words; they are joined by the first character of IFS
#                 (a space unless IFS was changed).
log() {
  printf '%s\n' "[run_marl_train] $*"
}
# ── Validation ────────────────────────────────────────────────────────────────
# The workspace that will be rsynced must exist locally.
if [[ ! -d "$LOCAL_V2_DIR" ]]; then
  echo "ERROR: v2 workspace not found: $LOCAL_V2_DIR" >&2
  exit 1
fi
# EPISODES, DRONES and PROFILE are later pasted into the command line that the
# remote shell executes, so reject anything that is not a plain number / plain
# identifier up front instead of letting it break (or alter) the remote script.
if [[ ! "$EPISODES" =~ ^[0-9]+$ ]]; then
  echo "ERROR: EPISODES must be a non-negative integer, got: $EPISODES" >&2
  exit 1
fi
if [[ ! "$DRONES" =~ ^[0-9]+$ ]]; then
  echo "ERROR: DRONES must be a non-negative integer, got: $DRONES" >&2
  exit 1
fi
if [[ ! "$PROFILE" =~ ^[A-Za-z0-9._-]+$ ]]; then
  echo "ERROR: PROFILE may only contain letters, digits, '.', '_' and '-', got: $PROFILE" >&2
  exit 1
fi
log "Config: $EPISODES episodes, $DRONES drones, profile=$PROFILE"
# ── SSH connectivity check ────────────────────────────────────────────────────
# Options shared by the ssh and rsync calls in this script. Kept as one flat
# string: it is word-split when passed to ssh directly and embedded verbatim in
# rsync's `-e "ssh ..."` argument.
SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=15 -o BatchMode=yes"
log "Checking SSH connectivity to $REMOTE ..."
# Probe with a trivial remote command; all probe output is discarded and only
# the exit status matters. On failure, explain and stop.
# shellcheck disable=SC2086  # SSH_OPTS must be word-split
ssh $SSH_OPTS "$REMOTE" "echo ok" &>/dev/null || {
  echo "ERROR: Cannot SSH to $REMOTE" >&2
  echo " Ensure the instance is running and your SSH key is authorized." >&2
  echo " Try: gcloud compute ssh <INSTANCE_NAME> --project=cognitum-20260110" >&2
  exit 1
}
log "SSH connection OK"
# ── Startup script completion check ───────────────────────────────────────────
log "Checking that startup script completed ..."
# Count the "setup complete" markers in the instance's startup log.
# `grep -c` already prints "0" when nothing matches (and then exits 1), so a
# `|| echo 0` fallback would emit a second "0" and turn the value into "0\n0",
# which the numeric test below cannot parse. `|| true` only neutralises grep's
# exit status; a missing log simply yields empty output.
# shellcheck disable=SC2086  # SSH_OPTS must be word-split
STARTUP_READY=$(ssh $SSH_OPTS "$REMOTE" \
  "grep -c 'setup complete' /var/log/ruview-marl-startup.log 2>/dev/null || true")
# Empty (log missing/unreadable) or otherwise non-numeric output counts as
# "marker not found".
if [[ ! "$STARTUP_READY" =~ ^[0-9]+$ ]]; then
  STARTUP_READY=0
fi
# Not fatal: warn and carry on, the toolchain may just need more time.
if [[ "$STARTUP_READY" -lt 1 ]]; then
  log "WARNING: Startup script may not have finished yet."
  log " Check /var/log/ruview-marl-startup.log on the instance."
  log " Continuing anyway — the Rust toolchain may need more time."
fi
# ── Rsync the v2 Rust workspace ───────────────────────────────────────────────
# Exclude build artifacts and VCS — the instance rebuilds from source.
log "Rsyncing v2 workspace → $REMOTE:$REMOTE_CRATE ..."
# Create the destination directory first. REMOTE_CRATE is left unquoted inside
# the remote command so the remote shell expands its leading "~".
# shellcheck disable=SC2086  # SSH_OPTS must be word-split
ssh $SSH_OPTS "$REMOTE" "mkdir -p $REMOTE_CRATE"
# rsync options: archive + verbose + compression, progress/statistics output,
# the shared ssh transport, and the paths that must not be uploaded.
upload_args=(
  -avz --progress --stats
  -e "ssh $SSH_OPTS"
  --exclude="target/"
  --exclude=".git/"
  --exclude="marl-checkpoints/"
  --exclude="*.log"
)
# Trailing slashes: copy the *contents* of v2/ into the remote crate directory.
rsync "${upload_args[@]}" "$LOCAL_V2_DIR/" "${REMOTE}:${REMOTE_CRATE}/"
log "Workspace sync complete"
# ── Run MARL training ─────────────────────────────────────────────────────────
log "=== MARL training ($EPISODES episodes, $DRONES drones, $PROFILE) ==="
# The heredoc below is expanded locally and then executed by bash on the
# instance, so the user-supplied values end up inside a remote command line.
# Shell-quote them (%q) so spaces or shell metacharacters are passed to the
# trainer as literal argument text instead of being re-interpreted remotely.
printf -v EPISODES_Q '%q' "$EPISODES"
printf -v DRONES_Q '%q' "$DRONES"
printf -v PROFILE_Q '%q' "$PROFILE"
TRAIN_START=$(date +%s)
# In the heredoc, "\$" defers expansion to the remote shell and "\\" becomes a
# single backslash, i.e. a remote line continuation. A failure of the remote
# script makes ssh return non-zero, which aborts this script (set -e).
# shellcheck disable=SC2086  # SSH_OPTS must be word-split
ssh $SSH_OPTS "$REMOTE" bash << REMOTE_TRAIN
set -euo pipefail
# shellcheck source=/dev/null
source "\$HOME/.cargo/env"
cd "\$HOME/ruview-swarm"
mkdir -p ./marl-checkpoints
echo "[train] \$(date): starting Candle PPO MARL trainer"
# --bin train_marl is provided by the companion MARL trainer work.
cargo run --release -p ruview-swarm --features train,cuda --bin train_marl -- \\
--episodes ${EPISODES_Q} --drones ${DRONES_Q} --profile ${PROFILE_Q} \\
--checkpoint-dir ./marl-checkpoints
echo "[train] \$(date): MARL training complete"
ls -lh ./marl-checkpoints/
REMOTE_TRAIN
TRAIN_END=$(date +%s)
# Whole minutes of wall time (integer division truncates).
TRAIN_MIN=$(( (TRAIN_END - TRAIN_START) / 60 ))
log "Training complete in ${TRAIN_MIN} min"
# ── Download checkpoints ──────────────────────────────────────────────────────
log "Downloading checkpoints → $OUTPUT_DIR ..."
mkdir -p "$OUTPUT_DIR"
# Pull the contents of the remote checkpoint directory over the same ssh
# transport that was used for the upload.
download_args=(-avz --progress --stats -e "ssh $SSH_OPTS")
rsync "${download_args[@]}" "${REMOTE}:${REMOTE_CHECKPOINTS}/" "$OUTPUT_DIR/"
# ── Verify download ───────────────────────────────────────────────────────────
# Report how many regular files arrived and the directory size in MB.
LOCAL_SIZE_MB=$(du -sm "$OUTPUT_DIR" 2>/dev/null | awk '{print $1}')
LOCAL_FILE_COUNT=$(find "$OUTPUT_DIR" -type f 2>/dev/null | wc -l)
log "Downloaded $LOCAL_FILE_COUNT files, ~${LOCAL_SIZE_MB} MB to $OUTPUT_DIR"
# An empty result is only warned about (on stderr); the script keeps going.
[[ "$LOCAL_FILE_COUNT" -ge 1 ]] ||
  echo "WARNING: No checkpoints were downloaded from $REMOTE" >&2
# ── Summary ───────────────────────────────────────────────────────────────────
# Wall time in hours (two decimals) and the cost estimate derived from it at
# the 1.40 USD/hr rate quoted in the summary line below.
TRAIN_HR=$(awk -v mins="$TRAIN_MIN" 'BEGIN {printf "%.2f", mins / 60}')
COST=$(awk -v hours="$TRAIN_HR" 'BEGIN {printf "%.2f", 1.40 * hours}')
# Final report, followed by a pointer to the teardown script; one log call per
# line.
summary_lines=(
  ""
  "=== MARL training complete ==="
  " Episodes : $EPISODES (drones=$DRONES, profile=$PROFILE)"
  " Wall time : ${TRAIN_MIN} min (${TRAIN_HR} hr)"
  " Est. compute cost: ~\$$COST (at \$1.40/hr on-demand, g2-standard-16)"
  " Checkpoints in : $OUTPUT_DIR"
  ""
  "Next step (teardown):"
  " bash scripts/gcp/teardown.sh <INSTANCE_NAME> --skip-download"
)
for summary_line in "${summary_lines[@]}"; do
  log "$summary_line"
done