#!/usr/bin/env bash
# Run Cosmos-Transfer2.5-2B evaluation on GCP A100 80GB instance
# Usage: bash scripts/gcp/cosmos_eval.sh <INSTANCE_IP> [--snapshot-dir <DIR>] [--no-server]
#
# Flow:
#   1. Start OccWorld sensing server on remote (generates control tensors)
#   2. Rsync RuView scripts + any local control tensors to instance
#   3. Run Cosmos-Transfer2.5 inference with depth+seg control signals
#   4. Download generated video and decoded trajectory priors
#   5. Benchmark inference time (A100 actual vs RTX 5080 estimate)

# Strict mode: abort on unhandled errors, unset variables and failed pipeline
# stages.
set -euo pipefail

# ── Usage ─────────────────────────────────────────────────────────────────────
# The instance IP is mandatory. The whole usage text is sent to stderr (it is
# only printed on a failed invocation) and the script exits with status 1.
if [[ $# -lt 1 ]]; then
  {
    echo "Usage: $0 <INSTANCE_IP> [--snapshot-dir <DIR>] [--no-server]"
    echo ""
    echo " INSTANCE_IP External IP of the cosmos-eval GCP instance"
    echo " --snapshot-dir Local snapshot dir to upload as control input"
    echo " (default: ./out/snapshots if it exists)"
    echo " --no-server Skip starting the OccWorld server on remote"
    echo ""
    echo "Example:"
    echo " $0 34.123.45.67 --snapshot-dir /tmp/snapshots"
  } >&2
  exit 1
fi

# ── Argument parsing ──────────────────────────────────────────────────────────
#######################################
# Parse the command line into the globals read by the rest of the script.
# Globals:
#   INSTANCE_IP   (written) first positional argument
#   SNAPSHOT_DIR  (written) value of --snapshot-dir, default ./out/snapshots
#   START_SERVER  (written) "true", or "false" when --no-server is given
# Arguments:
#   $1   - instance IP (or -h/--help)
#   $2.. - option flags
# Outputs:
#   usage line on stdout for -h/--help; error text on stderr for bad input
# Returns:
#   exits 0 after -h/--help; exits 1 on an unknown flag or a flag that is
#   missing its value
#######################################
parse_args() {
  INSTANCE_IP="${1:-}"
  SNAPSHOT_DIR="./out/snapshots"
  START_SERVER=true

  # Help requested in place of the IP: answer it instead of treating the flag
  # as a host name.
  case "$INSTANCE_IP" in
    -h|--help)
      echo "Usage: $0 <INSTANCE_IP> [--snapshot-dir <DIR>] [--no-server]"
      exit 0
      ;;
  esac

  if [[ $# -gt 0 ]]; then
    shift
  fi

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --snapshot-dir)
        # Without this guard a trailing "--snapshot-dir" expands an unset $2,
        # which under `set -u` dies with an opaque "unbound variable" error.
        if [[ $# -lt 2 ]]; then
          echo "ERROR: --snapshot-dir requires a <DIR> argument" >&2
          exit 1
        fi
        SNAPSHOT_DIR="$2"
        shift 2
        ;;
      --no-server)
        START_SERVER=false
        shift
        ;;
      -h|--help)
        echo "Usage: $0 <INSTANCE_IP> [--snapshot-dir <DIR>] [--no-server]"
        exit 0
        ;;
      *)
        echo "Unknown argument: $1" >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"

# ── Remote connection settings and paths ──────────────────────────────────────
# Remote login name: $GCP_USER when set, otherwise the part of the active
# gcloud account before the "@". The lookup is allowed to fail (`|| true`) so
# the explicit check below reports the problem, rather than `set -e` ending
# the script without any message.
if [[ -z "${GCP_USER:-}" ]]; then
  GCP_USER="$(gcloud config get-value account 2>/dev/null | cut -d@ -f1 || true)"
fi
if [[ -z "$GCP_USER" ]]; then
  echo "ERROR: Could not determine the remote user name." >&2
  echo "       Set GCP_USER, or configure gcloud: gcloud config set account <ACCOUNT>" >&2
  exit 1
fi

REMOTE="${GCP_USER}@${INSTANCE_IP}"

# Kept as one flat string on purpose: it is expanded unquoted in ssh calls and
# embedded verbatim in rsync's -e "ssh ..." argument.
SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=20 -o BatchMode=yes"

# <repo root>/scripts, resolved relative to this file (two directories up).
LOCAL_SCRIPTS_DIR="$(cd "$(dirname "$0")/../.." && pwd)/scripts"
OUTPUT_DIR="./out/cosmos-results"

# The leading "~" is quoted so it stays literal locally; these values are only
# used inside remote command strings and rsync remote paths.
REMOTE_RESULTS="~/cosmos-results"
REMOTE_SCRIPTS="~/ruview-scripts"
REMOTE_CONTROL="~/control-tensors"

# Directory on the instance that is checked for model weight files.
COSMOS_MODEL_DIR="/opt/models/cosmos-transfer2.5-2b"

# Print all arguments, joined by spaces, to stdout behind a "[cosmos_eval]" tag.
log() {
  printf '[cosmos_eval] %s\n' "$*"
}

# ── SSH connectivity check ────────────────────────────────────────────────────
# Run a trivial remote command first; if it fails, stop with a hint before any
# upload or remote work is attempted. ssh's own output is discarded.
log "Checking SSH connectivity to $REMOTE ..."
# SSH_OPTS is a flat option string and is word-split on purpose.
# shellcheck disable=SC2086
ssh $SSH_OPTS "$REMOTE" "echo ok" &>/dev/null || {
  echo "ERROR: Cannot SSH to $REMOTE" >&2
  echo " Ensure the instance is running: gcloud compute instances list --project=cognitum-20260110" >&2
  exit 1
}
log "SSH connection OK"

# ── Verify startup completed ──────────────────────────────────────────────────
# Count "setup complete" lines in the remote startup log; zero only produces a
# warning, the script carries on.
log "Checking Cosmos startup log ..."
# `grep -c` prints "0" AND exits non-zero when nothing matches, so a
# `|| echo 0` fallback would produce "0<newline>0" and break the numeric test
# below. The remote exit status is swallowed instead and anything that is not
# a plain number (e.g. empty output because the log file is missing) counts
# as 0.
# shellcheck disable=SC2086
COSMOS_READY=$(ssh $SSH_OPTS "$REMOTE" \
  "grep -c 'setup complete' /var/log/cosmos-startup.log 2>/dev/null || true")
if [[ ! "$COSMOS_READY" =~ ^[0-9]+$ ]]; then
  COSMOS_READY=0
fi
if (( COSMOS_READY < 1 )); then
  log "WARNING: Cosmos startup may not be complete."
  log " Check: ssh $REMOTE 'tail -20 /var/log/cosmos-startup.log'"
fi

# Verify model weights exist: count *.safetensors / *.bin entries under the
# model directory on the instance (0 when the directory is absent). Unlike the
# startup check, a zero count here is fatal.
# shellcheck disable=SC2086
MODEL_EXISTS=$(ssh $SSH_OPTS "$REMOTE" \
  "test -d $COSMOS_MODEL_DIR && find $COSMOS_MODEL_DIR -name '*.safetensors' -o -name '*.bin' 2>/dev/null | wc -l || echo 0")
# Strip any padding some `wc` implementations add, then insist on a number.
MODEL_EXISTS="${MODEL_EXISTS//[[:space:]]/}"
if [[ ! "$MODEL_EXISTS" =~ ^[0-9]+$ ]]; then
  MODEL_EXISTS=0
fi
if (( MODEL_EXISTS < 1 )); then
  echo "ERROR: Cosmos-Transfer2.5-2B weights not found at $COSMOS_MODEL_DIR on remote." >&2
  echo " The startup script may still be downloading (can take 30-60 min)." >&2
  echo " Monitor: ssh $REMOTE 'tail -f /var/log/cosmos-startup.log'" >&2
  exit 1
fi
log "Model weights verified ($MODEL_EXISTS files in $COSMOS_MODEL_DIR)"

# ── Rsync scripts to remote ───────────────────────────────────────────────────
# Create the three remote working directories, then upload the local scripts
# directory.
log "Rsyncing RuView scripts → $REMOTE:$REMOTE_SCRIPTS ..."
# shellcheck disable=SC2086
ssh $SSH_OPTS "$REMOTE" "mkdir -p $REMOTE_SCRIPTS $REMOTE_CONTROL $REMOTE_RESULTS"

# NOTE(review): there is no catch-all exclude after the --include rules, so
# they presumably do not restrict the transfer to those three files; only
# gcp/ and *.sh are filtered out. Confirm whether that is the intent.
scripts_rsync_args=(
  -avz
  -e "ssh $SSH_OPTS"
  --include="occworld_retrain.py"
  --include="occworld_server.py"
  --include="ruview_occ_dataset.py"
  --exclude="gcp/"
  --exclude="*.sh"
)
rsync "${scripts_rsync_args[@]}" \
  "$LOCAL_SCRIPTS_DIR/" \
  "${REMOTE}:${REMOTE_SCRIPTS}/"

# ── Rsync local snapshots as control input (if they exist) ────────────────────
# Without a local snapshot directory nothing is uploaded, and the control
# tensor stage generates synthetic input on the instance instead.
if [[ ! -d "$SNAPSHOT_DIR" ]]; then
  log "No local snapshot dir found at $SNAPSHOT_DIR — will use synthetic control tensors on remote"
else
  # The count is informational only; the whole directory is uploaded.
  SNAP_COUNT=$(find "$SNAPSHOT_DIR" -name "*.json" 2>/dev/null | wc -l)
  log "Rsyncing $SNAP_COUNT snapshots from $SNAPSHOT_DIR → remote control-tensors ..."
  rsync -avz -e "ssh $SSH_OPTS" \
    "$SNAPSHOT_DIR/" "${REMOTE}:${REMOTE_CONTROL}/snapshots/"
fi

# ── Stage 1: Start OccWorld sensing server on remote ─────────────────────────
# Skipped entirely when --no-server was given. Otherwise any previous server
# is stopped, a new one is launched in the background on port 8080 with its
# output appended to ~/occworld-server.log, and a single health probe is made
# after three seconds. A failed probe only prints a warning.
if [[ "$START_SERVER" == "true" ]]; then
  log "=== Stage 1: Starting OccWorld sensing server on remote ==="

  # Kill any previous server. `pkill -f` matches full command lines, and the
  # remote shell executing this very command has the pattern text in its own
  # command line; the [o] bracket expression still matches the server process
  # but not that literal text, so the ssh session is not killed along with it.
  # shellcheck disable=SC2086
  ssh $SSH_OPTS "$REMOTE" "pkill -f '[o]ccworld_server.py' || true"

  # Quoted delimiter: nothing below is expanded locally.
  # shellcheck disable=SC2086
  ssh $SSH_OPTS "$REMOTE" bash << 'REMOTE_SERVER'
set -euo pipefail
source /opt/conda/etc/profile.d/conda.sh
conda activate occworld 2>/dev/null || conda activate cosmos

# PYTHONPATH may be unset on the instance; a bare $PYTHONPATH would abort this
# script under `set -u`. Prepend the old value only when there is one.
export PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}$HOME/ruview-scripts"

echo "[server] Starting OccWorld server in background ..."
nohup python3 ~/ruview-scripts/occworld_server.py \
  --port 8080 \
  --snapshot-dir ~/control-tensors/snapshots \
  >> ~/occworld-server.log 2>&1 &

echo "[server] PID=$!"
sleep 3

# Verify it started
if curl -sf http://localhost:8080/health >/dev/null 2>&1; then
  echo "[server] OccWorld server is up on port 8080"
else
  echo "[server] WARNING: health check failed — server may still be starting"
  tail -20 ~/occworld-server.log || true
fi
REMOTE_SERVER

  log "OccWorld server started on remote"
fi

# ── Stage 2: Generate control tensors (depth + seg) ──────────────────────────
# Runs on the instance. When uploaded snapshots exist they are handed to
# ruview_occ_dataset.py; otherwise 16 random depth/seg frames are written so
# that the later stages still have input. Wall time is measured locally.
log "=== Stage 2: Generating RuView depth+seg control tensors ==="
CONTROL_START=$(date +%s)

# Quoted delimiter: nothing below is expanded locally.
# shellcheck disable=SC2086
ssh $SSH_OPTS "$REMOTE" bash << 'REMOTE_CONTROL_GEN'
set -euo pipefail
source /opt/conda/etc/profile.d/conda.sh
conda activate occworld 2>/dev/null || conda activate cosmos

# PYTHONPATH may be unset on the instance; a bare $PYTHONPATH would abort this
# script under `set -u`. Prepend the old value only when there is one.
export PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}$HOME/ruview-scripts"
mkdir -p ~/control-tensors/depth ~/control-tensors/seg

echo "[control] $(date): generating control tensors from snapshots ..."

# Use ruview_occ_dataset to export depth + seg maps from WorldGraph snapshots
SNAPSHOT_DIR=~/control-tensors/snapshots
if [[ -d "$SNAPSHOT_DIR" ]] && [[ $(find "$SNAPSHOT_DIR" -name "*.json" | wc -l) -gt 0 ]]; then
  # Any non-zero exit of the exporter is downgraded to a warning.
  python3 ~/ruview-scripts/ruview_occ_dataset.py \
    --snapshots "$SNAPSHOT_DIR" \
    --export-depth ~/control-tensors/depth \
    --export-seg ~/control-tensors/seg \
    --check \
    || echo "[control] WARNING: export flag not supported — using raw snapshots directly"
else
  echo "[control] No snapshots found — generating synthetic control tensors for benchmark"
  python3 - << 'SYNTH_EOF'
import numpy as np, os
from pathlib import Path

depth_dir = Path(os.path.expanduser("~/control-tensors/depth"))
seg_dir = Path(os.path.expanduser("~/control-tensors/seg"))
depth_dir.mkdir(parents=True, exist_ok=True)
seg_dir.mkdir(parents=True, exist_ok=True)

# Fixed seed so repeated runs write identical frames.
rng = np.random.default_rng(42)
for i in range(16):
    depth = rng.uniform(0.5, 5.0, (256, 256)).astype(np.float32)
    seg = rng.integers(0, 18, (256, 256), dtype=np.uint8)
    np.save(str(depth_dir / f"frame_{i:04d}_depth.npy"), depth)
    np.save(str(seg_dir / f"frame_{i:04d}_seg.npy"), seg)

print(f"[control] Generated 16 synthetic depth/seg frames")
SYNTH_EOF
fi

echo "[control] $(date): control tensor generation complete"
# Informational listings only. `head` may close the pipe early on a large
# directory, which under pipefail would otherwise fail the whole stage.
ls -lh ~/control-tensors/depth/ | head -5 || true
ls -lh ~/control-tensors/seg/ | head -5 || true
REMOTE_CONTROL_GEN

CONTROL_END=$(date +%s)
log "Control tensor generation: $(( CONTROL_END - CONTROL_START )) sec"

# ── Stage 3: Cosmos-Transfer2.5 inference ────────────────────────────────────
# Runs on the instance. The first entry point found under /opt/cosmos-transfer
# is used (inference.py, then generate.py); if neither exists a small torch
# benchmark runs instead and writes benchmark.json. INFER_SEC (local wall
# time, including ssh overhead) is read again by the benchmark report.
log "=== Stage 3: Cosmos-Transfer2.5-2B inference on A100 80GB ==="
INFER_START=$(date +%s)

# Quoted delimiter: nothing below is expanded locally.
# shellcheck disable=SC2086
ssh $SSH_OPTS "$REMOTE" bash << 'REMOTE_INFER'
set -euo pipefail
source /opt/conda/etc/profile.d/conda.sh
conda activate cosmos

COSMOS_MODEL="/opt/models/cosmos-transfer2.5-2b"
# Not referenced by any command in this stage.
REASON_MODEL="/opt/models/cosmos-reason2-8b"
OUTPUT_DIR=~/cosmos-results
DEPTH_DIR=~/control-tensors/depth
SEG_DIR=~/control-tensors/seg
COSMOS_DIR=/opt/cosmos-transfer

mkdir -p "$OUTPUT_DIR"

echo "[infer] $(date): starting Cosmos-Transfer2.5-2B inference"
echo "[infer] VRAM before:"
nvidia-smi --query-gpu=memory.used,memory.free --format=csv,noheader

INFER_START_S=$(date +%s)

# Attempt to run via the cosmos-transfer inference script.
# Falls back to a minimal torch-based runner if the repo layout differs.
if [[ -f "$COSMOS_DIR/inference.py" ]]; then
  depth_run_args=(
    --model-dir "$COSMOS_MODEL"
    --control-type depth
    --control-input "$DEPTH_DIR"
    --output-dir "$OUTPUT_DIR/depth_controlled"
    --num-frames 16
    --guidance-scale 7.5
  )
  python3 "$COSMOS_DIR/inference.py" "${depth_run_args[@]}" 2>&1 \
    | tee "$OUTPUT_DIR/inference_depth.log"
elif [[ -f "$COSMOS_DIR/generate.py" ]]; then
  generate_args=(
    --checkpoint "$COSMOS_MODEL"
    --control-depth "$DEPTH_DIR"
    --control-seg "$SEG_DIR"
    --output "$OUTPUT_DIR/ruview_generated.mp4"
    --frames 16
  )
  python3 "$COSMOS_DIR/generate.py" "${generate_args[@]}" 2>&1 \
    | tee "$OUTPUT_DIR/inference.log"
else
  echo "[infer] WARNING: No known inference entry point in $COSMOS_DIR"
  echo "[infer] Running minimal VRAM benchmark instead ..."
  python3 - << 'BENCH_EOF'
import json
import os
import time
from glob import glob

import torch

MODEL_DIR = "/opt/models/cosmos-transfer2.5-2b"
RESULTS_DIR = os.path.expanduser("~/cosmos-results")

print(f"[bench] CUDA available: {torch.cuda.is_available()}")
print(f"[bench] GPU: {torch.cuda.get_device_name(0)}")
total_vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
print(f"[bench] VRAM total: {total_vram_gb:.1f} GB")

# Sum the on-disk size of the weight files; the model itself is not loaded.
weight_files = []
for pattern in ("*.safetensors", "*.bin"):
    weight_files.extend(glob(f"{MODEL_DIR}/**/{pattern}", recursive=True))
total_bytes = sum(os.path.getsize(path) for path in weight_files if os.path.exists(path))
print(f"[bench] Model disk size: {total_bytes/1e9:.2f} GB ({len(weight_files)} files)")

# Synthetic inference benchmark (batch of noise → simulate denoising steps)
gpu = torch.device("cuda:0")
torch.cuda.empty_cache()
batch, channels, height, width = 1, 4, 64, 64
latents = torch.randn(batch, channels, height, width, device=gpu, dtype=torch.float16)

t0 = time.perf_counter()
for _step in range(20):
    _ = torch.nn.functional.interpolate(latents, scale_factor=2)
    torch.cuda.synchronize()
elapsed_ms = (time.perf_counter() - t0) * 1000

print(f"[bench] 20-step synthetic denoising: {elapsed_ms:.1f} ms")
print(f"[bench] VRAM used after benchmark: {torch.cuda.memory_allocated()/1e9:.2f} GB")

result = {
    "vram_total_gb": total_vram_gb,
    "model_disk_gb": total_bytes / 1e9,
    "synth_20step_ms": elapsed_ms,
}
with open(f"{RESULTS_DIR}/benchmark.json", "w") as fh:
    json.dump(result, fh, indent=2)
print("[bench] Results written to ~/cosmos-results/benchmark.json")
BENCH_EOF
fi

INFER_END_S=$(date +%s)
INFER_SEC=$(( INFER_END_S - INFER_START_S ))

echo "[infer] $(date): inference complete in ${INFER_SEC}s"
echo "[infer] VRAM after:"
nvidia-smi --query-gpu=memory.used,memory.free --format=csv,noheader
echo "[infer] Results:"
ls -lh "$OUTPUT_DIR/" 2>/dev/null || true
REMOTE_INFER

INFER_END=$(date +%s)
INFER_SEC=$(( INFER_END - INFER_START ))
infer_minutes=$(awk -v secs="$INFER_SEC" 'BEGIN {printf "%.1f", secs / 60}')
log "Inference wall time: ${INFER_SEC}s (${infer_minutes} min)"

# ── Stage 4: Download results ─────────────────────────────────────────────────
# Mirror the remote results directory into the local output directory, then
# report how many files and how much data are now present locally.
log "=== Stage 4: Downloading results → $OUTPUT_DIR ==="
mkdir -p "$OUTPUT_DIR"

results_rsync_args=(-avz --progress -e "ssh $SSH_OPTS")
rsync "${results_rsync_args[@]}" "${REMOTE}:${REMOTE_RESULTS}/" "$OUTPUT_DIR/"

LOCAL_COUNT=$(find "$OUTPUT_DIR" -type f | wc -l)
# du prints "<size><TAB><path>"; keep only the size column.
LOCAL_SIZE=$(du -sh "$OUTPUT_DIR" 2>/dev/null | cut -f1)
log "Downloaded $LOCAL_COUNT files (${LOCAL_SIZE}) to $OUTPUT_DIR"

# ── Stage 5: Benchmark report ─────────────────────────────────────────────────
#######################################
# Multiply a duration by a factor and round to whole seconds.
# Values are handed to awk with -v instead of being pasted into the awk
# program text, so an empty or odd value cannot turn into an awk syntax error.
# Arguments:
#   $1 - duration in seconds (default 0)
#   $2 - multiplier (default 1)
# Outputs:
#   the rounded product on stdout, without a trailing newline
#######################################
scale_seconds() {
  awk -v sec="${1:-0}" -v factor="${2:-1}" 'BEGIN {printf "%.0f", sec * factor}'
}

log "=== Benchmark: A100 80GB vs RTX 5080 estimate ==="
# RTX 5080 has 16 GB GDDR7, ~100 TFLOPS FP16.
# A100 80GB has 80 GB HBM2e, ~312 TFLOPS FP16.
# Estimated speedup: 3.1× for Cosmos inference.
RTX5080_SLOWDOWN=3.1
RTX5080_ESTIMATE_SEC=$(scale_seconds "${INFER_SEC:-0}" "$RTX5080_SLOWDOWN")
log " A100 80GB inference : ${INFER_SEC}s"
log " RTX 5080 estimate : ~${RTX5080_ESTIMATE_SEC}s (${RTX5080_SLOWDOWN}× slower, 16GB headroom risk)"
# The VRAM figure below is a fixed string, not a value measured by this run.
log " Cosmos VRAM required : 32.54 GB — exceeds RTX 5080 capacity (16 GB)"
log " Verdict : A100 80GB required for full-precision inference"
log ""
log "Results in: $OUTPUT_DIR"
# The suggested instance name is built from today's date.
log "Teardown : bash scripts/gcp/teardown.sh cosmos-eval-$(date +%Y%m%d)"