wifi-densepose/scripts/pretrain-mae-gcloud.sh

#!/bin/bash
# ==============================================================================
# GCloud GPU driver for the MERIDIAN CSI masked-autoencoder pre-train (ADR-027 §2.0)
# ==============================================================================
#
# Creates a GCloud VM with a GPU, builds wifi-densepose-train with the
# `tch-backend` (+ `cuda`) feature, runs the `pretrain-mae` binary, downloads
# the pre-trained variable store (`.ot`), and tears the VM down.
#
# STATUS: prototype wiring stub (ADR-027 §2.0, iteration 3). The `pretrain-mae`
# binary currently drives the *deterministic SyntheticCsiDataset* — that's the
# end-to-end smoke path. The real heterogeneous-CSI pre-train (MM-Fi + Wi-Pose +
# data/recordings/ + multi-band virtual sub-carriers) needs the ingest pipeline
# tracked in ADR-027 §2.0 "Iteration 3 plan"; the TODO markers below show where
# it plugs in. This script is intentionally a thin, reviewable shell of the real
# gcloud-train.sh (which it mirrors) — it has NOT been run.
#
# Usage:
#   bash scripts/pretrain-mae-gcloud.sh [OPTIONS]
#
# Options:
#   --gpu        l4|a100|h100   GPU type (default: l4)
#   --zone       ZONE           GCloud zone (default: us-central1-a)
#   --hours      N              Max VM lifetime in hours (default: 3)
#   --epochs     N              Pre-train epochs (default: 20)
#   --samples    N              Synthetic samples (until the real ingest lands) (default: 4096)
#   --batch      N              Mini-batch size (default: 64)
#   --mask-ratio R              Token mask ratio (default: 0.75)
#   --lr         R              Adam learning rate (default: 1e-3)
#   --out        FILE           Local path for the downloaded .ot (default: data/models/mae-pretrained.ot)
#   --data-dir   DIR            (future) heterogeneous CSI corpus to upload — see TODO below
#   --dry-run                   Build + run a tiny pre-train locally with synthetic data; no VM
#   --keep-vm                   Do not delete the VM after the run
#   --instance   NAME           Custom VM instance name
#
# Prerequisites (same as gcloud-train.sh):
#   - gcloud CLI authenticated:  gcloud auth login
#   - Project set:               gcloud config set project cognitum-20260110
#   - GPU quota in the chosen zone
#
# Cost (same envelope as gcloud-train.sh):
#   L4 ~$0.80/hr (prototyping) · A100 40GB ~$3.60/hr (full pre-train) · H100 80GB ~$11/hr
# ==============================================================================

# Strict mode: stop on any failing command, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# ── Defaults ──────────────────────────────────────────────────────────────────
# Where the VM lives and how it is named. The name carries the current epoch
# second so repeated runs do not collide.
PROJECT='cognitum-20260110'
ZONE='us-central1-a'
GPU_TYPE='l4'
INSTANCE="meridian-mae-$(date '+%s')"
HOURS=3

# Pre-train hyper-parameters forwarded to the pretrain-mae binary.
EPOCHS=20
SAMPLES=4096
BATCH=64
MASK_RATIO=0.75
LR='1e-3'

# Local artefact path and the (optional, still unused by the binary) corpus dir.
OUT='data/models/mae-pretrained.ot'
DATA_DIR=''

# Boolean switches, stored as 0/1.
DRY_RUN=0
KEEP_VM=0

# ── Arg parse ─────────────────────────────────────────────────────────────────
# usage: print this script's leading comment block (everything from the first
# non-shebang `#` line up to the first non-comment line) to stdout. Scanning for
# the block instead of using fixed line numbers keeps --help correct when the
# header grows or shrinks.
usage() {
  awk '/^#!/ { next } /^#/ { seen = 1; print; next } seen { exit }' "$0"
}

# require_value OPT [VALUE ...]: called with the remaining argv; exits 2 with a
# readable message when OPT is the last word, instead of letting `set -u` abort
# on an unbound $2.
require_value() {
  if [[ $# -lt 2 ]]; then
    echo "option $1 requires a value" >&2
    exit 2
  fi
}

# parse_args ARGV...: overwrite the default globals from command-line flags.
# Exits 0 after printing help, 2 on an unknown flag or a flag missing its value.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --gpu)        require_value "$@"; GPU_TYPE="$2"; shift 2;;
      --zone)       require_value "$@"; ZONE="$2"; shift 2;;
      --hours)      require_value "$@"; HOURS="$2"; shift 2;;
      --epochs)     require_value "$@"; EPOCHS="$2"; shift 2;;
      --samples)    require_value "$@"; SAMPLES="$2"; shift 2;;
      --batch)      require_value "$@"; BATCH="$2"; shift 2;;
      --mask-ratio) require_value "$@"; MASK_RATIO="$2"; shift 2;;
      --lr)         require_value "$@"; LR="$2"; shift 2;;
      --out)        require_value "$@"; OUT="$2"; shift 2;;
      --data-dir)   require_value "$@"; DATA_DIR="$2"; shift 2;;
      --dry-run)    DRY_RUN=1; shift;;
      --keep-vm)    KEEP_VM=1; shift;;
      --instance)   require_value "$@"; INSTANCE="$2"; shift 2;;
      -h|--help)    usage; exit 0;;
      *) echo "unknown option: $1" >&2; exit 2;;
    esac
  done
}

parse_args "$@"

# Translate the --gpu shorthand into the accelerator spec and machine type that
# are handed to `gcloud compute instances create`. Anything else is a usage
# error (exit 2).
if [[ "$GPU_TYPE" == "l4" ]]; then
  ACCEL="type=nvidia-l4,count=1"
  MACHINE="g2-standard-8"
elif [[ "$GPU_TYPE" == "a100" ]]; then
  ACCEL="type=nvidia-tesla-a100,count=1"
  MACHINE="a2-highgpu-1g"
elif [[ "$GPU_TYPE" == "h100" ]]; then
  ACCEL="type=nvidia-h100-80gb,count=1"
  MACHINE="a3-highgpu-1g"
else
  echo "unknown --gpu: $GPU_TYPE (l4|a100|h100)" >&2
  exit 2
fi

# Flag string forwarded verbatim to the pretrain-mae binary on the VM; built one
# flag at a time so each hyper-parameter sits on its own line.
PRETRAIN_ARGS="--epochs $EPOCHS"
PRETRAIN_ARGS+=" --samples $SAMPLES"
PRETRAIN_ARGS+=" --batch $BATCH"
PRETRAIN_ARGS+=" --mask-ratio $MASK_RATIO"
PRETRAIN_ARGS+=" --lr $LR"
PRETRAIN_ARGS+=" --save mae-pretrained.ot"

# ── Dry run: build + tiny pre-train locally (synthetic data), no VM ───────────
# The command lives in one array so the line that is announced and the line that
# is executed cannot drift apart. It runs from the v2 workspace next to this
# script's directory, then the script stops before any gcloud call.
if [[ "$DRY_RUN" -eq 1 ]]; then
  dry_run_cmd=(cargo run -p wifi-densepose-train --features tch-backend --bin pretrain-mae -- --epochs 2 --samples 64 --batch 8)
  echo "[dry-run] ${dry_run_cmd[*]}"
  echo "[dry-run] (requires LibTorch — set LIBTORCH or use a tch download-libtorch feature build)"
  cd "$(dirname "$0")/../v2"
  "${dry_run_cmd[@]}"
  exit 0
fi

# ── Provision VM ──────────────────────────────────────────────────────────────
# cleanup: EXIT-trap handler. Deletes the VM unless --keep-vm was given. The
# delete is best-effort (`|| true`) so that a VM which was never created, or is
# already gone, does not mask the script's real exit status with a second error.
cleanup() {
  if [[ "$KEEP_VM" -eq 0 ]]; then
    echo "==> Deleting VM $INSTANCE"
    gcloud compute instances delete "$INSTANCE" --zone="$ZONE" --quiet || true
  else
    echo "==> --keep-vm set; VM $INSTANCE left running (remember to delete it)."
  fi
}

echo "==> Project: $PROJECT  Zone: $ZONE  GPU: $GPU_TYPE  Machine: $MACHINE  Instance: $INSTANCE"
gcloud config set project "$PROJECT" >/dev/null

# Arm the trap BEFORE the create call: if create is interrupted (Ctrl-C) or
# fails after the instance resource already exists, the GPU VM is still torn
# down instead of being left running and billed.
trap cleanup EXIT

# --max-run-duration / --instance-termination-action=DELETE are a second safety
# net on the GCE side, bounding the VM lifetime to HOURS even if this script
# dies without running its trap.
gcloud compute instances create "$INSTANCE" \
  --zone="$ZONE" --machine-type="$MACHINE" \
  --accelerator="$ACCEL" --maintenance-policy=TERMINATE \
  --image-family=pytorch-latest-gpu --image-project=deeplearning-platform-release \
  --boot-disk-size=128GB --metadata="install-nvidia-driver=True" \
  --max-run-duration="${HOURS}h" --instance-termination-action=DELETE

# run_remote CMD: run the shell snippet CMD on the VM named by $INSTANCE in
# $ZONE through `gcloud compute ssh`; the exit status is gcloud's.
run_remote() {
  local remote_cmd="$1"
  gcloud compute ssh "$INSTANCE" --zone="$ZONE" --command="${remote_cmd}"
}

echo "==> Waiting for SSH..."
# Poll up to 30 times, 10 s apart. stderr is silenced because connection
# failures are expected while the VM boots. Readiness is tracked explicitly so
# that running out of attempts aborts here with a clear message (the EXIT trap
# then tears the VM down) rather than surfacing later as an unrelated SSH error.
ssh_ready=0
for ((attempt = 1; attempt <= 30; attempt++)); do
  if run_remote "true" 2>/dev/null; then
    ssh_ready=1
    break
  fi
  sleep 10
done
if [[ "$ssh_ready" -ne 1 ]]; then
  echo "error: VM $INSTANCE did not accept SSH after 30 attempts" >&2
  exit 1
fi

echo "==> Provisioning toolchain on the VM"
# The remote script is kept in a quoted heredoc so nothing in it is expanded
# locally: it installs rustup, records LIBTORCH / LD_LIBRARY_PATH (derived from
# the VM's Python torch package) in ~/.bashrc, and installs the build packages.
remote_setup=$(cat <<'REMOTE_SETUP'
set -e
  curl --proto "=https" --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
  source "$HOME/.cargo/env"
  # The pytorch-latest-gpu image ships libtorch; point tch at it.
  TORCH_DIR="$(python -c "import torch,os;print(os.path.dirname(torch.__file__))")"
  echo "export LIBTORCH=$TORCH_DIR" >> "$HOME/.bashrc"
  echo "export LD_LIBRARY_PATH=$TORCH_DIR/lib:\$LD_LIBRARY_PATH" >> "$HOME/.bashrc"
  sudo apt-get update -qq && sudo apt-get install -y -qq git build-essential pkg-config
REMOTE_SETUP
)
run_remote "$remote_setup"

echo "==> Uploading repo"
# Resolve the repo root from this script's own location (same anchoring as the
# dry-run path) so the upload works from any working directory, including the
# documented `bash scripts/pretrain-mae-gcloud.sh` invocation from the repo root.
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
# Stream a tarball over SSH rather than `scp --recurse`: this creates ~/ruview
# on the VM (it does not exist on a fresh image) and leaves out the cargo build
# directory, which can be very large and is rebuilt on the VM anyway.
# `pipefail` makes a local tar failure abort the script.
tar -C "$REPO_ROOT" --exclude='v2/target' -czf - v2 scripts docs \
  | run_remote 'mkdir -p ~/ruview && tar -xzf - -C ~/ruview'

# TODO (ADR-027 §2.0, iter 3 ingest): when --data-dir is given, upload the
# heterogeneous CSI corpus and point pretrain-mae at it instead of the synthetic
# dataset (needs a `--data-dir`/`--datasets` flag on the bin first — see the plan).
# Until that flag exists the corpus is only staged on the VM. PRETRAIN_ARGS is
# deliberately left untouched: splicing a `# ...` comment into it would comment
# out anything later appended to the remote command line.
if [[ -n "$DATA_DIR" ]]; then
  if [[ ! -d "$DATA_DIR" ]]; then
    echo "error: --data-dir '$DATA_DIR' is not a directory" >&2
    exit 1
  fi
  echo "==> Uploading CSI corpus from $DATA_DIR"
  gcloud compute scp --recurse --zone="$ZONE" "$DATA_DIR" "$INSTANCE":~/ruview/csi-corpus/ >/dev/null
  echo "warning: corpus staged at ~/ruview/csi-corpus, but pretrain-mae has no --data-dir flag yet; this run still trains on the synthetic dataset" >&2
fi

echo "==> Building + running pre-train on the VM"
# LIBTORCH / LD_LIBRARY_PATH are exported explicitly in this SSH session instead
# of relying on `source ~/.bashrc` alone: the stock Debian/Ubuntu ~/.bashrc
# returns early in non-interactive shells, so exports appended to its end are
# never reached and the tch build would not find libtorch. The .bashrc source is
# kept for whatever else it may provide. Everything prefixed with `\` is
# evaluated on the VM; only $PRETRAIN_ARGS is expanded locally.
run_remote "set -e; source \$HOME/.cargo/env; source \$HOME/.bashrc
  TORCH_DIR=\"\$(python -c 'import torch,os;print(os.path.dirname(torch.__file__))')\"
  export LIBTORCH=\"\$TORCH_DIR\"
  export LD_LIBRARY_PATH=\"\$TORCH_DIR/lib:\${LD_LIBRARY_PATH:-}\"
  cd ~/ruview/v2
  cargo build --release -p wifi-densepose-train --features tch-backend,cuda
  cargo run --release -p wifi-densepose-train --features tch-backend,cuda --bin pretrain-mae -- $PRETRAIN_ARGS"

# Fetch the variable store written by `--save mae-pretrained.ot` (relative to
# the remote v2 working directory) into $OUT, creating the local parent
# directory first. The `~` stays quoted so it is resolved on the VM, not here.
printf '%s\n' "==> Downloading pre-trained variable store → $OUT"
out_dir="$(dirname "$OUT")"
mkdir -p "$out_dir"
remote_artifact="${INSTANCE}:~/ruview/v2/mae-pretrained.ot"
gcloud compute scp --zone="$ZONE" "$remote_artifact" "$OUT"

printf '%s\n' "==> Done. Pre-trained encoder: $OUT"
printf '%s\n' "    Next: fine-tune the ADR-027 §2.x heads on top of it (see §2.0 'Iteration 3 plan')."