feat: GCloud GPU training pipeline + data collection + benchmarking

- gcloud-train.sh: L4/A100/H100 VM provisioning, Rust build, training with --cuda, artifact download, auto-cleanup ($0.80-$8.50/hr) - training-config-sweep.json: 10 hyperparameter configs (LR, batch, backbone, windows, loss weights, warmup) - collect-training-data.py: UDP listener for 2-node ESP32 CSI recording to .csi.jsonl with interactive/batch labeling and manifest generation - benchmark-model.py: ONNX latency/throughput/PCK/FLOPs profiling with multi-model sweep comparison Co-Authored-By: claude-flow <ruv@ruv.net>
2026-04-02 22:04:57 -04:00 · 2026-04-02 22:04:57 -04:00 · c63cf2ee77
parent 9a2bc1839a
commit c63cf2ee77
4 changed files with 1657 additions and 0 deletions
--- a/scripts/benchmark-model.py
+++ b/scripts/benchmark-model.py
@ -0,0 +1,550 @@
 #!/usr/bin/env python3
 """
 WiFi-DensePose Model Benchmarking
 Loads trained ONNX models, runs inference on test data, and reports
 performance metrics: latency, throughput, PCK@0.2, model size, and
 estimated FLOPs.
 Can compare multiple models from a hyperparameter sweep.
 Usage:
    # Benchmark a single model
    python scripts/benchmark-model.py --model checkpoints/best.onnx
    # Benchmark with recorded test data
    python scripts/benchmark-model.py --model best.onnx --test-data data/recordings/test.csi.jsonl
    # Compare models from a sweep
    python scripts/benchmark-model.py --sweep-dir training-results/wdp-train-a100-*/checkpoints/
    # Benchmark with synthetic data (no recordings needed)
    python scripts/benchmark-model.py --model best.onnx --synthetic --num-samples 200
    # Export results as JSON
    python scripts/benchmark-model.py --model best.onnx --output results.json
 Prerequisites:
    pip install onnxruntime numpy
    Optional: pip install onnx  (for FLOPs estimation)
 """
 from __future__ import annotations
 import argparse
 import json
 import os
 import sys
 import time
 from dataclasses import dataclass, field, asdict
 from pathlib import Path
 from typing import Optional
 import numpy as np
 try:
    import onnxruntime as ort
 except ImportError:
    print("ERROR: onnxruntime not installed. Run: pip install onnxruntime")
    sys.exit(1)
 # ── Configuration ────────────────────────────────────────────────────────────
 # Default model input shape (must match TrainingConfig defaults)
 NUM_SUBCARRIERS = 56
 NUM_ANTENNAS_TX = 3
 NUM_ANTENNAS_RX = 3
 WINDOW_FRAMES = 100
 NUM_KEYPOINTS = 17
 HEATMAP_SIZE = 56
 # PCK threshold
 PCK_THRESHOLD = 0.2
 # ── Data classes ─────────────────────────────────────────────────────────────
@dataclass
 class BenchmarkResult:
    model_path: str
    model_size_mb: float
    num_parameters: Optional[int] = None
    estimated_flops: Optional[int] = None
    # Latency
    warmup_runs: int = 10
    benchmark_runs: int = 100
    latency_mean_ms: float = 0.0
    latency_std_ms: float = 0.0
    latency_p50_ms: float = 0.0
    latency_p95_ms: float = 0.0
    latency_p99_ms: float = 0.0
    throughput_fps: float = 0.0
    # Accuracy (if ground truth available)
    pck_at_02: Optional[float] = None
    mean_per_joint_error: Optional[float] = None
    num_test_samples: int = 0
    # Input shape
    input_shape: list = field(default_factory=list)
    provider: str = ""
 # ── ONNX model loading ──────────────────────────────────────────────────────
 def load_model(model_path: str) -> ort.InferenceSession:
    """Load an ONNX model with the best available execution provider."""
    providers = []
    if "CUDAExecutionProvider" in ort.get_available_providers():
        providers.append("CUDAExecutionProvider")
    providers.append("CPUExecutionProvider")
    sess_opts = ort.SessionOptions()
    sess_opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    sess_opts.intra_op_num_threads = os.cpu_count() or 4
    session = ort.InferenceSession(model_path, sess_opts, providers=providers)
    return session
 def get_model_info(model_path: str) -> dict:
    """Extract model metadata: size, parameter count, FLOPs estimate."""
    path = Path(model_path)
    size_mb = path.stat().st_size / (1024 * 1024)
    info = {
        "size_mb": round(size_mb, 2),
        "num_parameters": None,
        "estimated_flops": None,
    }
    # Try to count parameters via onnx
    try:
        import onnx
        model = onnx.load(model_path)
        total_params = 0
        for initializer in model.graph.initializer:
            shape = list(initializer.dims)
            if shape:
                total_params += int(np.prod(shape))
        info["num_parameters"] = total_params
        # Rough FLOPs estimate: ~2 * params (multiply-accumulate)
        info["estimated_flops"] = total_params * 2
    except ImportError:
        pass
    except Exception as e:
        print(f"  Warning: Could not extract parameter count: {e}")
    return info
 # ── Synthetic data generation ────────────────────────────────────────────────
 def generate_synthetic_input(
    batch_size: int = 1,
    num_subcarriers: int = NUM_SUBCARRIERS,
    num_tx: int = NUM_ANTENNAS_TX,
    num_rx: int = NUM_ANTENNAS_RX,
    window_frames: int = WINDOW_FRAMES,
 ) -> np.ndarray:
    """Generate synthetic CSI input tensor matching the model's expected shape.
    The WiFi-DensePose model expects input shape:
      [batch, channels, height, width]
    where channels = num_tx * num_rx, height = window_frames, width = num_subcarriers.
    """
    channels = num_tx * num_rx  # 3x3 = 9 MIMO streams
    # Simulate CSI amplitude data with realistic distribution
    rng = np.random.default_rng(42)
    data = rng.normal(loc=0.0, scale=1.0, size=(batch_size, channels, window_frames, num_subcarriers))
    return data.astype(np.float32)
 def generate_synthetic_keypoints(
    num_samples: int,
    num_keypoints: int = NUM_KEYPOINTS,
    heatmap_size: int = HEATMAP_SIZE,
 ) -> np.ndarray:
    """Generate synthetic ground truth keypoint coordinates for PCK evaluation."""
    rng = np.random.default_rng(123)
    # Keypoints as (x, y) in [0, heatmap_size) range
    return rng.uniform(0, heatmap_size, size=(num_samples, num_keypoints, 2)).astype(np.float32)
 # ── Load test data from .csi.jsonl ──────────────────────────────────────────
 def load_test_data(
    jsonl_path: str,
    window_frames: int = WINDOW_FRAMES,
    num_subcarriers: int = NUM_SUBCARRIERS,
    max_samples: int = 500,
 ) -> np.ndarray:
    """Load CSI frames from a .csi.jsonl file and window them into model inputs."""
    frames = []
    path = Path(jsonl_path)
    with open(path, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
                subs = record.get("subcarriers", [])
                if len(subs) > 0:
                    frames.append(subs)
            except json.JSONDecodeError:
                continue
    if len(frames) < window_frames:
        print(f"  Warning: Only {len(frames)} frames, need {window_frames}. Padding with zeros.")
        while len(frames) < window_frames:
            frames.append([0.0] * num_subcarriers)
    # Normalize subcarrier count
    normalized = []
    for frame in frames:
        if len(frame) < num_subcarriers:
            frame = frame + [0.0] * (num_subcarriers - len(frame))
        elif len(frame) > num_subcarriers:
            # Downsample via linear interpolation
            indices = np.linspace(0, len(frame) - 1, num_subcarriers)
            frame = np.interp(indices, range(len(frame)), frame).tolist()
        normalized.append(frame)
    frames = normalized
    # Create sliding windows
    samples = []
    stride = max(1, window_frames // 2)
    for i in range(0, len(frames) - window_frames + 1, stride):
        window = frames[i : i + window_frames]
        # Shape: [channels=1, window_frames, num_subcarriers]
        # Expand single stream to 9 channels (repeat for MIMO)
        arr = np.array(window, dtype=np.float32)
        arr = np.expand_dims(arr, axis=0)  # [1, window_frames, num_subcarriers]
        arr = np.repeat(arr, NUM_ANTENNAS_TX * NUM_ANTENNAS_RX, axis=0)  # [9, window, subs]
        samples.append(arr)
        if len(samples) >= max_samples:
            break
    if not samples:
        return generate_synthetic_input(1)
    return np.stack(samples, axis=0)  # [N, 9, window_frames, num_subcarriers]
 # ── Benchmarking ─────────────────────────────────────────────────────────────
 def benchmark_latency(
    session: ort.InferenceSession,
    input_data: np.ndarray,
    warmup: int = 10,
    runs: int = 100,
 ) -> dict:
    """Measure inference latency over multiple runs."""
    input_name = session.get_inputs()[0].name
    # Warmup
    for _ in range(warmup):
        session.run(None, {input_name: input_data[:1]})
    # Timed runs
    latencies = []
    for _ in range(runs):
        start = time.perf_counter()
        session.run(None, {input_name: input_data[:1]})
        end = time.perf_counter()
        latencies.append((end - start) * 1000)  # ms
    latencies = np.array(latencies)
    return {
        "mean_ms": float(np.mean(latencies)),
        "std_ms": float(np.std(latencies)),
        "p50_ms": float(np.percentile(latencies, 50)),
        "p95_ms": float(np.percentile(latencies, 95)),
        "p99_ms": float(np.percentile(latencies, 99)),
        "throughput_fps": 1000.0 / float(np.mean(latencies)),
    }
 def compute_pck(
    predictions: np.ndarray,
    ground_truth: np.ndarray,
    threshold: float = PCK_THRESHOLD,
    normalize_by: float = HEATMAP_SIZE,
 ) -> float:
    """Compute Percentage of Correct Keypoints at a given threshold.
    PCK@t = fraction of predicted keypoints within t * normalize_by of ground truth.
    """
    if predictions.shape != ground_truth.shape:
        return 0.0
    # Euclidean distance per keypoint
    distances = np.linalg.norm(predictions - ground_truth, axis=-1)  # [N, K]
    threshold_pixels = threshold * normalize_by
    correct = (distances < threshold_pixels).astype(float)
    return float(np.mean(correct))
 def extract_keypoints_from_heatmaps(heatmaps: np.ndarray) -> np.ndarray:
    """Convert heatmap outputs [N, K, H, W] to keypoint coordinates [N, K, 2]."""
    n, k, h, w = heatmaps.shape
    flat = heatmaps.reshape(n, k, -1)
    max_idx = np.argmax(flat, axis=-1)  # [N, K]
    y = max_idx // w
    x = max_idx % w
    return np.stack([x, y], axis=-1).astype(np.float32)
 def benchmark_model(
    model_path: str,
    test_data: Optional[np.ndarray] = None,
    gt_keypoints: Optional[np.ndarray] = None,
    warmup: int = 10,
    runs: int = 100,
 ) -> BenchmarkResult:
    """Run full benchmark on a single model."""
    print(f"\nBenchmarking: {model_path}")
    # Load model
    session = load_model(model_path)
    provider = session.get_providers()[0]
    print(f"  Provider: {provider}")
    # Model info
    model_info = get_model_info(model_path)
    print(f"  Size: {model_info['size_mb']} MB")
    if model_info["num_parameters"]:
        print(f"  Parameters: {model_info['num_parameters']:,}")
    if model_info["estimated_flops"]:
        print(f"  Estimated FLOPs: {model_info['estimated_flops']:,}")
    # Input shape
    input_meta = session.get_inputs()[0]
    input_shape = input_meta.shape
    print(f"  Input: {input_meta.name} {input_shape} ({input_meta.type})")
    # Output shapes
    for out in session.get_outputs():
        print(f"  Output: {out.name} {out.shape}")
    # Generate or use provided test data
    if test_data is None:
        # Infer shape from model
        if input_shape and all(isinstance(d, int) for d in input_shape):
            batch = max(1, input_shape[0] if input_shape[0] > 0 else 1)
            test_data = np.random.randn(*[batch if d <= 0 else d for d in input_shape]).astype(np.float32)
        else:
            test_data = generate_synthetic_input(1)
    # Latency benchmark
    print(f"  Running {warmup} warmup + {runs} benchmark iterations...")
    latency = benchmark_latency(session, test_data, warmup=warmup, runs=runs)
    print(f"  Latency: {latency['mean_ms']:.2f} +/- {latency['std_ms']:.2f} ms")
    print(f"  P50/P95/P99: {latency['p50_ms']:.2f} / {latency['p95_ms']:.2f} / {latency['p99_ms']:.2f} ms")
    print(f"  Throughput: {latency['throughput_fps']:.1f} fps")
    # Accuracy (if ground truth provided or we can do synthetic evaluation)
    pck = None
    mpjpe = None
    num_samples = 0
    if gt_keypoints is not None and test_data is not None:
        input_name = session.get_inputs()[0].name
        all_preds = []
        for i in range(len(test_data)):
            outputs = session.run(None, {input_name: test_data[i : i + 1]})
            # Assume first output is keypoint heatmaps [1, K, H, W]
            heatmaps = outputs[0]
            if heatmaps.ndim == 4:
                kp = extract_keypoints_from_heatmaps(heatmaps)
                all_preds.append(kp[0])
        if all_preds:
            predictions = np.stack(all_preds)
            gt = gt_keypoints[: len(predictions)]
            pck = compute_pck(predictions, gt)
            distances = np.linalg.norm(predictions - gt, axis=-1)
            mpjpe = float(np.mean(distances))
            num_samples = len(predictions)
            print(f"  PCK@{PCK_THRESHOLD}: {pck:.4f}")
            print(f"  MPJPE: {mpjpe:.2f} px")
            print(f"  Samples: {num_samples}")
    result = BenchmarkResult(
        model_path=model_path,
        model_size_mb=model_info["size_mb"],
        num_parameters=model_info["num_parameters"],
        estimated_flops=model_info["estimated_flops"],
        warmup_runs=warmup,
        benchmark_runs=runs,
        latency_mean_ms=round(latency["mean_ms"], 3),
        latency_std_ms=round(latency["std_ms"], 3),
        latency_p50_ms=round(latency["p50_ms"], 3),
        latency_p95_ms=round(latency["p95_ms"], 3),
        latency_p99_ms=round(latency["p99_ms"], 3),
        throughput_fps=round(latency["throughput_fps"], 1),
        pck_at_02=round(pck, 4) if pck is not None else None,
        mean_per_joint_error=round(mpjpe, 2) if mpjpe is not None else None,
        num_test_samples=num_samples,
        input_shape=list(input_shape) if input_shape else [],
        provider=provider,
    )
    return result
 # ── Comparison table ─────────────────────────────────────────────────────────
 def print_comparison_table(results: list[BenchmarkResult]):
    """Print a formatted comparison table of multiple models."""
    if not results:
        return
    print("\n" + "=" * 100)
    print("  Model Comparison")
    print("=" * 100)
    # Header
    print(
        f"{'Model':<35} {'Size(MB)':>8} {'Params':>10} "
        f"{'Lat(ms)':>8} {'P95(ms)':>8} {'FPS':>7} {'PCK@0.2':>8}"
    )
    print("-" * 100)
    for r in results:
        name = Path(r.model_path).stem[:33]
        params = f"{r.num_parameters:,}" if r.num_parameters else "?"
        pck = f"{r.pck_at_02:.4f}" if r.pck_at_02 is not None else "N/A"
        print(
            f"{name:<35} {r.model_size_mb:>8.2f} {params:>10} "
            f"{r.latency_mean_ms:>8.2f} {r.latency_p95_ms:>8.2f} "
            f"{r.throughput_fps:>7.1f} {pck:>8}"
        )
    print("=" * 100)
    # Best model by latency
    best_latency = min(results, key=lambda r: r.latency_mean_ms)
    print(f"\n  Fastest: {Path(best_latency.model_path).stem} ({best_latency.latency_mean_ms:.2f} ms)")
    # Best by PCK (if available)
    pck_results = [r for r in results if r.pck_at_02 is not None]
    if pck_results:
        best_pck = max(pck_results, key=lambda r: r.pck_at_02)
        print(f"  Best accuracy: {Path(best_pck.model_path).stem} (PCK@0.2={best_pck.pck_at_02:.4f})")
    # Smallest model
    smallest = min(results, key=lambda r: r.model_size_mb)
    print(f"  Smallest: {Path(smallest.model_path).stem} ({smallest.model_size_mb:.2f} MB)")
 # ── Main ─────────────────────────────────────────────────────────────────────
 def main():
    parser = argparse.ArgumentParser(
        description="Benchmark WiFi-DensePose ONNX models",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--model", type=str, help="Path to a single ONNX model")
    parser.add_argument("--sweep-dir", type=str, help="Directory containing multiple ONNX models to compare")
    parser.add_argument("--test-data", type=str, help="Path to .csi.jsonl test data file")
    parser.add_argument("--synthetic", action="store_true", help="Use synthetic test data")
    parser.add_argument("--num-samples", type=int, default=100, help="Number of synthetic samples (default: 100)")
    parser.add_argument("--warmup", type=int, default=10, help="Warmup iterations (default: 10)")
    parser.add_argument("--runs", type=int, default=100, help="Benchmark iterations (default: 100)")
    parser.add_argument("--output", type=str, help="Save results to JSON file")
    parser.add_argument("--gpu", action="store_true", help="Force GPU execution provider")
    args = parser.parse_args()
    if not args.model and not args.sweep_dir:
        parser.error("Specify --model or --sweep-dir")
    # Prepare test data
    test_data = None
    gt_keypoints = None
    if args.test_data:
        print(f"Loading test data from: {args.test_data}")
        test_data = load_test_data(args.test_data)
        print(f"  Loaded {len(test_data)} windowed samples")
    elif args.synthetic:
        print(f"Generating {args.num_samples} synthetic samples...")
        test_data = generate_synthetic_input(args.num_samples)
        gt_keypoints = generate_synthetic_keypoints(args.num_samples)
        print(f"  Input shape: {test_data.shape}")
    # Collect models
    model_paths = []
    if args.model:
        model_paths.append(args.model)
    if args.sweep_dir:
        sweep = Path(args.sweep_dir)
        if sweep.is_dir():
            model_paths.extend(sorted(str(p) for p in sweep.glob("**/*.onnx")))
        else:
            # Glob pattern
            from glob import glob
            model_paths.extend(sorted(glob(str(sweep))))
    if not model_paths:
        print("ERROR: No ONNX models found.")
        sys.exit(1)
    print(f"Found {len(model_paths)} model(s) to benchmark.")
    # Benchmark each model
    results = []
    for path in model_paths:
        if not Path(path).exists():
            print(f"  Skipping (not found): {path}")
            continue
        try:
            result = benchmark_model(
                path,
                test_data=test_data,
                gt_keypoints=gt_keypoints,
                warmup=args.warmup,
                runs=args.runs,
            )
            results.append(result)
        except Exception as e:
            print(f"  ERROR benchmarking {path}: {e}")
    # Comparison table
    if len(results) > 1:
        print_comparison_table(results)
    # Save results
    if args.output:
        output_path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w") as f:
            json.dump(
                {
                    "benchmark_results": [asdict(r) for r in results],
                    "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
                    "num_models": len(results),
                },
                f,
                indent=2,
            )
        print(f"\nResults saved to: {output_path}")
    if not results:
        print("No models were successfully benchmarked.")
        sys.exit(1)
 if __name__ == "__main__":
    main()
--- a/scripts/collect-training-data.py
+++ b/scripts/collect-training-data.py
@ -0,0 +1,483 @@
 #!/usr/bin/env python3
 """
 WiFi-DensePose Training Data Collector
 Listens on UDP for CSI data from ESP32 nodes and records to .csi.jsonl
 files compatible with the Rust training pipeline (MmFiDataset / CsiDataset).
 Supports two packet formats:
  - ADR-069 feature vectors (magic 0xC5110003, 48 bytes) — 8-dim pre-extracted
  - ADR-018 raw CSI frames (magic 0xC5110001, variable) — full subcarrier data
 Usage:
    # Interactive — prompts for scenario labels
    python scripts/collect-training-data.py --port 5006
    # Scripted — fixed label, 60s per recording
    python scripts/collect-training-data.py --port 5006 --label walking --duration 60
    # Multiple scenarios in sequence
    python scripts/collect-training-data.py --port 5006 --scenarios walking,standing,sitting --duration 30
    # Dual-node collection (two ESP32s on different ports)
    python scripts/collect-training-data.py --port 5005 --port2 5006 --label walking
    # Generate manifest only from existing recordings
    python scripts/collect-training-data.py --manifest-only --output-dir data/recordings
 Prerequisites:
    - ESP32 nodes streaming CSI on UDP (see firmware/esp32-csi-node)
    - Python 3.9+
 """
 from __future__ import annotations
 import argparse
 import json
 import logging
 import os
 import socket
 import struct
 import sys
 import time
 import signal
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Optional
 logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
 )
 log = logging.getLogger("collect-data")
 # ── Packet formats (must match firmware) ─────────────────────────────────────
 # ADR-018 raw CSI frame header
 MAGIC_CSI_RAW = 0xC5110001
 # ADR-069 feature vector packet
 MAGIC_FEATURES = 0xC5110003
 FEATURE_PKT_FMT = "<IBBHq8f"
 FEATURE_PKT_SIZE = struct.calcsize(FEATURE_PKT_FMT)  # 48 bytes
 # Raw CSI header: magic(4) + node_id(1) + antenna_cfg(1) + n_sub(2) + rssi(1) + noise(1) + channel(1) + reserved(1) + timestamp_ms(4)
 RAW_CSI_HDR_FMT = "<IBBHbbBxI"
 RAW_CSI_HDR_SIZE = struct.calcsize(RAW_CSI_HDR_FMT)  # 16 bytes
 # ── Packet parsing ───────────────────────────────────────────────────────────
 def parse_packet(data: bytes) -> Optional[dict]:
    """Parse a UDP packet into a frame dict, or None if unrecognized."""
    if len(data) < 4:
        return None
    magic = struct.unpack_from("<I", data)[0]
    if magic == MAGIC_FEATURES and len(data) >= FEATURE_PKT_SIZE:
        return _parse_feature_packet(data)
    elif magic == MAGIC_CSI_RAW and len(data) >= RAW_CSI_HDR_SIZE:
        return _parse_raw_csi_packet(data)
    else:
        return None
 def _parse_feature_packet(data: bytes) -> Optional[dict]:
    """Parse ADR-069 feature vector packet (48 bytes)."""
    try:
        magic, node_id, _, seq, ts_us, *features = struct.unpack_from(FEATURE_PKT_FMT, data)
    except struct.error:
        return None
    if magic != MAGIC_FEATURES:
        return None
    # Reject NaN/inf
    import math
    if any(math.isnan(f) or math.isinf(f) for f in features):
        return None
    return {
        "type": "features",
        "node_id": node_id,
        "seq": seq,
        "timestamp_us": ts_us,
        "timestamp": ts_us / 1_000_000.0,
        "features": features,
        "subcarriers": features,  # Use features as subcarrier proxy for training
        "rssi": 0.0,
        "noise_floor": 0.0,
    }
 def _parse_raw_csi_packet(data: bytes) -> Optional[dict]:
    """Parse ADR-018 raw CSI frame with full subcarrier data."""
    try:
        magic, node_id, ant_cfg, n_sub, rssi, noise, channel, ts_ms = struct.unpack_from(
            RAW_CSI_HDR_FMT, data
        )
    except struct.error:
        return None
    if magic != MAGIC_CSI_RAW:
        return None
    # Subcarrier data follows header as int16 I/Q pairs
    payload_offset = RAW_CSI_HDR_SIZE
    expected_bytes = n_sub * 2 * 2  # n_sub * (I + Q) * int16
    if len(data) < payload_offset + expected_bytes:
        return None
    iq_data = struct.unpack_from(f"<{n_sub * 2}h", data, payload_offset)
    # Convert I/Q pairs to amplitude
    subcarriers = []
    for i in range(0, len(iq_data), 2):
        real, imag = iq_data[i], iq_data[i + 1]
        amplitude = (real ** 2 + imag ** 2) ** 0.5
        subcarriers.append(amplitude)
    return {
        "type": "raw_csi",
        "node_id": node_id,
        "antenna_config": ant_cfg,
        "n_subcarriers": n_sub,
        "channel": channel,
        "timestamp": ts_ms / 1000.0,
        "subcarriers": subcarriers,
        "rssi": float(rssi),
        "noise_floor": float(noise),
    }
 # ── JSONL recording ──────────────────────────────────────────────────────────
 class CsiRecorder:
    """Records CSI frames to .csi.jsonl files compatible with the Rust pipeline."""
    def __init__(self, output_dir: str, session_name: str, label: Optional[str] = None):
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
        safe_name = session_name.replace(" ", "_").replace("/", "_")
        self.session_id = f"{safe_name}-{ts}"
        self.label = label
        self.file_path = self.output_dir / f"{self.session_id}.csi.jsonl"
        self.meta_path = self.output_dir / f"{self.session_id}.csi.meta.json"
        self.frame_count = 0
        self.start_time = time.time()
        self.started_at = datetime.now(timezone.utc).isoformat()
        self._file = None
    def open(self):
        self._file = open(self.file_path, "a", encoding="utf-8")
        log.info(f"Recording to: {self.file_path}")
    def write_frame(self, frame: dict):
        """Write a single frame as a JSONL line."""
        if self._file is None:
            return
        record = {
            "timestamp": frame.get("timestamp", time.time()),
            "subcarriers": frame.get("subcarriers", []),
            "rssi": frame.get("rssi", 0.0),
            "noise_floor": frame.get("noise_floor", 0.0),
            "features": {
                k: v for k, v in frame.items()
                if k not in ("timestamp", "subcarriers", "rssi", "noise_floor", "type")
            },
        }
        line = json.dumps(record, separators=(",", ":"))
        self._file.write(line + "\n")
        self.frame_count += 1
        if self.frame_count % 500 == 0:
            self._file.flush()
    def close(self) -> dict:
        """Close the recording and write metadata. Returns session info."""
        if self._file:
            self._file.flush()
            self._file.close()
            self._file = None
        ended_at = datetime.now(timezone.utc).isoformat()
        elapsed = time.time() - self.start_time
        file_size = self.file_path.stat().st_size if self.file_path.exists() else 0
        meta = {
            "id": self.session_id,
            "name": self.session_id,
            "label": self.label,
            "started_at": self.started_at,
            "ended_at": ended_at,
            "duration_secs": round(elapsed, 2),
            "frame_count": self.frame_count,
            "file_size_bytes": file_size,
            "file_path": str(self.file_path),
            "fps": round(self.frame_count / elapsed, 1) if elapsed > 0 else 0,
        }
        with open(self.meta_path, "w", encoding="utf-8") as f:
            json.dump(meta, f, indent=2)
        log.info(
            f"Recording stopped: {self.frame_count} frames in {elapsed:.1f}s "
            f"({meta['fps']} fps, {file_size / 1024:.1f} KB)"
        )
        return meta
 # ── Manifest generation ──────────────────────────────────────────────────────
 def generate_manifest(output_dir: str) -> dict:
    """Scan recordings directory and generate a dataset manifest JSON."""
    rec_dir = Path(output_dir)
    sessions = []
    for meta_file in sorted(rec_dir.glob("*.csi.meta.json")):
        try:
            with open(meta_file, "r") as f:
                meta = json.load(f)
            sessions.append(meta)
        except (json.JSONDecodeError, OSError) as e:
            log.warning(f"Skipping {meta_file}: {e}")
    # Aggregate stats
    total_frames = sum(s.get("frame_count", 0) for s in sessions)
    total_bytes = sum(s.get("file_size_bytes", 0) for s in sessions)
    labels = sorted(set(s.get("label", "unlabeled") or "unlabeled" for s in sessions))
    manifest = {
        "dataset": "wifi-densepose-csi",
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "directory": str(rec_dir),
        "num_sessions": len(sessions),
        "total_frames": total_frames,
        "total_size_bytes": total_bytes,
        "total_size_mb": round(total_bytes / (1024 * 1024), 2),
        "labels": labels,
        "sessions": sessions,
    }
    manifest_path = rec_dir / "manifest.json"
    with open(manifest_path, "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2)
    log.info(
        f"Manifest: {len(sessions)} sessions, {total_frames} frames, "
        f"{manifest['total_size_mb']} MB, labels={labels}"
    )
    log.info(f"Written to: {manifest_path}")
    return manifest
 # ── UDP listener ─────────────────────────────────────────────────────────────
 def collect_session(
    port: int,
    port2: Optional[int],
    output_dir: str,
    label: str,
    duration: float,
    session_name: Optional[str] = None,
 ) -> dict:
    """Run a single collection session. Returns session metadata."""
    name = session_name or label or "session"
    recorder = CsiRecorder(output_dir, name, label)
    recorder.open()
    # Bind primary socket
    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    sock.bind(("0.0.0.0", port))
    sock.settimeout(1.0)
    sockets = [sock]
    # Bind secondary socket if specified
    if port2:
        sock2 = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        sock2.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        sock2.bind(("0.0.0.0", port2))
        sock2.settimeout(0.1)
        sockets.append(sock2)
    log.info(
        f"Collecting '{label}' for {duration}s on port(s) "
        f"{port}{f', {port2}' if port2 else ''}"
    )
    start = time.time()
    dropped = 0
    try:
        while time.time() - start < duration:
            for s in sockets:
                try:
                    data, addr = s.recvfrom(4096)
                except socket.timeout:
                    continue
                frame = parse_packet(data)
                if frame:
                    recorder.write_frame(frame)
                else:
                    dropped += 1
            # Progress update every 5s
            elapsed = time.time() - start
            if recorder.frame_count > 0 and int(elapsed) % 5 == 0 and int(elapsed) > 0:
                remaining = duration - elapsed
                if remaining > 0 and int(elapsed * 10) % 50 == 0:
                    log.info(
                        f"  {recorder.frame_count} frames collected, "
                        f"{remaining:.0f}s remaining..."
                    )
    except KeyboardInterrupt:
        log.info("Interrupted by user.")
    finally:
        for s in sockets:
            s.close()
    if dropped > 0:
        log.warning(f"  {dropped} unrecognized packets dropped")
    return recorder.close()
 # ── Main ─────────────────────────────────────────────────────────────────────
 def main():
    parser = argparse.ArgumentParser(
        description="Collect CSI training data from ESP32 nodes via UDP",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
 Examples:
  # Interactive label input
  python scripts/collect-training-data.py --port 5006
  # Fixed label, 60 seconds
  python scripts/collect-training-data.py --port 5006 --label walking --duration 60
  # Multiple scenarios
  python scripts/collect-training-data.py --port 5006 --scenarios walking,standing,sitting --duration 30
  # Dual ESP32 nodes
  python scripts/collect-training-data.py --port 5005 --port2 5006 --label test
  # Generate manifest from existing recordings
  python scripts/collect-training-data.py --manifest-only
 """,
    )
    parser.add_argument("--port", type=int, default=5006, help="Primary UDP port (default: 5006)")
    parser.add_argument("--port2", type=int, default=None, help="Secondary UDP port for dual-node")
    parser.add_argument("--output-dir", default="data/recordings", help="Output directory (default: data/recordings)")
    parser.add_argument("--label", default=None, help="Activity label for the recording")
    parser.add_argument("--duration", type=float, default=30.0, help="Recording duration in seconds (default: 30)")
    parser.add_argument("--scenarios", default=None, help="Comma-separated list of scenarios to record sequentially")
    parser.add_argument("--pause", type=float, default=5.0, help="Pause between scenarios in seconds (default: 5)")
    parser.add_argument("--manifest-only", action="store_true", help="Only generate manifest from existing recordings")
    parser.add_argument("--repeats", type=int, default=1, help="Number of repeats per scenario (default: 1)")
    args = parser.parse_args()
    # Manifest-only mode
    if args.manifest_only:
        generate_manifest(args.output_dir)
        return
    # Collect scenarios
    all_sessions = []
    if args.scenarios:
        # Multi-scenario sequential collection
        scenarios = [s.strip() for s in args.scenarios.split(",") if s.strip()]
        total = len(scenarios) * args.repeats
        idx = 0
        for repeat in range(args.repeats):
            for scenario in scenarios:
                idx += 1
                print(f"\n{'='*60}")
                print(f"  Scenario {idx}/{total}: '{scenario}' (repeat {repeat+1}/{args.repeats})")
                print(f"  Duration: {args.duration}s")
                print(f"{'='*60}")
                if idx > 1:
                    print(f"  Starting in {args.pause}s... (get into position)")
                    time.sleep(args.pause)
                meta = collect_session(
                    port=args.port,
                    port2=args.port2,
                    output_dir=args.output_dir,
                    label=scenario,
                    duration=args.duration,
                    session_name=f"{scenario}_r{repeat+1:02d}",
                )
                all_sessions.append(meta)
    elif args.label:
        # Single labeled recording
        meta = collect_session(
            port=args.port,
            port2=args.port2,
            output_dir=args.output_dir,
            label=args.label,
            duration=args.duration,
        )
        all_sessions.append(meta)
    else:
        # Interactive mode — prompt for labels
        print("\nInteractive data collection mode.")
        print("Type a label for each recording, or 'q' to quit.\n")
        while True:
            label = input("Label (or 'q' to quit): ").strip()
            if label.lower() in ("q", "quit", "exit"):
                break
            if not label:
                print("  Empty label. Try again.")
                continue
            duration = args.duration
            try:
                dur_input = input(f"Duration in seconds [{duration}]: ").strip()
                if dur_input:
                    duration = float(dur_input)
            except ValueError:
                pass
            print(f"  Recording '{label}' for {duration}s — starting now...")
            meta = collect_session(
                port=args.port,
                port2=args.port2,
                output_dir=args.output_dir,
                label=label,
                duration=duration,
            )
            all_sessions.append(meta)
            print()
    # Generate manifest
    if all_sessions:
        print(f"\nCollected {len(all_sessions)} session(s).")
        manifest = generate_manifest(args.output_dir)
        total_frames = sum(s.get("frame_count", 0) for s in all_sessions)
        print(f"\nSummary:")
        print(f"  Sessions: {len(all_sessions)}")
        print(f"  Total frames: {total_frames}")
        print(f"  Output: {args.output_dir}/")
        print(f"  Manifest: {args.output_dir}/manifest.json")
    else:
        print("No sessions recorded.")
 if __name__ == "__main__":
    main()
--- a/scripts/gcloud-train.sh
+++ b/scripts/gcloud-train.sh
@ -0,0 +1,469 @@
 #!/bin/bash
 # ==============================================================================
 # GCloud GPU Training Script for WiFi-DensePose
 # ==============================================================================
 #
 # Creates a GCloud VM with GPU, runs the Rust training pipeline, downloads
 # the trained model artifacts, and tears down the VM to avoid ongoing costs.
 #
 # Usage:
 #   bash scripts/gcloud-train.sh [OPTIONS]
 #
 # Options:
 #   --gpu        l4|a100|h100       GPU type (default: l4)
 #   --zone       ZONE               GCloud zone (default: us-central1-a)
 #   --hours      N                  Max VM lifetime in hours (default: 2)
 #   --config     FILE               Training config JSON (default: scripts/training-config-sweep.json entry 0)
 #   --data-dir   DIR                Local data directory to upload (default: data/recordings)
 #   --dry-run                       Run smoke test with synthetic data
 #   --sweep                         Run full hyperparameter sweep (all configs)
 #   --keep-vm                       Do not delete VM after training
 #   --instance   NAME               Custom VM instance name
 #
 # Prerequisites:
 #   - gcloud CLI authenticated: gcloud auth login
 #   - Project set: gcloud config set project cognitum-20260110
 #   - Quota for GPUs in the selected zone
 #
 # Cost estimates:
 #   L4 (~$0.80/hr) — good for prototyping and small sweeps
 #   A100 40GB (~$3.60/hr) — full training runs
 #   H100 80GB (~$11.00/hr) — large batch / fast iteration
 # ==============================================================================
 set -euo pipefail
 # ── Defaults ──────────────────────────────────────────────────────────────────
 PROJECT="cognitum-20260110"
 GPU_TYPE="l4"
 ZONE="us-central1-a"
 MAX_HOURS=2
 CONFIG_FILE=""
 DATA_DIR="data/recordings"
 DRY_RUN=false
 SWEEP=false
 KEEP_VM=false
 INSTANCE_NAME=""
 REPO_URL="https://github.com/ruvnet/wifi-densepose.git"
 BRANCH="main"
 # ── Parse arguments ───────────────────────────────────────────────────────────
 while [[ $# -gt 0 ]]; do
    case "$1" in
        --gpu)       GPU_TYPE="$2";      shift 2 ;;
        --zone)      ZONE="$2";          shift 2 ;;
        --hours)     MAX_HOURS="$2";     shift 2 ;;
        --config)    CONFIG_FILE="$2";   shift 2 ;;
        --data-dir)  DATA_DIR="$2";      shift 2 ;;
        --dry-run)   DRY_RUN=true;       shift   ;;
        --sweep)     SWEEP=true;         shift   ;;
        --keep-vm)   KEEP_VM=true;       shift   ;;
        --instance)  INSTANCE_NAME="$2"; shift 2 ;;
        --branch)    BRANCH="$2";        shift 2 ;;
        -h|--help)
            head -35 "$0" | tail -30
            exit 0
            ;;
        *)
            echo "ERROR: Unknown option: $1"
            exit 1
            ;;
    esac
 done
 # ── GPU configuration map ────────────────────────────────────────────────────
 declare -A GPU_ACCELERATOR=(
    [l4]="nvidia-l4"
    [a100]="nvidia-tesla-a100"
    [h100]="nvidia-h100-80gb"
 )
 declare -A GPU_MACHINE_TYPE=(
    [l4]="g2-standard-8"
    [a100]="a2-highgpu-1g"
    [h100]="a3-highgpu-1g"
 )
 declare -A GPU_BOOT_DISK=(
    [l4]="200"
    [a100]="300"
    [h100]="300"
 )
 if [[ -z "${GPU_ACCELERATOR[$GPU_TYPE]+x}" ]]; then
    echo "ERROR: Unknown GPU type '$GPU_TYPE'. Choose: l4, a100, h100"
    exit 1
 fi
 ACCELERATOR="${GPU_ACCELERATOR[$GPU_TYPE]}"
 MACHINE_TYPE="${GPU_MACHINE_TYPE[$GPU_TYPE]}"
 BOOT_DISK_GB="${GPU_BOOT_DISK[$GPU_TYPE]}"
 # ── Instance naming ──────────────────────────────────────────────────────────
 TIMESTAMP=$(date +%Y%m%d-%H%M%S)
 if [[ -z "$INSTANCE_NAME" ]]; then
    INSTANCE_NAME="wdp-train-${GPU_TYPE}-${TIMESTAMP}"
 fi
 # ── Announce plan ────────────────────────────────────────────────────────────
 echo "============================================================"
 echo "  WiFi-DensePose GCloud GPU Training"
 echo "============================================================"
 echo "  Project:      $PROJECT"
 echo "  Instance:     $INSTANCE_NAME"
 echo "  Zone:         $ZONE"
 echo "  GPU:          $GPU_TYPE ($ACCELERATOR)"
 echo "  Machine:      $MACHINE_TYPE"
 echo "  Boot disk:    ${BOOT_DISK_GB}GB"
 echo "  Max runtime:  ${MAX_HOURS}h"
 echo "  Data dir:     $DATA_DIR"
 echo "  Dry run:      $DRY_RUN"
 echo "  Sweep:        $SWEEP"
 echo "  Branch:       $BRANCH"
 echo "============================================================"
 echo ""
 # ── Verify gcloud auth ──────────────────────────────────────────────────────
 if ! gcloud auth list --filter=status:ACTIVE --format="value(account)" 2>/dev/null | head -1 | grep -q '@'; then
    echo "ERROR: No active gcloud account. Run: gcloud auth login"
    exit 1
 fi
 gcloud config set project "$PROJECT" --quiet
 # ── Build startup script ─────────────────────────────────────────────────────
 STARTUP_SCRIPT=$(cat <<'STARTUP_EOF'
 #!/bin/bash
 set -euo pipefail
 exec > /var/log/wdp-setup.log 2>&1
 echo "=== WiFi-DensePose GPU VM Setup ==="
 echo "Started: $(date)"
 # Wait for GPU driver
 echo "Waiting for NVIDIA driver..."
 for i in $(seq 1 60); do
    if nvidia-smi &>/dev/null; then
        echo "GPU ready after ${i}s"
        nvidia-smi
        break
    fi
    sleep 5
 done
 if ! nvidia-smi &>/dev/null; then
    echo "ERROR: GPU driver not available after 300s"
    exit 1
 fi
 # Install Rust toolchain
 echo "Installing Rust toolchain..."
 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable
 source "$HOME/.cargo/env"
 rustc --version
 cargo --version
 # Install system dependencies
 echo "Installing system dependencies..."
 apt-get update -qq
 apt-get install -y -qq pkg-config libssl-dev cmake clang
 # Find libtorch from the Deep Learning VM's PyTorch installation
 echo "Locating libtorch..."
 PYTORCH_LIB=$(python3 -c "import torch; print(torch.__path__[0] + '/lib')" 2>/dev/null || echo "")
 if [[ -n "$PYTORCH_LIB" && -d "$PYTORCH_LIB" ]]; then
    export LIBTORCH="$PYTORCH_LIB"
    export LD_LIBRARY_PATH="${LIBTORCH}:${LD_LIBRARY_PATH:-}"
    echo "Found libtorch at: $LIBTORCH"
 else
    echo "WARNING: PyTorch not found in system Python. Installing via pip..."
    pip3 install torch --index-url https://download.pytorch.org/whl/cu121
    PYTORCH_LIB=$(python3 -c "import torch; print(torch.__path__[0] + '/lib')")
    export LIBTORCH="$PYTORCH_LIB"
    export LD_LIBRARY_PATH="${LIBTORCH}:${LD_LIBRARY_PATH:-}"
 fi
 # Persist env vars
 cat >> /etc/environment <<ENV_VARS
 LIBTORCH=$LIBTORCH
 LD_LIBRARY_PATH=$LIBTORCH:\$LD_LIBRARY_PATH
 PATH=$HOME/.cargo/bin:\$PATH
 ENV_VARS
 echo "=== Setup complete: $(date) ==="
 touch /tmp/wdp-setup-done
 STARTUP_EOF
 )
 # ── Step 1: Create the VM ────────────────────────────────────────────────────
 echo "[1/7] Creating VM instance: $INSTANCE_NAME ..."
 gcloud compute instances create "$INSTANCE_NAME" \
    --project="$PROJECT" \
    --zone="$ZONE" \
    --machine-type="$MACHINE_TYPE" \
    --accelerator="type=$ACCELERATOR,count=1" \
    --image-family="common-cu121-ubuntu-2204" \
    --image-project="deeplearning-platform-release" \
    --boot-disk-size="${BOOT_DISK_GB}GB" \
    --boot-disk-type="pd-ssd" \
    --maintenance-policy=TERMINATE \
    --metadata="install-nvidia-driver=True" \
    --metadata-from-file="startup-script=<(echo "$STARTUP_SCRIPT")" \
    --scopes="default,storage-rw" \
    --labels="purpose=wdp-training,gpu=${GPU_TYPE}" \
    --quiet
 echo "  VM created. Waiting for startup script to complete..."
 # ── Step 2: Wait for setup ───────────────────────────────────────────────────
 echo "[2/7] Waiting for setup to complete (GPU driver + Rust toolchain)..."
 for i in $(seq 1 60); do
    if gcloud compute ssh "$INSTANCE_NAME" --zone="$ZONE" --command="test -f /tmp/wdp-setup-done" --quiet 2>/dev/null; then
        echo "  Setup complete after $((i * 15))s"
        break
    fi
    if [[ $i -eq 60 ]]; then
        echo "ERROR: Setup timed out after 15 minutes."
        echo "Check logs: gcloud compute ssh $INSTANCE_NAME --zone=$ZONE --command='cat /var/log/wdp-setup.log'"
        if [[ "$KEEP_VM" == "false" ]]; then
            echo "Cleaning up VM..."
            gcloud compute instances delete "$INSTANCE_NAME" --zone="$ZONE" --quiet
        fi
        exit 1
    fi
    sleep 15
 done
 # ── Step 3: Clone repo and build ─────────────────────────────────────────────
 echo "[3/7] Cloning repository and building training binary..."
 gcloud compute ssh "$INSTANCE_NAME" --zone="$ZONE" --command="$(cat <<CLONE_EOF
 set -euo pipefail
 source \$HOME/.cargo/env
 # Clone the repo
 if [[ ! -d ~/wifi-densepose ]]; then
    git clone --depth 1 --branch "$BRANCH" "$REPO_URL" ~/wifi-densepose
 fi
 # Set libtorch environment
 export LIBTORCH=\$(python3 -c "import torch; print(torch.__path__[0] + '/lib')")
 export LD_LIBRARY_PATH="\${LIBTORCH}:\${LD_LIBRARY_PATH:-}"
 # Build the training binary with tch-backend
 cd ~/wifi-densepose/rust-port/wifi-densepose-rs
 echo "Building with LIBTORCH=\$LIBTORCH ..."
 cargo build --release --features tch-backend --bin train 2>&1 | tail -5
 echo "Build complete."
 ls -lh target/release/train
 CLONE_EOF
 )"
 # ── Step 4: Upload training data ─────────────────────────────────────────────
 echo "[4/7] Uploading training data..."
 if [[ -d "$DATA_DIR" ]] && [[ "$(ls -A "$DATA_DIR" 2>/dev/null)" ]]; then
    # Create a tarball of the data directory
    DATA_TAR="/tmp/wdp-training-data-${TIMESTAMP}.tar.gz"
    tar czf "$DATA_TAR" -C "$(dirname "$DATA_DIR")" "$(basename "$DATA_DIR")"
    DATA_SIZE=$(du -h "$DATA_TAR" | cut -f1)
    echo "  Uploading ${DATA_SIZE} of training data..."
    gcloud compute scp "$DATA_TAR" "${INSTANCE_NAME}:~/training-data.tar.gz" --zone="$ZONE" --quiet
    gcloud compute ssh "$INSTANCE_NAME" --zone="$ZONE" --command="
        mkdir -p ~/wifi-densepose/data
        tar xzf ~/training-data.tar.gz -C ~/wifi-densepose/data/
        echo 'Data extracted:'
        find ~/wifi-densepose/data -name '*.jsonl' -o -name '*.csi.jsonl' | head -20
    "
    rm -f "$DATA_TAR"
 else
    echo "  No local data at '$DATA_DIR'. Training will use --dry-run or MM-Fi."
    if [[ "$DRY_RUN" == "false" && "$SWEEP" == "false" ]]; then
        echo "  WARNING: No data and --dry-run not set. Forcing --dry-run."
        DRY_RUN=true
    fi
 fi
 # ── Step 5: Upload config and run training ────────────────────────────────────
 echo "[5/7] Running training..."
 # Upload sweep config if doing a sweep
 if [[ "$SWEEP" == "true" ]]; then
    SWEEP_FILE="scripts/training-config-sweep.json"
    if [[ -f "$SWEEP_FILE" ]]; then
        gcloud compute scp "$SWEEP_FILE" "${INSTANCE_NAME}:~/sweep-configs.json" --zone="$ZONE" --quiet
    else
        echo "ERROR: Sweep config not found at $SWEEP_FILE"
        exit 1
    fi
 fi
 # Upload single config if specified
 if [[ -n "$CONFIG_FILE" ]]; then
    gcloud compute scp "$CONFIG_FILE" "${INSTANCE_NAME}:~/train-config.json" --zone="$ZONE" --quiet
 fi
 # Build the training command
 TRAIN_CMD_BASE="
 set -euo pipefail
 source \$HOME/.cargo/env
 export LIBTORCH=\$(python3 -c \"import torch; print(torch.__path__[0] + '/lib')\")
 export LD_LIBRARY_PATH=\"\${LIBTORCH}:\${LD_LIBRARY_PATH:-}\"
 cd ~/wifi-densepose/rust-port/wifi-densepose-rs
 # Set auto-shutdown timer (safety net)
 sudo shutdown -P +$((MAX_HOURS * 60)) &
 TRAIN_BIN=./target/release/train
 "
 if [[ "$SWEEP" == "true" ]]; then
    # Run all configs in the sweep file
    gcloud compute ssh "$INSTANCE_NAME" --zone="$ZONE" --command="$(cat <<SWEEP_EOF
 $TRAIN_CMD_BASE
 echo "=== Hyperparameter Sweep ==="
 SWEEP_FILE=~/sweep-configs.json
 NUM_CONFIGS=\$(python3 -c "import json; print(len(json.load(open('\$SWEEP_FILE'))['configs']))")
 echo "Running \$NUM_CONFIGS configurations..."
 mkdir -p ~/results
 for i in \$(seq 0 \$((NUM_CONFIGS - 1))); do
    echo ""
    echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    echo "  Config \$((i+1)) / \$NUM_CONFIGS"
    echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    # Extract single config to temp file
    python3 -c "
 import json, sys
 sweep = json.load(open('\$SWEEP_FILE'))
 cfg = sweep['configs'][\$i]
 # Merge with base config
 base = sweep.get('base', {})
 merged = {**base, **cfg}
 # Set checkpoint dir per config
 merged['checkpoint_dir'] = f'checkpoints/sweep_{i:02d}'
 merged['log_dir'] = f'logs/sweep_{i:02d}'
 json.dump(merged, open('/tmp/sweep_config_\${i}.json', 'w'), indent=2)
 print(f\"Config \${i}: lr={merged.get('learning_rate', '?')}, bs={merged.get('batch_size', '?')}, bb={merged.get('backbone_channels', '?')}\")
 "
    START_TIME=\$(date +%s)
    \$TRAIN_BIN --config /tmp/sweep_config_\${i}.json --cuda $( [[ "$DRY_RUN" == "true" ]] && echo "--dry-run" ) 2>&1 | tee ~/results/sweep_\${i}.log || true
    END_TIME=\$(date +%s)
    ELAPSED=\$(( END_TIME - START_TIME ))
    echo "  Completed in \${ELAPSED}s"
 done
 echo ""
 echo "=== Sweep Complete ==="
 echo "Results in ~/results/"
 ls -lh ~/results/
 SWEEP_EOF
 )"
 elif [[ -n "$CONFIG_FILE" ]]; then
    # Single config run
    gcloud compute ssh "$INSTANCE_NAME" --zone="$ZONE" --command="$(cat <<SINGLE_EOF
 $TRAIN_CMD_BASE
 echo "=== Training with custom config ==="
 \$TRAIN_BIN --config ~/train-config.json --cuda $( [[ "$DRY_RUN" == "true" ]] && echo "--dry-run" ) 2>&1 | tee ~/train.log
 SINGLE_EOF
 )"
 else
    # Default config run
    gcloud compute ssh "$INSTANCE_NAME" --zone="$ZONE" --command="$(cat <<DEFAULT_EOF
 $TRAIN_CMD_BASE
 echo "=== Training with default config ==="
 \$TRAIN_BIN --cuda $( [[ "$DRY_RUN" == "true" ]] && echo "--dry-run --dry-run-samples 256" ) 2>&1 | tee ~/train.log
 DEFAULT_EOF
 )"
 fi
 # ── Step 6: Download results ─────────────────────────────────────────────────
 echo "[6/7] Downloading trained model artifacts..."
 LOCAL_RESULTS="training-results/${INSTANCE_NAME}"
 mkdir -p "$LOCAL_RESULTS"
 # Package results on the VM
 gcloud compute ssh "$INSTANCE_NAME" --zone="$ZONE" --command="
 cd ~/wifi-densepose/rust-port/wifi-densepose-rs
 tar czf ~/training-artifacts.tar.gz \
    checkpoints/ \
    logs/ \
    2>/dev/null || true
 # Also grab sweep results if they exist
 if [[ -d ~/results ]]; then
    tar czf ~/sweep-results.tar.gz -C ~ results/ 2>/dev/null || true
 fi
 ls -lh ~/training-artifacts.tar.gz ~/sweep-results.tar.gz 2>/dev/null || true
 "
 # Download artifacts
 gcloud compute scp "${INSTANCE_NAME}:~/training-artifacts.tar.gz" \
    "${LOCAL_RESULTS}/training-artifacts.tar.gz" --zone="$ZONE" --quiet 2>/dev/null || true
 if [[ "$SWEEP" == "true" ]]; then
    gcloud compute scp "${INSTANCE_NAME}:~/sweep-results.tar.gz" \
        "${LOCAL_RESULTS}/sweep-results.tar.gz" --zone="$ZONE" --quiet 2>/dev/null || true
 fi
 # Download training log
 gcloud compute scp "${INSTANCE_NAME}:~/train.log" \
    "${LOCAL_RESULTS}/train.log" --zone="$ZONE" --quiet 2>/dev/null || true
 # Extract locally
 if [[ -f "${LOCAL_RESULTS}/training-artifacts.tar.gz" ]]; then
    tar xzf "${LOCAL_RESULTS}/training-artifacts.tar.gz" -C "$LOCAL_RESULTS/"
    echo "  Artifacts extracted to: $LOCAL_RESULTS/"
    find "$LOCAL_RESULTS" -name "*.pt" -o -name "*.onnx" -o -name "*.rvf" 2>/dev/null | head -20
 fi
 # ── Step 7: Cleanup ──────────────────────────────────────────────────────────
 if [[ "$KEEP_VM" == "true" ]]; then
    echo "[7/7] Keeping VM alive (--keep-vm). Remember to delete it manually:"
    echo "  gcloud compute instances delete $INSTANCE_NAME --zone=$ZONE --quiet"
    echo "  SSH: gcloud compute ssh $INSTANCE_NAME --zone=$ZONE"
 else
    echo "[7/7] Deleting VM to avoid ongoing costs..."
    gcloud compute instances delete "$INSTANCE_NAME" --zone="$ZONE" --quiet
    echo "  VM deleted."
 fi
 # ── Summary ──────────────────────────────────────────────────────────────────
 echo ""
 echo "============================================================"
 echo "  Training Complete"
 echo "============================================================"
 echo "  Results:  $LOCAL_RESULTS/"
 echo "  GPU:      $GPU_TYPE ($ZONE)"
 echo "  Instance: $INSTANCE_NAME"
 if [[ "$KEEP_VM" == "true" ]]; then
    echo "  VM:       STILL RUNNING (delete manually!)"
 fi
 echo "============================================================"
--- a/scripts/training-config-sweep.json
+++ b/scripts/training-config-sweep.json
@ -0,0 +1,155 @@
 {
  "description": "WiFi-DensePose hyperparameter sweep — 10 configurations exploring learning rate, batch size, backbone width, window length, loss ratios, and warmup schedules.",
  "base": {
    "num_subcarriers": 56,
    "native_subcarriers": 114,
    "num_antennas_tx": 3,
    "num_antennas_rx": 3,
    "heatmap_size": 56,
    "num_keypoints": 17,
    "num_body_parts": 24,
    "weight_decay": 1e-4,
    "num_epochs": 50,
    "lr_gamma": 0.1,
    "grad_clip_norm": 1.0,
    "val_every_epochs": 1,
    "early_stopping_patience": 10,
    "save_top_k": 3,
    "use_gpu": true,
    "gpu_device_id": 0,
    "num_workers": 4,
    "seed": 42
  },
  "configs": [
    {
      "_name": "baseline",
      "_description": "Default config — reference baseline",
      "learning_rate": 1e-3,
      "batch_size": 8,
      "backbone_channels": 256,
      "window_frames": 100,
      "warmup_epochs": 5,
      "lr_milestones": [30, 45],
      "lambda_kp": 0.3,
      "lambda_dp": 0.6,
      "lambda_tr": 0.1
    },
    {
      "_name": "low_lr_large_batch",
      "_description": "Lower LR with larger batch — stable convergence",
      "learning_rate": 1e-4,
      "batch_size": 16,
      "backbone_channels": 256,
      "window_frames": 100,
      "warmup_epochs": 10,
      "lr_milestones": [30, 45],
      "lambda_kp": 0.3,
      "lambda_dp": 0.6,
      "lambda_tr": 0.1
    },
    {
      "_name": "high_lr_small_batch",
      "_description": "Higher LR with small batch — fast exploration",
      "learning_rate": 2e-3,
      "batch_size": 4,
      "backbone_channels": 256,
      "window_frames": 100,
      "warmup_epochs": 3,
      "lr_milestones": [20, 40],
      "lambda_kp": 0.3,
      "lambda_dp": 0.6,
      "lambda_tr": 0.1
    },
    {
      "_name": "narrow_backbone",
      "_description": "128-channel backbone — faster training, lower VRAM",
      "learning_rate": 1e-3,
      "batch_size": 16,
      "backbone_channels": 128,
      "window_frames": 100,
      "warmup_epochs": 5,
      "lr_milestones": [30, 45],
      "lambda_kp": 0.3,
      "lambda_dp": 0.6,
      "lambda_tr": 0.1
    },
    {
      "_name": "short_window",
      "_description": "50-frame window — lower latency, tests temporal sensitivity",
      "learning_rate": 5e-4,
      "batch_size": 16,
      "backbone_channels": 256,
      "window_frames": 50,
      "warmup_epochs": 5,
      "lr_milestones": [30, 45],
      "lambda_kp": 0.3,
      "lambda_dp": 0.6,
      "lambda_tr": 0.1
    },
    {
      "_name": "keypoint_heavy",
      "_description": "Heavier keypoint loss — prioritize skeleton accuracy",
      "learning_rate": 5e-4,
      "batch_size": 8,
      "backbone_channels": 256,
      "window_frames": 100,
      "warmup_epochs": 5,
      "lr_milestones": [30, 45],
      "lambda_kp": 0.5,
      "lambda_dp": 0.4,
      "lambda_tr": 0.1
    },
    {
      "_name": "contrastive_heavy",
      "_description": "Strong contrastive/transfer loss — self-supervised pretraining focus",
      "learning_rate": 5e-4,
      "batch_size": 8,
      "backbone_channels": 256,
      "window_frames": 100,
      "warmup_epochs": 10,
      "lr_milestones": [30, 45],
      "lambda_kp": 0.2,
      "lambda_dp": 0.3,
      "lambda_tr": 0.5
    },
    {
      "_name": "wide_backbone_long_warmup",
      "_description": "256-ch backbone + long warmup + moderate LR",
      "learning_rate": 5e-4,
      "batch_size": 8,
      "backbone_channels": 256,
      "window_frames": 100,
      "warmup_epochs": 10,
      "lr_milestones": [35, 48],
      "lambda_kp": 0.3,
      "lambda_dp": 0.6,
      "lambda_tr": 0.1
    },
    {
      "_name": "narrow_short_aggressive",
      "_description": "128-ch + 50-frame + high LR — fast cheap exploration",
      "learning_rate": 2e-3,
      "batch_size": 16,
      "backbone_channels": 128,
      "window_frames": 50,
      "warmup_epochs": 3,
      "lr_milestones": [20, 40],
      "lambda_kp": 0.4,
      "lambda_dp": 0.5,
      "lambda_tr": 0.1
    },
    {
      "_name": "balanced_medium",
      "_description": "Balanced loss, medium LR, medium batch — robust default",
      "learning_rate": 5e-4,
      "batch_size": 8,
      "backbone_channels": 256,
      "window_frames": 100,
      "warmup_epochs": 5,
      "lr_milestones": [25, 40],
      "lambda_kp": 0.35,
      "lambda_dp": 0.45,
      "lambda_tr": 0.2
    }
  ]
 }