diff --git a/aether-arena/calibration/README.md b/aether-arena/calibration/README.md new file mode 100644 index 00000000..6801f967 --- /dev/null +++ b/aether-arena/calibration/README.md @@ -0,0 +1,68 @@ +# RuView Calibration Service (reference implementation) + +Turn a **shared WiFi-CSI pose base model** into a room-specific one with a **30-second labeled +calibration** and a **~11 KB per-room LoRA adapter**. This is the deployable resolution of the +cross-subject / cross-environment generalization problem (full study: [ADR-150 §3.3–3.6](../../docs/adr/ADR-150-rf-foundation-encoder.md)). + +## Why + +Zero-shot WiFi pose generalizes poorly to a **new room or new person** — an unseen room can drop a +strong model to near-random. But that gap is **not** algorithmically closeable (CORAL, DANN, +instance-norm, contrastive foundation-pretraining all failed) and **not** closeable by collecting +more subjects (saturates ~64%). It **is** closeable, cheaply, at deployment time: a handful of +labeled frames from the actual room pin down its multipath instantly. + +| Deployment case | Zero-shot | + in-room calibration | +|-----------------|----------:|----------------------:| +| Same room, new person (cross-subject) | 64% | **76%** (200 samples) | +| **New room + new person (cross-environment)** | **~10%** | **60% @ 5 samples → 73% @ 200** | + +**Verified demo (this code, source-only base on an unseen MM-Fi room E04):** +`zero-shot 3.09% → after 200-sample calibration 74.29%` (+71 pts). + +## How it works + +A frozen shared **base** (transformer + temporal attention pool + skeleton-graph head, the published +[`ruvnet/wifi-densepose-mmfi-pose`](https://huggingface.co/ruvnet/wifi-densepose-mmfi-pose)) plus a +tiny **LoRA adapter** (rank 8 on the input projection + pose head — **11,200 params ≈ 11 KB int8 / +22 KB fp16**) fitted per room. Thousands of room-adapters hang off one base. 
+ +## Usage + +```bash +# 1) Capture a short labeled clip in the deployment room -> calib.npz {X:[N,3,114,10], Y:[N,17,2]} +# (~100–200 samples recommended; below ~20 the adapter can underperform zero-shot) + +# 2) Fit the per-room adapter (~11 KB): +python calibrate.py --base pose_mmfi_best.pt --data calib.npz --out room.adapter.npz + +# 3) Run calibrated inference (base + room adapter): +python infer.py --base pose_mmfi_best.pt --adapter room.adapter.npz --data frames.npz --out kp.npy +# omit --adapter to run the uncalibrated (zero-shot) base +``` + +`X` is CSI amplitude `[N, 3 antennas, 114 subcarriers, 10 frames]` (per-sample standardization is +applied internally). `Y` is `[N,17,2]` COCO keypoints in `[0,1]`. + +## Calibration budget (measured, rank-8 LoRA, 3 seeds — ADR-150 §3.5) + +| Labeled samples/room | cross-subject | cross-environment | +|---------------------:|--------------:|------------------:| +| 0 (zero-shot) | 64% | ~10% | +| 5 | — | 60% | +| 20 | 66% | 66% | +| 50 | 70% | 70% | +| 200 | 72% | 73% | + +Knee at ~50 samples (~70%); **below ~20 samples the adapter can hurt** (too few to fit reliably). + +## Notes + +- **Calibration only helps when the base hasn't already seen the room.** The published flagship was + trained on MM-Fi `random_split`, so calibrating it on an MM-Fi subject is a near-no-op (it already + saw them); for a genuinely new real-world room it is zero-shot and calibration applies. To + *reproduce the demo* on a held-out MM-Fi room, train a source-only base (exclude the target + environment) — see `ADR-150 §3.6` and the few-shot harness in `aether-arena/staging/`. +- Adapter is saved fp16 (~22 KB); quantize to int8 for the ~11 KB on-device form. +- Inference is real-time on CPU (the 75 K-param `micro` variant runs in 0.135 ms single-thread x86; + see [`docs/benchmarks/wifi-pose-efficiency-frontier.md`](../../docs/benchmarks/wifi-pose-efficiency-frontier.md)). 
"""RuView per-room calibration — fit a small LoRA adapter from a short labeled in-room capture.

    python calibrate.py --base pose_mmfi_best.pt --data room_calib.npz --out room_A.adapter.npz

`room_calib.npz` must contain `X` [N,3,114,10] CSI amplitude and `Y` [N,17,2] (or [N,34]) keypoints
in [0,1] — the labeled calibration samples from the deployment room (~100–200 recommended; ≥20).
Outputs a tiny adapter (.npz) holding only the LoRA A/B matrices, stored as fp16 (about 22 KB at
the default rank 8; about 11 KB if later quantized to int8). Loaded over the shared base at
inference, it recovers SOTA-level pose for that room/person (ADR-150 §3.5–3.6).
"""
import argparse
import numpy as np
import torch
import torch.nn as nn

from model import PoseNet, standardize


def _load_base(net, path, dev):
    """Load the base checkpoint into `net`, warning when keys do not line up.

    `strict=False` is kept so a checkpoint with extra or missing entries still loads, but the
    mismatch is reported: a silently unmatched checkpoint would leave the base at random
    initialisation and calibration would appear to "work" on a meaningless model.
    """
    # NOTE(security): torch.load unpickles the file — only load checkpoints from a trusted source.
    result = net.load_state_dict(torch.load(path, map_location=dev), strict=False)
    if result.missing_keys or result.unexpected_keys:
        print(f"WARNING: base checkpoint does not fully match the model "
              f"(missing {len(result.missing_keys)} keys, "
              f"unexpected {len(result.unexpected_keys)} keys)")


def _fit(net, Xs, Yt, trainable, iters, lr, dev):
    """Run `iters` AdamW steps on the trainable (LoRA) parameters with light augmentation."""
    n = len(Xs)
    opt = torch.optim.AdamW(trainable, lr=lr, weight_decay=0.0)
    lossf = nn.SmoothL1Loss(beta=0.1)
    bs = min(128, n)
    # train() keeps the model's dropout layers active while the adapter is fitted.
    net.train()
    for _ in range(iters):
        bi = torch.randint(0, n, (bs,), device=dev)
        xb = Xs[bi]
        # Light augmentation: zero whole dim-1 slices with p=0.15, then add Gaussian noise whose
        # per-sample strength is drawn uniformly from [0, 0.03).
        # NOTE(review): for X laid out [N,3,114,10] dim 1 is the antenna axis, so this is antenna
        # dropout rather than subcarrier dropout — confirm against the training recipe.
        keep = (torch.rand(xb.shape[0], xb.shape[1], 1, 1, device=dev) > 0.15).float()
        xb = xb * keep + 0.03 * torch.randn_like(xb) * torch.rand(xb.shape[0], 1, 1, 1, device=dev)
        opt.zero_grad()
        lossf(net(xb), Yt[bi]).backward()
        opt.step()


def main():
    """CLI entry point: load calibration data, fit the LoRA adapter, save it as an fp16 .npz."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--base", required=True, help="base checkpoint (pose_mmfi_best.pt)")
    ap.add_argument("--data", required=True, help="labeled calibration .npz with X and Y")
    ap.add_argument("--out", required=True, help="output adapter .npz")
    ap.add_argument("--rank", type=int, default=8)
    ap.add_argument("--iters", type=int, default=600)
    ap.add_argument("--lr", type=float, default=8e-4)
    ap.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu")
    a = ap.parse_args()
    if a.rank < 1:
        ap.error("--rank must be >= 1")

    # Context manager closes the underlying zip file once the arrays are materialised.
    with np.load(a.data) as z:
        X = torch.tensor(z["X"].astype(np.float32))
        y_raw = z["Y"]
        Y = torch.tensor(y_raw.reshape(len(y_raw), 34).astype(np.float32))
    n = len(X)
    if n == 0:
        ap.error(f"{a.data} contains no calibration samples")
    if len(Y) != n:
        ap.error(f"{a.data}: X has {n} samples but Y has {len(Y)}")
    if n < 20:
        print(f"WARNING: only {n} calibration samples — below ~20 the adapter may underperform "
              f"zero-shot (ADR-150 §3.5). Recommend ~100–200.")
    dev = a.device

    net = PoseNet().to(dev)
    _load_base(net, a.base, dev)
    net.add_lora(r=a.rank).to(dev)
    # Only the LoRA A/B matrices are trained; everything else stays frozen.
    for k, p in net.named_parameters():
        p.requires_grad = k.endswith((".A", ".B"))
    trainable_names = {k for k, p in net.named_parameters() if p.requires_grad}
    trainable = [p for p in net.parameters() if p.requires_grad]
    n_tr = sum(p.numel() for p in trainable)

    _fit(net, standardize(X.to(dev)), Y.to(dev), trainable, a.iters, a.lr, dev)

    # Keep only tensors that were actually trained: a suffix-based export can also pick up
    # non-LoRA state whose name happens to end in ".A" (e.g. a registered buffer).
    adapter = {k: v.astype(np.float16) for k, v in net.lora_state().items()
               if k in trainable_names}
    nbytes = sum(v.nbytes for v in adapter.values())
    # np.savez appends ".npz" when it is absent; normalise up front so the printed path is real.
    out_path = a.out if a.out.endswith(".npz") else a.out + ".npz"
    np.savez(out_path, **adapter, _meta=np.array([a.rank, n, n_tr], dtype=np.int64))
    print(f"saved {out_path} | rank {a.rank} | {n_tr:,} params | ~{nbytes/1024:.1f} KB fp16 | "
          f"from {n} labeled samples")


if __name__ == "__main__":
    main()
"""Run calibrated WiFi-CSI pose inference: shared base + a per-room LoRA adapter.

    python infer.py --base pose_mmfi_best.pt --adapter room_A.adapter.npz --data frames.npz

`frames.npz` contains `X` [N,3,114,10] CSI amplitude. Prints/saves [N,17,2] keypoints in [0,1].
Omit --adapter to run the uncalibrated (zero-shot) base. With a room adapter, expect SOTA-level
accuracy in that room/person; without one, zero-shot degrades in unseen rooms (ADR-150 §3.6).
"""
import argparse
import numpy as np
import torch

from model import PoseNet, standardize

# Frames processed per forward pass.
BATCH = 4096


def _load_base(net, path, dev):
    """Load the base checkpoint into `net`, warning when keys do not line up.

    `strict=False` is kept so a checkpoint with extra or missing entries still loads, but the
    mismatch is reported instead of silently running a partly-initialised model.
    """
    # NOTE(security): torch.load unpickles the file — only load checkpoints from a trusted source.
    result = net.load_state_dict(torch.load(path, map_location=dev), strict=False)
    if result.missing_keys or result.unexpected_keys:
        print(f"WARNING: base checkpoint does not fully match the model "
              f"(missing {len(result.missing_keys)} keys, "
              f"unexpected {len(result.unexpected_keys)} keys)")


def _attach_adapter(net, path, rank, dev):
    """Wrap `net` with LoRA layers and load the per-room adapter stored at `path`.

    The rank is taken from `rank` when given, otherwise from the adapter's `_meta` entry
    (`_meta[0]`), otherwise 8. Only keys that correspond to LoRA parameters of the wrapped model
    are loaded, so other entries in the file cannot overwrite base-model state.
    """
    with np.load(path) as z:
        tensors = {k: z[k].astype(np.float32) for k in z.files if k.endswith((".A", ".B"))}
        meta_rank = int(z["_meta"][0]) if "_meta" in z.files else None
    if rank is None:
        rank = meta_rank if meta_rank is not None else 8
    net.add_lora(r=rank).to(dev)
    # After add_lora the LoRA matrices are the parameters named *.A / *.B.
    lora_keys = sorted(k for k, _ in net.named_parameters() if k.endswith((".A", ".B")))
    missing = [k for k in lora_keys if k not in tensors]
    if missing:
        raise SystemExit(f"adapter {path} is missing LoRA tensors: {', '.join(missing)}")
    net.load_lora({k: tensors[k] for k in lora_keys})


def main():
    """CLI entry point: load base (+ optional adapter), run batched inference, print/save output."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--base", required=True)
    ap.add_argument("--adapter", default=None, help="per-room .adapter.npz (omit for zero-shot)")
    ap.add_argument("--data", required=True, help=".npz with X [N,3,114,10]")
    ap.add_argument("--out", default=None, help="optional .npy to save [N,17,2] keypoints")
    ap.add_argument("--rank", type=int, default=None,
                    help="LoRA rank (default: read from the adapter, else 8)")
    ap.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu")
    a = ap.parse_args()
    dev = a.device

    net = PoseNet().to(dev)
    _load_base(net, a.base, dev)
    if a.adapter:
        _attach_adapter(net, a.adapter, a.rank, dev)
    net.eval()

    with np.load(a.data) as z:
        X = z["X"].astype(np.float32)

    # Standardization is per-sample, so doing it chunk by chunk gives the same result as doing it
    # on the whole array while keeping only one chunk on the device at a time.
    chunks = []
    with torch.no_grad():
        for i in range(0, len(X), BATCH):
            xb = torch.from_numpy(X[i:i + BATCH]).to(dev)
            chunks.append(net(standardize(xb)).cpu().numpy())
    if chunks:
        kp = np.concatenate(chunks).reshape(-1, 17, 2)
    else:
        # np.concatenate rejects an empty list; an empty capture yields an empty result.
        kp = np.zeros((0, 17, 2), dtype=np.float32)
    print(f"inferred {len(kp)} frames | adapter={'yes' if a.adapter else 'NONE (zero-shot)'}")
    if a.out:
        np.save(a.out, kp)
        print(f"saved keypoints -> {a.out}")


if __name__ == "__main__":
    main()
"""WiFi-CSI pose model + LoRA adapter for the RuView calibration service.

Architecture matches the published flagship checkpoint
[`ruvnet/wifi-densepose-mmfi-pose`](https://huggingface.co/ruvnet/wifi-densepose-mmfi-pose)
(`pose_mmfi_best.pt`): transformer encoder + temporal attention pooling + skeleton-graph head.

The calibration service freezes this base and fits a tiny per-room **LoRA adapter** (rank 8 on the
input projection + pose head ≈ 11 KB) from ~100–200 labeled in-room samples. Empirically that lifts
cross-subject 64→72% and cross-environment 11→73% (ADR-150 §3.3–3.6).
"""
import numpy as np
import torch
import torch.nn as nn

# COCO-17 skeleton edges for the graph-refinement head.
EDGES = [(0, 1), (0, 2), (1, 3), (2, 4), (5, 6), (5, 7), (7, 9), (6, 8), (8, 10),
         (5, 11), (6, 12), (11, 12), (11, 13), (13, 15), (12, 14), (14, 16)]
# Symmetric joint adjacency with self-loops, row-normalised so each row sums to 1.
_A = np.eye(17, dtype=np.float32)
for _i, _j in EDGES:
    _A[_i, _j] = _A[_j, _i] = 1.0
_A = _A / _A.sum(1, keepdims=True)


class LoRA(nn.Module):
    """Low-rank adapter wrapping a frozen Linear: y = W·x + (x·A·B)·(alpha/r).

    `A` is [in_features, r] with small random init and `B` is [r, out_features] initialised to
    zero, so a freshly wrapped layer computes exactly what the base layer computes.

    Raises:
        ValueError: if `r` is smaller than 1.
    """

    def __init__(self, base: nn.Linear, r: int = 8, alpha: int = 16):
        super().__init__()
        if r < 1:
            raise ValueError(f"LoRA rank must be >= 1, got {r}")
        self.base = base
        # The wrapped layer is frozen; only A and B are meant to be trained.
        for p in self.base.parameters():
            p.requires_grad = False
        self.A = nn.Parameter(torch.zeros(base.in_features, r))
        self.B = nn.Parameter(torch.zeros(r, base.out_features))
        nn.init.normal_(self.A, std=0.02)
        self.scale = alpha / r

    def forward(self, x):
        return self.base(x) + (x @ self.A @ self.B) * self.scale


class GR(nn.Module):
    """Skeleton-graph refinement: nudges joints toward anatomically consistent positions."""

    def __init__(self, d=256, h=96):
        super().__init__()
        # Learned 32-d embedding per joint.
        self.je = nn.Parameter(torch.randn(17, 32) * 0.02)
        # Input per joint = pooled feature (d) + joint embedding (32) + initial xy (2).
        self.inp = nn.Linear(d + 34, h)
        self.g1 = nn.Linear(h, h)
        self.g2 = nn.Linear(h, h)
        self.out = nn.Linear(h, 2)
        # Fixed adjacency; a buffer (not a parameter), but it still appears in state_dict as "A".
        self.register_buffer("A", torch.tensor(_A))

    def forward(self, z, kp0):
        """Refine initial keypoints `kp0` [B,17,2] given pooled features `z` [B,d]."""
        B = z.shape[0]
        f = torch.relu(self.inp(torch.cat(
            [z.unsqueeze(1).expand(-1, 17, -1), self.je.unsqueeze(0).expand(B, -1, -1), kp0], -1)))
        # Two rounds of neighbour averaging over the skeleton followed by a linear layer.
        f = torch.relu(self.g1(torch.einsum('ij,bjh->bih', self.A, f)))
        f = torch.relu(self.g2(torch.einsum('ij,bjh->bih', self.A, f)))
        # Residual correction bounded to ±0.3 per coordinate.
        return kp0 + 0.3 * torch.tanh(self.out(f))


class PoseNet(nn.Module):
    """Flagship pose model. Input [B,3,114,10] CSI amplitude (per-sample standardized) -> [B,34]."""

    def __init__(self, na=3, nsc=114, nt=10, d=256, L=4, H=8):
        super().__init__()
        self.proj = nn.Linear(na * nsc, d)
        self.pos = nn.Parameter(torch.randn(1, nt, d) * 0.02)
        enc = nn.TransformerEncoderLayer(d, H, d * 2, dropout=0.2, batch_first=True, activation='gelu')
        self.tf = nn.TransformerEncoder(enc, L)
        self.att = nn.Linear(d, 1)
        self.head = nn.Sequential(nn.Linear(d, 256), nn.GELU(), nn.Dropout(0.3), nn.Linear(256, 34))
        self.gr = GR(d)
        self.na, self.nsc, self.nt = na, nsc, nt

    def forward(self, x):
        B = x.shape[0]
        # [B,na,nsc,nt] -> one token per time step: [B,nt,na*nsc].
        t = x.permute(0, 3, 1, 2).reshape(B, self.nt, self.na * self.nsc)
        h = self.tf(self.proj(t) + self.pos)
        # Attention pooling over the time axis.
        w = torch.softmax(self.att(h), 1)
        z = (h * w).sum(1)
        kp0 = torch.sigmoid(self.head(z)).reshape(B, 17, 2)
        return self.gr(z, kp0).reshape(B, 34)

    def add_lora(self, r=8, alpha=16):
        """Wrap the input projection + pose head with LoRA adapters (the ~11 KB calibration set).

        Load the base checkpoint before calling this: wrapping moves the wrapped layers' weights
        under a `.base` sub-key in the state dict.

        Raises:
            RuntimeError: if the adapters are already attached.
        """
        if isinstance(self.proj, LoRA):
            raise RuntimeError("LoRA adapters are already attached to this model")
        self.proj = LoRA(self.proj, r, alpha)
        self.head[0] = LoRA(self.head[0], r, alpha)
        self.head[3] = LoRA(self.head[3], r, alpha)
        return self

    def _lora_keys(self) -> list:
        """State-dict keys of the A/B matrices of every attached LoRA module."""
        return [f"{name}.{leaf}" for name, mod in self.named_modules()
                if isinstance(mod, LoRA) for leaf in ("A", "B")]

    def lora_state(self) -> dict:
        """Extract just the LoRA A/B tensors (the per-room adapter to save).

        Keys are selected by module type rather than by name suffix: the graph head's adjacency
        buffer is stored as "gr.A" and would otherwise be exported as if it were adapter weights.
        """
        sd = self.state_dict()
        return {k: sd[k].detach().cpu().numpy() for k in self._lora_keys()}

    def load_lora(self, adapter: dict):
        """Load LoRA A/B tensors produced by `lora_state()` into the attached adapters.

        Entries naming existing non-LoRA state (adapters saved by earlier versions carried
        "gr.A") are ignored, so base-model state is never overwritten by adapter data. Unknown
        keys and shape mismatches are rejected by `load_state_dict`.
        """
        lora_keys = set(self._lora_keys())
        sd = self.state_dict()
        for k, v in adapter.items():
            if k not in lora_keys and k in sd:
                continue
            sd[k] = torch.as_tensor(v)
        self.load_state_dict(sd)
        return self


def standardize(x: torch.Tensor) -> torch.Tensor:
    """Per-sample standardization used in training/inference.

    Subtracts the mean and divides by the standard deviation taken over dims 1–3 of each sample;
    the 1e-6 term keeps a constant sample from dividing by zero.
    """
    return (x - x.mean((1, 2, 3), keepdim=True)) / (x.std((1, 2, 3), keepdim=True) + 1e-6)