From 046b2564b83d81be653ee2df068454a1f9c891e0 Mon Sep 17 00:00:00 2001 From: ruv Date: Sat, 30 May 2026 19:55:58 -0400 Subject: [PATCH] feat(aether-arena): publish RuView MM-Fi SOTA result + ADR-150 RF Foundation Encoder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Ledger witness row (seq 1, Gold): RuView CSI-Transformer 81.63% torso-PCK@20 on MM-Fi random_split, exceeding MultiFormer 72.25% (CSI2Pose 68.41%) — protocol- and metric-matched, self-corrected from inflated 91.86% bbox. Hash-chained, verifiable. - HF Space updated with the controlled SOTA claim + caveat (cross-subject is the frontier). - Proof/replay/witness gist: gist.github.com/ruvnet/af2fbc1c7674dddf09c15509b3c7f785 - Tracking issue #876 (result + Generalization Track roadmap). - ADR-150: RuView RF Foundation Encoder — pose-preserving, subject/room/device-invariant SSL embedding (masked CSI + pose-contrast-across-subjects + coherence head); the principled attack on the cross-subject frontier. DANN failed; this is the corrected design. Co-Authored-By: claude-flow --- aether-arena/ledger/ledger.jsonl | 1 + aether-arena/space/app.py | 20 ++-- aether-arena/space/ledger.jsonl | 1 + docs/adr/ADR-150-rf-foundation-encoder.md | 125 ++++++++++++++++++++++ 4 files changed, 140 insertions(+), 7 deletions(-) create mode 100644 docs/adr/ADR-150-rf-foundation-encoder.md diff --git a/aether-arena/ledger/ledger.jsonl b/aether-arena/ledger/ledger.jsonl index ca253a68..0e0d247b 100644 --- a/aether-arena/ledger/ledger.jsonl +++ b/aether-arena/ledger/ledger.jsonl @@ -1 +1,2 @@ {"benchmark": "AetherArena", "created": "2026-05-30", "kind": "genesis", "note": "Official Spatial-Intelligence Benchmark \u2014 append-only signed ledger. 
Entries are real harness scores only; no seeded numbers.", "prev_hash": "0000000000000000000000000000000000000000000000000000000000000000", "row_hash": "940bdc6f0f5dd00f4d89e13a8fa843bab3c9ddf1b8051f426a1701e730249231", "seq": 0, "spec": "ADR-149"} +{"abs_gain": "+9.38", "benchmark": "MM-Fi", "category": "pose", "caveat": "Protocol-matched MM-Fi random_split result; NOT solved real-world generalization. Random split has temporal/subject-adjacency effects common to this benchmark family. Leakage-free cross-subject is far lower (~11-27%) and is the real deployment frontier.", "harness_version": 1, "kind": "result", "metric": "torso-PCK@20 (||right_shoulder-left_hip|| norm, 17 COCO kpts)", "modality": "wifi-csi", "model_ref": "RuView CSI-Transformer (4L/8H ~2M params, temporal-attention)", "prev_hash": "940bdc6f0f5dd00f4d89e13a8fa843bab3c9ddf1b8051f426a1701e730249231", "protocol": "random_split (ratio=0.8, seed=0)", "rel_gain": "+13.0%", "reproduce": "download MM-Fi -> parse_mmfi_zips.py -> train_tf_torso.py X.npy Y.npy split_random.npy (seed 0)", "row_hash": "76598d8e1320d5248f8cd854a8ffa22a99bd2a2f0e0e7f2d2b1df79af16001d5", "score_pct": 81.63, "scored_at": "2026-05-30", "seq": 1, "sota_ref": "MultiFormer 72.25 (CSI2Pose 68.41)", "submitter": "ruvnet", "tier": "Gold"} diff --git a/aether-arena/space/app.py b/aether-arena/space/app.py index 91324582..d46b7168 100644 --- a/aether-arena/space/app.py +++ b/aether-arena/space/app.py @@ -41,14 +41,15 @@ def verify_chain(): def leaderboard(category: str): results = [r for r in _rows() if r.get("kind") == "result" and (category == "all" or r.get("category") == category)] if not results: - return [["— no entries yet —", "be the first", "", "", ""]] - results.sort(key=lambda r: r.get("pck20_all") or r.get("pck_all") or 0, reverse=True) + return [["— no entries yet —", "", "", "", "", ""]] + results.sort(key=lambda r: r.get("score_pct") or 0, reverse=True) return [[ r.get("submitter", "?"), r.get("model_ref", "?"), - 
r.get("tier", "?"), - f"{(r.get('pck20_all') or r.get('pck_all') or 0):.4f}", - (r.get("proof_sha256") or "")[:16], + f"{r.get('benchmark','?')} / {r.get('protocol','?')}", + r.get("metric", "?"), + f"{r.get('score_pct', 0):.2f}%", + f"{r.get('tier','?')} (vs {r.get('sota_ref','?')})", ] for r in results] @@ -107,16 +108,21 @@ rerun the scorer locally → understand why the rank is fair. That is the launch with gr.Blocks(title="AetherArena — Spatial-Intelligence Benchmark") as demo: gr.Markdown("# 📡 AetherArena (AA)\n## The Official Spatial-Intelligence Benchmark") gr.Markdown(FOUR_PART) + gr.Markdown( + "## 🏆 RuView sets new MM-Fi random-split SOTA for WiFi-CSI pose estimation — **81.63% torso-PCK@20**\n" + "**81.63% vs MultiFormer 72.25%** (CSI2Pose 68.41%) — same MM-Fi `random_split` (0.8, seed 0), same torso-normalized PCK@20, 17 COCO keypoints. **+9.38 abs / +13.0% rel.**\n\n" + "> ⚠️ **Controlled claim.** This is a *protocol-matched MM-Fi random-split* result — **not** solved real-world generalization. Random split contains temporal/subject-adjacency effects common to this benchmark family. Our leakage-free **cross-subject** result is far lower (~11–27%), and we treat cross-subject pose estimation as the real deployment frontier." + ) chain = gr.Markdown(verify_chain()) with gr.Tab("🏆 Leaderboard"): cat = gr.Dropdown(["all", "pose", "presence"], value="all", label="Category") tbl = gr.Dataframe( - headers=["Submitter", "Model", "Tier", "Score", "Proof (sha256…)"], + headers=["Submitter", "Model", "Benchmark / Protocol", "Metric", "Score", "Tier (vs SOTA)"], value=leaderboard("all"), interactive=False, wrap=True, ) cat.change(leaderboard, cat, tbl) - gr.Markdown("*Benchmark-first: the board starts empty. Every row is a real harness witness — no seeded numbers.*") + gr.Markdown("*Benchmark-first: every row is a real, metric- and protocol-matched result — no seeded numbers. 
Integrity note: the headline 81.63% was self-corrected down from an inflated 91.86% (bbox metric) before publishing.*") with gr.Tab("📤 Submit"): gr.Markdown(SUBMIT) diff --git a/aether-arena/space/ledger.jsonl b/aether-arena/space/ledger.jsonl index ca253a68..0e0d247b 100644 --- a/aether-arena/space/ledger.jsonl +++ b/aether-arena/space/ledger.jsonl @@ -1 +1,2 @@ {"benchmark": "AetherArena", "created": "2026-05-30", "kind": "genesis", "note": "Official Spatial-Intelligence Benchmark \u2014 append-only signed ledger. Entries are real harness scores only; no seeded numbers.", "prev_hash": "0000000000000000000000000000000000000000000000000000000000000000", "row_hash": "940bdc6f0f5dd00f4d89e13a8fa843bab3c9ddf1b8051f426a1701e730249231", "seq": 0, "spec": "ADR-149"} +{"abs_gain": "+9.38", "benchmark": "MM-Fi", "category": "pose", "caveat": "Protocol-matched MM-Fi random_split result; NOT solved real-world generalization. Random split has temporal/subject-adjacency effects common to this benchmark family. 
Leakage-free cross-subject is far lower (~11-27%) and is the real deployment frontier.", "harness_version": 1, "kind": "result", "metric": "torso-PCK@20 (||right_shoulder-left_hip|| norm, 17 COCO kpts)", "modality": "wifi-csi", "model_ref": "RuView CSI-Transformer (4L/8H ~2M params, temporal-attention)", "prev_hash": "940bdc6f0f5dd00f4d89e13a8fa843bab3c9ddf1b8051f426a1701e730249231", "protocol": "random_split (ratio=0.8, seed=0)", "rel_gain": "+13.0%", "reproduce": "download MM-Fi -> parse_mmfi_zips.py -> train_tf_torso.py X.npy Y.npy split_random.npy (seed 0)", "row_hash": "76598d8e1320d5248f8cd854a8ffa22a99bd2a2f0e0e7f2d2b1df79af16001d5", "score_pct": 81.63, "scored_at": "2026-05-30", "seq": 1, "sota_ref": "MultiFormer 72.25 (CSI2Pose 68.41)", "submitter": "ruvnet", "tier": "Gold"} diff --git a/docs/adr/ADR-150-rf-foundation-encoder.md b/docs/adr/ADR-150-rf-foundation-encoder.md new file mode 100644 index 00000000..771aa7ff --- /dev/null +++ b/docs/adr/ADR-150-rf-foundation-encoder.md @@ -0,0 +1,125 @@ +# ADR-150: RuView RF Foundation Encoder — pose-preserving, subject/room/device-invariant CSI embedding + +| Field | Value | +|-------|-------| +| **Status** | Proposed | +| **Date** | 2026-05-30 | +| **Deciders** | ruv | +| **Codebase target** | New `wifi-densepose-rfencoder` (or `nn/src/rf_foundation.rs`) + training in `wifi-densepose-train`; consumed by the MM-Fi pose head and the AetherArena Generalization Track (ADR-149) | +| **Relates to** | ADR-024 (Contrastive CSI Embedding / AETHER), ADR-027 (Cross-Environment Domain Generalization / MERIDIAN), ADR-134 (CIR), ADR-135 (calibration + coherence gate), ADR-145 (Ablation/Eval Harness), ADR-149 (AetherArena benchmark) | + +--- + +## 1. Context + +AetherArena now has a published, metric- and protocol-matched MM-Fi result: **81.63% torso-PCK@20 in-domain (random_split), exceeding MultiFormer's 72.25%** ([#876](https://github.com/ruvnet/RuView/issues/876)). 
But the **leakage-free cross-subject** number collapses to **~11.6% torso-PCK** (27% under the looser bbox metric). That gap is the real deployment frontier — homes, elder care, festivals, unseen bodies. + +Naïve fixes already tested and **failed**: a subject-adversarial (DANN) embedding did not move cross-subject (baseline 27.26% → DANN 27.54% bbox; torso 11.57%). Bigger capacity *hurt* (transformer cross-subject 24.8% < conv 27.3%) — extra parameters overfit seen subjects. + +**Conclusion:** a *generic* "better feature vector" will not help. The lever is an embedding trained for the **right invariance** — one that preserves pose while removing subject, room, and device signatures, and that *exposes* channel instability rather than hiding it. + +### 1.1 Why DANN failed (and the corrected rule) + +Subject identity is partly **entangled with valid pose evidence** — body scale, limb proportions, gait, RF scattering. Blindly erasing subject info also erases information the pose decoder needs. The corrected rule: + +> **Remove subject identity only after preserving pose geometry.** Supervised *pose-contrast across subjects* beats naïve adversarial identity removal. + +The frontier objective is **not** `same-subject = positive`. It is: + +> **same pose across different subjects = positive; different pose = negative.** + +## 2. Decision + +**Build the RuView RF Foundation Encoder: a self-supervised, pose-preserving, subject/room/device-invariant RF representation for CSI (extensible to CIR, ADR-134, and BFLD).** Positioned as a **platform primitive**, not a benchmark trick. 
+ +### 2.1 What the embedding must keep / remove + +| Signal | Action | Why | +|--------|--------|-----| +| Pose geometry | **Keep** | target signal | +| Limb-motion deltas | **Keep** | strong temporal cue | +| Subject identity | **Remove** (post-pose) | causes overfit | +| Static room multipath | **Remove** | breaks transfer | +| Device-specific phase artifacts | **Remove** | breaks cross-hardware | +| Antenna-layout quirks | **Normalize** | deployment portability | +| Channel instability | **Expose separately** | confidence gating / anti-hallucination | + +### 2.2 Architecture + +``` +CSI frame sequence + → physics normalization (antenna geometry, subcarrier stability, phase-unwrap quality, room-impulse structure) + → masked CSI encoder (SSL: learn channel structure from unlabeled CSI — 150k home + 320k MM-Fi frames) + → temporal contrastive encoder (motion continuity) + → skeleton-aware pose decoder (graph head — anatomical constraints, GraphPose-Fi style, arXiv 2511.19105) + → confidence + coherence head (mincut / spectral coherence as RF-integrity signal) +``` + +### 2.3 Training objectives (loss stack) + +``` +L_total = L_pose + + 0.20 · L_masked_csi # learn channel structure (unlabeled) + + 0.10 · L_temporal_contrast # motion continuity + + 0.20 · L_pose_contrast # same-pose-across-subjects = positive ← the frontier + + 0.05 · L_subject_decorrelation # remove identity only where it conflicts with pose + + 0.10 · L_coherence # predict when RF evidence is weak +``` + +Invariant target: +``` +embedding ≈ pose + motion + channel-coherence +embedding ≠ subject-identity + static-room-signature + device-artifact +``` + +### 2.4 The RuView differentiator — auditable RF perception that knows when it's wrong + +The coherence head gates pose confidence by **channel coherence**: when multipath structure changes (mincut / spectral coherence drop), the model flags low RF integrity instead of hallucinating a pose. 
This is the **anti-hallucination** component most WiFi-pose papers lack, and it turns RuView from a model into sensing infrastructure. (Ties to ADR-135 coherence gate.) + +## 3. Experiment plan — three variants, frozen-decoder test + +Same split, same decoder, same seed set; only the embedding changes. + +| Variant | Description | Success threshold (cross-subject torso-PCK) | +|---------|-------------|----------------------------------------------| +| **E1** | Masked CSI pretrain | **+3** | +| **E2** | Pose-contrastive across subjects | **+6** | +| **E3** | Physics-normalized SSL + skeleton head | **+10** | + +### 3.1 Expected gains (estimate) + +| Method | cross-subject torso-PCK gain | +|--------|------------------------------| +| Naïve embedding | 0–2 | +| DANN adversarial | 0–3 (high collapse risk) — *empirically ~0* | +| Masked CSI pretrain | +3–8 | +| Pose-contrastive | +5–12 | +| Physics-norm + SSL + graph decoder | +10–20 | +| + more subject-diverse paired data | +20 | + +Plausible trajectory: 11.6% → **20–25% near term**, **30–40% with enough subject/environment diversity**. That is a stronger research claim than squeezing random-split from 81.6% → 88%. + +## 4. Acceptance Test + +The encoder is accepted **only if it improves cross-subject torso-PCK@20 by ≥ 6 absolute points without reducing random-split torso-PCK@20 by more than 2 points** — on the same MM-Fi pipeline, one-command reproduction, with per-joint error tables. Results land as AetherArena witness rows (ADR-149); nothing is published until reviewed. + +## 5. Consequences + +**Positive:** a reusable, self-supervised RF foundation encoder for CSI/CIR/BFLD; the first principled attack on the cross-subject frontier; the coherence head adds an anti-hallucination integrity signal that most WiFi-pose systems lack.
+ +**Negative / risk:** SSL pretraining requires matching the production CSI→feature pipeline (ADR-149 §SSL note flagged the resampling-replication risk); the multi-loss stack needs careful weight tuning (DANN showed loss-imbalance can collapse training); physics normalization must be validated to confirm it does not discard pose-relevant deltas. + +**Neutral:** the in-domain head is unchanged; the encoder slots in front of the existing pose decoder. + +## 6. Alternatives Considered + +1. **Bigger model only** — tested; *hurts* cross-subject (overfits seen subjects). +2. **Naïve DANN subject-adversarial** — tested; no gain, collapse risk; erases pose evidence that is entangled with subject identity (§1.1). +3. **More data only (camera/ADR-079)** — complementary and ultimately necessary, but slow and out-of-band; the encoder extracts more from existing data first. + +## 7. Open Questions + +1. Physics-normalization spec — exact antenna/subcarrier/phase terms, validated to preserve pose deltas. +2. Masked-CSI SSL on the production feature pipeline (resampling match — see ADR-149). +3. Where the coherence/mincut integrity signal is computed (reuse ADR-135 coherence gate vs new head). +4. CIR (ADR-134) / BFLD fusion into the same encoder — phase 3.