From cafbeb1e817e87b944902c539db800d2e1074e67 Mon Sep 17 00:00:00 2001 From: rUv Date: Mon, 15 Jun 2026 13:06:46 -0400 Subject: [PATCH] =?UTF-8?q?fix(wasm-edge):=20sanitize=20non-finite=20host?= =?UTF-8?q?=20floats=20at=20the=20WASM=E2=86=94host=20frame=20boundary=20(?= =?UTF-8?q?#1102)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closing beyond-SOTA security review of wifi-densepose-wasm-edge (ADR-040, ~70 edge modules). The two WASM↔host boundaries (lib.rs::on_frame/on_timer and bin/ghost_hunter.rs::on_frame) read raw IEEE-754 f32 from the csi_get_* imports with no finiteness check — the crate had zero is_finite/is_nan guards and its clamp helpers propagate NaN. A single non-finite host value latches NaN into long-lived per-module accumulators (EMA / Welford / phasor sums / anomaly baselines), after which detectors fail degraded (stuck gate state, silently-disabled checks) — silent corruption, not a crash. Add sanitize_host_f32() (non-finite -> 0.0, core-only for no_std) applied at every host_get_* float read: one chokepoint covering all downstream modules, mirroring the existing M-01 negative-n_subcarriers boundary clamp. LOW / defense-in-depth (the Tier-2 DSP firmware supplies the imports, a semi-trusted boundary). Pinned by boundary_tests::{sanitize_passes_finite_values_through, sanitize_maps_non_finite_to_zero, coherence_monitor_nan_latches_without_sanitize_but_not_with} — the last asserts on the current CoherenceMonitor that a raw NaN frame latches the smoothed score while the sanitized path stays finite. Other review dimensions attested clean with evidence (see CHANGELOG): no hot-path panics (all unwrap/expect are test-only or std-gated RVF builder), all bounds min()-clamped, all index-by-cast const-bounded or guarded, no leaking closures (no move||/forget/leak), no secrets. 
Verified: host `cargo test --features std,medical-experimental` 672 passed / 0 failed (+3 new tests); all three wasm32-unknown-unknown release artifacts build clean (lib default no_std/panic=abort, ghost_hunter standalone-bin, medical-experimental); Python proof VERDICT PASS, hash unchanged. --- CHANGELOG.md | 1 + .../src/bin/ghost_hunter.rs | 11 ++- v2/crates/wifi-densepose-wasm-edge/src/lib.rs | 98 ++++++++++++++++++- 3 files changed, 103 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 913dcbbe..69112085 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Security - **`wifi-densepose-occworld-candle` — beyond-SOTA security + correctness review (Milestone #9, crate 4/4).** (1) **HIGH (MEASURED) — checkpoint-load crash on any int32 tensor** (`model.rs::safetensor_dtype_to_candle`). `safetensors::Dtype::I32` was mapped to `candle_core::DType::I64` and the raw int32 byte buffer (4 bytes/elem) was then handed to `Tensor::from_raw_buffer(.., I64, shape, ..)`. Candle derives `elem_count = data.len() / dtype.size_in_bytes()`, so the I64 path halved the element count while keeping the *original* shape — yielding a tensor whose declared shape claims twice as many elements as its backing storage holds. Reading it **panics** (`range end index 6 out of range for slice of length 3` — slice OOB inside candle-core) on any attacker-supplied or PyTorch-exported checkpoint containing an int32 tensor (common: index/buffer tensors). Fixed by mapping `I32 → DType::I32` (and `I16 → DType::I16`), both first-class candle dtypes. Reproduction recorded on old code; pinned by `tests/checkpoint_loading.rs::int32_tensor_loads_with_consistent_shape_and_values` (panics on old, passes on new) plus F32/I64/corrupt-file control cases. (2) **LOW (MEASURED) — `predict()` lacked frame/batch validation at the input boundary** (`inference.rs`). 
It validated H/W/D but not the externally-supplied frame count; an `f_in > num_frames*2` over-indexed the temporal positional embedding deep in the transformer and surfaced as a cryptic candle "gather" `InvalidIndex` (returned error, not a panic — candle bounds-checks), and a zero frame/batch dim fed a zero-element tensor into the pipeline. Now rejected at the boundary with a clear `ShapeMismatch`. Pinned by `predict_rejects_zero_frames` / `predict_rejects_too_many_frames` / `predict_accepts_frame_count_at_capacity`. (3) **LOW (MEASURED) — divide-by-zero panic on a degenerate input to the public `VQCodebook::encode`** (`vqvae.rs`): a rank-0 / empty-last-dim tensor made `last == 0` and panicked on `elem_count() / last`. Now fails closed with a clear error. Pinned by `encode_rejects_scalar_without_panicking`. **Dimensions confirmed CLEAN with evidence:** panic surface — zero `unwrap()`/`expect()`/`panic!`/`unreachable!` in production code paths (grep evidence; all error handling via `?`/`map_err`); NaN-state-poisoning — N/A (engine is stateless between `predict` calls, input is `u8` class indices so non-finite input is structurally impossible, no persistent world-model buffer to latch into); unbounded-alloc / shape-data mismatch from malformed weights — defended upstream by `safetensors::validate()` (overflow-checked `nelements*dtype.size()` vs declared byte range, rejected before reaching candle); secrets — none (grep clean, only `token_h`/`token_w` config fields match). `unsafe_code = forbid` in the crate manifest. 
**Build/validation status (MEASURED on Windows):** crate builds and tests under `cargo test -p wifi-densepose-occworld-candle --no-default-features` — **29/29 pass** (20 unit + 4 checkpoint_loading + 3 predict_honesty + 2 doc) after fixes; `cargo test --workspace --no-default-features` = 0 failed across all crates (lone `wifi-densepose-desktop` `api_integration` failure was a Windows "Access is denied (os error 5)" file-lock flake — re-ran in isolation **21/21 pass**); Python proof VERDICT PASS, hash `f8e76f21…446f7a` unchanged. *Warrants ADR slot 179 (parent to author).* +- **`wifi-densepose-wasm-edge` beyond-SOTA closing review — boundary NaN-state-poisoning guard + clean-with-evidence attestation (ADR-040 edge crate, ~70 modules).** Closing pass of the security campaign over the last untouched sizeable crate. **One real finding fixed (LOW / source-analysis + reproduced):** the two WASM↔host frame boundaries (`lib.rs::on_frame`/`on_timer` and `bin/ghost_hunter.rs::on_frame`) read raw IEEE-754 `f32` from the `csi_get_phase`/`csi_get_amplitude`/`csi_get_variance`/`csi_get_motion_energy` host imports **without any finiteness check** — the entire crate had **zero** `is_finite`/`is_nan` guards, and the in-crate `clamp` helpers propagate NaN (`NaN < lo` and `NaN > hi` are both false). A single non-finite value (firmware DSP bug, uninitialised buffer, or hostile host) latches NaN into the long-lived per-module accumulators (EMA, Welford, phasor sums, anomaly baselines); once latched, every downstream comparison evaluates `false`, so detectors fail **degraded** (stuck gate state, silently-disabled anomaly checks) — silent corruption, not a crash (WASM `panic=abort` is *not* tripped: no indexing/`unwrap` on the poisoned value). Threat model is a **semi-trusted** boundary (the Tier-2 DSP firmware supplies the imports, not direct network/JS), hence LOW severity / defense-in-depth. 
**Fix:** added `sanitize_host_f32()` (maps non-finite→`0.0`, `core`-only so it holds in `no_std`) applied at every `host_get_*` float read — a single chokepoint covering all ~70 downstream modules, mirroring the existing M-01 negative-`n_subcarriers` boundary clamp. **Pinned by** `boundary_tests::{sanitize_passes_finite_values_through, sanitize_maps_non_finite_to_zero, coherence_monitor_nan_latches_without_sanitize_but_not_with}` — the last asserts on the *current* `CoherenceMonitor` that a raw NaN frame latches the smoothed score (documents the hazard) while the boundary-sanitized path stays finite. **Dimensions attested CLEAN with evidence (source-analysis):** (a) **panic-on-input** — every non-test `unwrap()`/`expect()` is either `#[cfg(test)]` or in the `std`-gated RVF *builder* host tool writing to an in-memory `Vec` (infallible); no `panic!`/`unreachable!`/`todo!`/`get_unchecked` in any hot path. (b) **shape/bounds** — all frame-buffer access is `min()`-clamped (`MAX_SC=32`, `DTW_MAX_LEN`, `LCS_WINDOW`, `PATTERN_LEN`), all index-by-cast sites (`feature_id as usize`, `conclusion_id`, `minute_counter`, `plan_step`) are either compile-time-const-bounded or `if idx <`/`%`-guarded; negative `n_subcarriers` already mapped to 0 (M-01). (c) **memory/leak** — no `move ||` closures, no `mem::forget`/`Box::leak`/`.leak()`; the only `Box::new` is in the `std`-gated `skill_registry` (one-time init, bounded). (d) **secrets** — none (grep clean). **MEASURED build/test evidence:** host `cargo test --features std,medical-experimental` = **672 passed / 0 failed** (was 669 pre-fix; +3 new tests); the real deployment artifacts all build clean on the actual target — `cargo build --target wasm32-unknown-unknown --release` (no_std/panic=abort default lib), `--bin ghost_hunter --no-default-features --features standalone-bin`, and `--features medical-experimental` (toolchain 1.89 per `rust-toolchain.toml`). 
No ADR slot needed — a single LOW defense-in-depth boundary fix; CHANGELOG attestation suffices. - **ADR-131 HOMECORE-UI BFF gateway — public-PR review fixes (PR #1082).** (1) **HIGH — path-traversal / confused-deputy SSRF closed in the `/api/cal/*` reverse-proxy** (`homecore-server/src/gateway.rs`). The wildcard proxy path was interpolated straight into the upstream URL while `proxy()` attaches the server-side calibration bearer, so `/api/cal/v1/../../x` (and percent-encoded `..%2f`, `%2e%2e`, leading `/`, backslash, double-encoded `%252e`) could escape the `…/api/` scope **with the privileged token**. Now `validate_proxy_path()` decode-then-checks and rejects absolute/backslash/dot-segment/encoded-traversal paths with a typed **400 BEFORE the URL is built** (applies to GET **and** POST); legit `v1/...` paths still pass. Pinned by `cal_proxy_rejects_traversal_with_400_before_upstream` (fails on old code) + `validate_proxy_path_rejects_traversal_variants`. (2) **CORS + request-tracing now cover the gateway routes.** `/api/homecore/*` and `/api/cal/*` were `.merge()`d **outside** the layers `homecore-api::router()` applies, leaving them with no CORS allowlist and untraced; the audited `build_cors_layer()` (HC-05) + `TraceLayer` are now applied to the whole merged surface in `main.rs`. Pinned by `gateway_routes_are_cors_covered_after_merge` (Vite-dev-origin preflight succeeds on a gateway route). (3) **Fabricated-data honesty (§6 invariant 3):** the gateway no longer injects a hardcoded `anomaly.threshold: 0.5` — it passes through the REAL upstream threshold or emits `null` (withheld); the dashboard renders a not-available `—` instead of `"null%"`/`"null°C"` for null appliance metrics; the COG panel's Hailo-worker pill reflects the real appliance probe instead of a hardcoded `"connected"`; `rooms.js` treats a null anomaly threshold as withheld, not a fake `0.8` default. 
(4) **Robustness:** a forwarded `hef` that is a string (not an array) no longer throws in the COG panel; the calibration wizard guards `frames/target` against `NaN%`/`Infinity%` and clears its baseline poll timer on Restart / panel teardown (leaked `setTimeout` loop fixed). (5) **Perf:** per-bank RoomState fetches and the appliance service probes now run concurrently (`futures::join_all`; async `tokio::net::TcpStream` + `timeout` replaces the blocking `connect_timeout` that parked a worker per probe); the mock fixture module is now a dynamic `import()` gated on demo mode so production never bundles it. **Note (workspace-wide, not fixed here):** `homecore-server` requests `reqwest`'s `rustls-tls` only, but cargo feature-unification means a sibling crate enabling the default `native-tls` re-introduces OpenSSL into the final binary regardless — a true "no OpenSSL on the appliance" guarantee requires aligning every reqwest-pulling crate on rustls-only. **Note (pre-existing, out of scope):** DEV-mode `allow_any_non_empty()` bearer auth when `HOMECORE_TOKENS` is unset on `0.0.0.0` is unchanged; the loud `warn!` at boot is retained — provision real tokens before network exposure. **Verified:** `cargo test -p homecore-server --no-default-features` = **18/18 pass**, `cargo build -p homecore-server` clean, UI suite (`node tests`) all green, Python proof VERDICT PASS (hash unchanged). - **`wifi-densepose-desktop` (Tauri v2 desktop app) beyond-SOTA security review (needs ADR slot 178) — one real IPC serial-command-injection fix + one over-broad shell-capability removal, each MEASURED on Windows; remaining IPC/path/secret dimensions confirmed clean with evidence.** Beyond-SOTA review of the Tauri desktop crate (the real attack surface is the webview→Rust IPC boundary + the capability allowlist). 
The crate **builds + tests on this Windows box** (`cargo check`/`cargo test -p wifi-densepose-desktop --no-default-features` — Tauri 2.10 + GTK-less Windows webview2 path), so both findings are **MEASURED**, not source-analysis. **WDP-DESK-01 (serial command injection via `configure_esp32_wifi`, MODERATE) — FIXED.** The `#[tauri::command] configure_esp32_wifi(port, ssid, password)` handler took `ssid`/`password` straight from the webview and concatenated them into newline-terminated serial commands (`format!("wifi_config {} {}\r\n", ssid, password)`, `set ssid {}\r\n`, …) with **zero validation** before writing them to the ESP32 over the line-oriented serial protocol. A `\r\n` embedded in either field lets a malicious/compromised webview **terminate the command line early and inject an arbitrary follow-up firmware command** (`reboot`, `erase_nvs`, etc.) — a command-injection-into-device-protocol crossing the IPC trust boundary. Ironic note: the crate already shipped `test_wifi_credentials_validation` documenting the WPA2 length bounds, but the handler never enforced them. **Fix:** a new `validate_wifi_credentials(ssid, password)` rejects out-of-range lengths (SSID 1-32, password 8-63 — WPA2 PSK bounds) **and any control character** (`char::is_control()` catches `\r`/`\n`/NUL), called at the top of the handler before any serial write — fail-closed (`Err` → no bytes sent). Pinned fails-on-old / passes-on-new by `test_validate_wifi_credentials_rejects_injection` (`"net\r\nreboot"`, `"net\ninjected"`, `"pass\r\nerase_nvs"`, embedded NUL — all rejected; would splice into the command stream pre-fix), `test_validate_wifi_credentials_rejects_out_of_range`, and `test_validate_wifi_credentials_accepts_valid` (boundary 32-char SSID / 8- and 63-char passwords still accepted). 
**WDP-DESK-02 (over-broad shell capability, MODERATE) — REMOVED.** `capabilities/default.json` granted the webview `shell:allow-execute` + `shell:allow-open`, but the Rust backend spawns every process via `std::process::Command` directly (espflash/which/sensing-server — which **bypasses** the Tauri allowlist entirely) and the React UI only ever calls `dialog.open` (file picker) — verified by grep: `tauri_plugin_shell` is `init()`-ed but its `Command`/`open` API is **never invoked from Rust or TS**. The two `shell:` permissions were therefore unused privilege: a webview compromise (e.g. XSS in a UI dep) would have gained **arbitrary host command execution** via `shell.execute` with no scope restriction (no `shell` scope object was even defined). **Fix:** removed both `shell:` permissions from the capability (kept `core:default` + the two `dialog:` perms the UI actually uses). MEASURED: the build-regenerated `gen/schemas/capabilities.json` now reads `"permissions":["core:default","dialog:allow-open","dialog:allow-save"]` (shell perms gone), and the crate still builds + all tests pass — confirming nothing depended on the granted shell scope. (Plugin `init()` + the npm dep left in place to keep the blast radius minimal and avoid touching the off-limits generated ACL manifests; with no permission granted the plugin is inert.) **Dimensions confirmed clean (with evidence):** (1) **No directory-traversal / arbitrary-file primitive crossing the boundary** — the path-taking commands (`flash_firmware`/`verify_firmware`/`ota_update`/`wasm_upload`/`provision_*`) pass the webview-supplied path to `std::fs`/`espflash` to **read a firmware/wasm blob the local user themselves selected via the `dialog.open` native picker**; there is no command that *writes* to or *reads back* an arbitrary attacker-named path to the webview — `settings` read/write is confined to `app_data_dir().join("settings.json")` (fixed filename, no user path component), so no traversal sink exists. 
(2) **No shell-string interpolation** — every subprocess uses `Command::new(prog).args([...])` (argv vector, no shell), so the `port`/`source`/`chip`/`baud` args cannot inject a second command even though they are unvalidated (the `source` value flows only as a single `--source ` argv element). (3) **No SSRF-to-secret** — the `node_ip`-built URLs (`http://{ip}:8032/...`) target the local ESP32 mesh and return only device status; no credential is returned to the webview. (4) **Panic-on-input** — handlers use `.map_err(|e| e.to_string())?` throughout; the one `srv.pid.expect(...)` in `server_status` is guarded by an explicit `is_none()` early-return on the line above (unreachable), and the discovery/provision deserializers bounds-check before every slice index (`pos + len > data.len()` guards, NVS size capped at 4096). (5) **No hardcoded secrets** — `ota_psk` is an `Option` supplied per-call/from settings, never embedded; grep for embedded keys/tokens over `src/` is empty. (6) **Tauri config** — `tauri.conf.json` ships no `"all": true` / `"$HOME/**"` FS or HTTP scope (no `fs`/`http` plugin enabled at all); the window set is a single fixed main window. `cargo test -p wifi-densepose-desktop --no-default-features`: lib **18 → 21 passed** (+3 validator pins), integration **21 → 21**, 0 failed. Workspace otherwise unchanged; Python deterministic proof unchanged (`f8e76f21a0f9852b70b6d9dd5318239f6b20cbcb4cdd995863263cecdc446f7a`, bit-exact — the desktop crate is off the signal proof path). Both findings warrant **ADR slot 178**. 
- **`ruview-swarm` beyond-SOTA security + correctness review (ADR-148 drone swarm control plane; needs ADR slot 176) — 4 real fail-open / DoS bugs fixed in the NaN-state-poisoning class, each pinned fails-on-old / passes-on-new; 5 dimensions confirmed clean with evidence.** The shared theme is **IEEE-754 NaN/Inf silently defeating a safety comparison** on data that crosses the untrusted swarm-comm trust boundary (`SwarmOrchestrator::receive_peer_state` / `receive_peer_detection` accept full `DroneState`/`CsiDetection` whose f64/f32 fields deserialize with no finite-check; the integer-encoded MAVLink wire formats in `mavlink_messages.rs` cannot carry NaN, but the serde struct path can). **(1) HIGH — `failsafe::FailSafeMachine::tick` collision-avoidance + battery fail-open** (`failsafe/mod.rs:51,75`). `nearest_neighbor_dist < collision_dist_m` and `battery_pct <= rth_pct` both evaluate `false` for a NaN operand, so a poisoned peer position (→ NaN `nearest_peer_distance` via `Position3D::distance_to`) **silently disabled collision avoidance** and a NaN battery reading kept a drone Nominal — the worst failure for a physical airframe. Fixed to fail CLOSED (`!is_finite() ||` → `EmergencyDiverge` / `ReturnToHome`). MEASURED fails-on-old: `test_nan_neighbor_distance_fails_closed_to_diverge` / `test_nan_battery_fails_closed_to_rth` both returned `Nominal` pre-fix. **(2) MEDIUM — `security::geofence::Geofence::check` NaN-altitude bypass** (`security/geofence.rs:33`). A NaN `z` (altitude) with valid x/y skipped the altitude breach (`NaN < min || NaN > max` = `false`) and returned **`Safe`** through the point-in-polygon path — a silent geofence bypass. Fixed with a leading non-finite-coordinate → `HardBreach` guard. MEASURED fails-on-old: `test_nan_altitude_fails_closed` returned `Safe` pre-fix. **(3) MEDIUM/DoS — `security::antijamming::FhssRadio` `% 0` panic on empty `channels_mhz`** (`security/antijamming.rs:65,71,102`). 
`FhssConfig` is `Deserialize`; an empty channel list (malformed/hostile config) made `next_hop`/`current_channel_mhz`/`evasive_hop`/`tick` panic with `remainder with a divisor of zero`, crashing the radio task. Fixed with `len == 0` early-returns (benign `0.0` sentinel). MEASURED fails-on-old: `test_empty_channels_does_not_panic` **panicked** (`divisor of zero`) pre-fix. **(4) LOW — `sensing::multiview::MultiViewFusion::fuse` NaN victim-position propagation** (`sensing/multiview.rs:70`). A NaN `victim_position` passed the `is_some()` filter and propagated through the confidence-weighted average into the fused "confirmed victim" location dispatched to the swarm. Fixed by requiring finite `confidence` + finite position components (fail-closed drop). MEASURED fails-on-old: `test_nan_victim_position_dropped_from_fusion` produced a non-finite fused position pre-fix. **Dimensions confirmed clean (with evidence):** (a) **MAVLink decode panic-safety** — `SwarmNodeState::decode(&[u8;20])` `try_into().unwrap()`s are over fixed const ranges of a fixed-size array (provably infallible; no arbitrary-length `&[u8]` path exists). (b) **UWB GPS anti-spoofing is NaN-safe** — `(gps_dist - uwb_dist).abs() <= tol` already fails CLOSED on a NaN range/position (counts as inconsistent → spoof rejected), verified by reasoning + existing `test_spoofed_gps_invalid`. (c) **Bounded grid / no allocation-from-length-field** — `ProbabilityGrid::update_bayesian`/`mark_scanned` bounds-check `cx >= width || cy >= height`; `pos_to_cell` uses saturating `as u32` (Rust `as` saturates, no UB). (d) **Mesh `nearest_k` NaN-safe sort** — `partial_cmp(..).unwrap_or(Equal)` cannot panic on NaN distances. (e) **No hardcoded secrets** — `MavlinkSigner` key is constructor-injected (`[u8;32]`), nothing embedded. 
**Documented-not-fixed (for ADR-176, not churned to avoid test-rewrite risk):** (i) **Raft `AppendEntries` lacks the Log-Matching consistency check** (`topology/raft.rs:187`) — a follower appends leader entries on `term >= current_term` without validating `prev_log_index`/`prev_log_term`, so a malformed/byzantine leader can corrupt a follower's log (a genuine consensus-safety gap; vote tallying is also delegated to the caller per the existing `handle_message` comment). (ii) **`MavlinkSigner::verify` uses a non-constant-time tag `==` and has no replay/timestamp-window rejection** (`security/mavlink_signing.rs:64`) — the doc comment already flags the replay limitation as a known demo/test simplification. `cargo test -p ruview-swarm --no-default-features`: **117 → 123 passed, 0 failed** (+6 pins). Workspace green; Python deterministic proof unchanged (`f8e76f21a0f9852b70b6d9dd5318239f6b20cbcb4cdd995863263cecdc446f7a`, bit-exact — `ruview-swarm` is off the signal proof path). diff --git a/v2/crates/wifi-densepose-wasm-edge/src/bin/ghost_hunter.rs b/v2/crates/wifi-densepose-wasm-edge/src/bin/ghost_hunter.rs index 5d40314b..f60f1b74 100644 --- a/v2/crates/wifi-densepose-wasm-edge/src/bin/ghost_hunter.rs +++ b/v2/crates/wifi-densepose-wasm-edge/src/bin/ghost_hunter.rs @@ -20,6 +20,7 @@ use wifi_densepose_wasm_edge::{ host_get_phase, host_get_amplitude, host_get_variance, host_get_presence, host_get_motion_energy, host_emit_event, host_log, + sanitize_host_f32, exo_ghost_hunter::GhostHunterDetector, }; @@ -64,14 +65,16 @@ pub extern "C" fn on_frame(n_subcarriers: i32) { for i in 0..max_sc { unsafe { - phases[i] = host_get_phase(i as i32); - amplitudes[i] = host_get_amplitude(i as i32); - variances[i] = host_get_variance(i as i32); + // Sanitize at the boundary: a non-finite host value would otherwise + // latch NaN into the detector's persistent anomaly-energy state. 
+ phases[i] = sanitize_host_f32(host_get_phase(i as i32)); + amplitudes[i] = sanitize_host_f32(host_get_amplitude(i as i32)); + variances[i] = sanitize_host_f32(host_get_variance(i as i32)); } } let presence = unsafe { host_get_presence() }; - let motion_energy = unsafe { host_get_motion_energy() }; + let motion_energy = sanitize_host_f32(unsafe { host_get_motion_energy() }); let detector = unsafe { &mut *core::ptr::addr_of_mut!(DETECTOR) }; let events = detector.process_frame( diff --git a/v2/crates/wifi-densepose-wasm-edge/src/lib.rs b/v2/crates/wifi-densepose-wasm-edge/src/lib.rs index 2b3b7065..6120d58a 100644 --- a/v2/crates/wifi-densepose-wasm-edge/src/lib.rs +++ b/v2/crates/wifi-densepose-wasm-edge/src/lib.rs @@ -572,6 +572,35 @@ pub mod event_types { pub const HEALING_COMPLETE: i32 = 888; } +/// Sanitize a raw `f32` read from the host CSI imports. +/// +/// ## NaN-state-poisoning guard (ADR-040 boundary hardening) +/// +/// The `csi_get_phase`/`csi_get_amplitude`/`csi_get_variance`/… host imports +/// return raw IEEE-754 `f32`. A single non-finite value (NaN / ±∞) — from a +/// firmware DSP bug, an uninitialised buffer, or a hostile host — propagates +/// silently into the long-lived per-module accumulators (EMA, Welford, +/// phasor sums, baseline means). Once latched, every downstream comparison +/// against the poisoned state evaluates `false`, so detectors fail *degraded* +/// (stuck gate state, suppressed anomaly checks) rather than recovering. +/// +/// This is the single chokepoint: every one of the ~70 edge modules receives +/// its frame data from the `on_frame` boundaries below, so mapping non-finite +/// host floats to `0.0` here protects the entire surface without per-module +/// churn. Mirrors the M-01 negative-`n_subcarriers` clamp at the same site. 
+/// +/// `0.0` is the neutral choice: a zero phase/amplitude/variance reads as a +/// quiet subcarrier, which the detectors already handle (it cannot, itself, +/// trip an anomaly the way a poisoned NaN can permanently disable one). +#[inline] +pub fn sanitize_host_f32(v: f32) -> f32 { + if v.is_finite() { + v + } else { + 0.0 + } +} + /// Log a message string to the ESP32 console (via host_log import). #[cfg(target_arch = "wasm32")] pub fn log_msg(msg: &str) { @@ -650,8 +679,10 @@ pub extern "C" fn on_frame(n_subcarriers: i32) { for i in 0..max_sc { unsafe { - phases[i] = host_get_phase(i as i32); - amps[i] = host_get_amplitude(i as i32); + // Sanitize at the boundary: a non-finite host value would otherwise + // latch NaN into the gesture/coherence/anomaly persistent state. + phases[i] = sanitize_host_f32(host_get_phase(i as i32)); + amps[i] = sanitize_host_f32(host_get_amplitude(i as i32)); } } @@ -677,10 +708,71 @@ pub extern "C" fn on_frame(n_subcarriers: i32) { pub extern "C" fn on_timer() { // Periodic summary. let state = unsafe { &*core::ptr::addr_of!(STATE) }; - let motion = unsafe { host_get_motion_energy() }; + let motion = sanitize_host_f32(unsafe { host_get_motion_energy() }); emit(event_types::CUSTOM_METRIC, motion); if state.frame_count % 100 == 0 { log_msg("wasm-edge: heartbeat"); } } + +// ── Boundary-hardening tests (ADR-040) ─────────────────────────────────────── + +#[cfg(test)] +mod boundary_tests { + use super::*; + + #[test] + fn sanitize_passes_finite_values_through() { + assert_eq!(sanitize_host_f32(0.0), 0.0); + assert_eq!(sanitize_host_f32(-3.5), -3.5); + assert_eq!(sanitize_host_f32(1234.5), 1234.5); + assert_eq!(sanitize_host_f32(f32::MIN), f32::MIN); + assert_eq!(sanitize_host_f32(f32::MAX), f32::MAX); + } + + #[test] + fn sanitize_maps_non_finite_to_zero() { + // NaN / ±∞ from a buggy or hostile host must not reach module state. 
+ assert_eq!(sanitize_host_f32(f32::NAN), 0.0); + assert_eq!(sanitize_host_f32(f32::INFINITY), 0.0); + assert_eq!(sanitize_host_f32(f32::NEG_INFINITY), 0.0); + // A NaN produced by an invalid operation (0.0 * inf) is also caught. + assert_eq!(sanitize_host_f32(0.0f32 * f32::INFINITY), 0.0); + } + + /// Demonstrates the downstream hazard the boundary guard prevents: + /// feeding a raw NaN phase into a persistent module permanently latches + /// its smoothed state, whereas a boundary-sanitized 0.0 keeps it healthy. + #[test] + fn coherence_monitor_nan_latches_without_sanitize_but_not_with() { + use crate::coherence::CoherenceMonitor; + + // Without sanitize: a single NaN frame poisons the EMA forever. + let mut poisoned = CoherenceMonitor::new(); + poisoned.process_frame(&[0.1, 0.2, 0.3]); // init + let _ = poisoned.process_frame(&[f32::NAN, 0.2, 0.3]); // raw host NaN + // Subsequent *clean* frames can never restore a finite score. + for _ in 0..50 { + poisoned.process_frame(&[0.1, 0.2, 0.3]); + } + assert!( + poisoned.coherence_score().is_nan(), + "raw NaN should latch the smoothed coherence (documents the hazard)" + ); + + // With the boundary guard applied (what on_frame now does), the NaN is + // mapped to a finite value before it ever reaches the module. + let mut guarded = CoherenceMonitor::new(); + let f = |x: f32| sanitize_host_f32(x); + guarded.process_frame(&[f(0.1), f(0.2), f(0.3)]); // init + let _ = guarded.process_frame(&[f(f32::NAN), f(0.2), f(0.3)]); + for _ in 0..50 { + guarded.process_frame(&[f(0.1), f(0.2), f(0.3)]); + } + assert!( + guarded.coherence_score().is_finite(), + "boundary-sanitized input keeps the module state finite" + ); + } +}