From 9c49ff1a38ee8960c19e3e418ab65e2ab9099266 Mon Sep 17 00:00:00 2001 From: ruv Date: Sat, 23 May 2026 15:08:16 -0400 Subject: [PATCH] feat(adr-110): fleet cardinality gauge wifi_densepose_mesh_node_total MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Iter 37 — adds a fleet-summary gauge to the iter-36 Prometheus exposition. Ops dashboards now answer "how many leaders / followers / no-sync nodes are there right now" in one scrape, without having to scrape every per-node series and aggregate client-side. # HELP wifi_densepose_mesh_node_total Per-state node count across the fleet # TYPE wifi_densepose_mesh_node_total gauge wifi_densepose_mesh_node_total{state="leader"} 1 wifi_densepose_mesh_node_total{state="follower"} 2 wifi_densepose_mesh_node_total{state="no_sync"} 0 - leader / follower split derived from snapshot.is_leader - no_sync = total_nodes_in_state - nodes_with_snapshot (so a node that has sent CSI frames but never a sync packet shows up here, which is what an operator wants to alert on) Implementation factored as a free function `fleet_role_counts` so the math is testable without spinning up the axum handler. Same pattern iter 18 (update_csi_fps_ema) and iter 30 (sync_snapshot) used. 
Test added (9/9 sync_snapshot_helper_tests now green): fleet_role_counts_classifies_correctly Three cases: - empty fleet → (0, 0) - 1 leader + 2 followers → (1, 2) - all-leaders edge case → (2, 0) (election prevents this in practice but the gauge math must still be consistent) Useful Grafana queries this unlocks: - sum(wifi_densepose_mesh_node_total{state="follower"}) → total reachable follower count - wifi_densepose_mesh_node_total{state="no_sync"} > 0 → alert when any known node has no sync snapshot (it has sent CSI frames but never a sync packet) Co-Authored-By: claude-flow --- .../wifi-densepose-sensing-server/src/main.rs | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/v2/crates/wifi-densepose-sensing-server/src/main.rs b/v2/crates/wifi-densepose-sensing-server/src/main.rs index 014a61a0..de8026c6 100644 --- a/v2/crates/wifi-densepose-sensing-server/src/main.rs +++ b/v2/crates/wifi-densepose-sensing-server/src/main.rs @@ -4179,6 +4179,18 @@ async fn mesh_metrics_endpoint(State(state): State) -> impl IntoRes .filter_map(|(&id, ns)| ns.sync_snapshot().map(|snap| (id, snap))) .collect(); + // Iter 37: fleet cardinality summary — Ops dashboards want the + // "how many leaders / followers / no-sync" tally at a glance + // without scraping every per-node series and counting. 
+ let (leaders, followers) = fleet_role_counts(&snaps); + let no_sync = s.node_states.len().saturating_sub(snaps.len()) as u64; + let _ = writeln!(body, + "# HELP wifi_densepose_mesh_node_total Per-state node count across the fleet"); + let _ = writeln!(body, "# TYPE wifi_densepose_mesh_node_total gauge"); + let _ = writeln!(body, "wifi_densepose_mesh_node_total{{state=\"leader\"}} {leaders}"); + let _ = writeln!(body, "wifi_densepose_mesh_node_total{{state=\"follower\"}} {followers}"); + let _ = writeln!(body, "wifi_densepose_mesh_node_total{{state=\"no_sync\"}} {no_sync}"); + for (name, help, kind) in metrics { let _ = writeln!(body, "# HELP {name} {help}"); let _ = writeln!(body, "# TYPE {name} {kind}"); @@ -4203,6 +4215,14 @@ async fn mesh_metrics_endpoint(State(state): State) -> impl IntoRes fn bool_metric(b: bool) -> String { (if b { 1 } else { 0 }).to_string() } +/// ADR-110 iter 37 — count (leaders, followers) in a populated snapshot set. +/// Free function for testability — same pattern as iter 18's `update_csi_fps_ema`. +pub(crate) fn fleet_role_counts(snaps: &[(u8, NodeSyncSnapshot)]) -> (u64, u64) { + let leaders = snaps.iter().filter(|(_, s)| s.is_leader).count() as u64; + let followers = (snaps.len() as u64).saturating_sub(leaders); + (leaders, followers) +} + async fn mesh_endpoint(State(state): State) -> Json { let s = state.read().await; let mut nodes = serde_json::Map::new(); @@ -6045,6 +6065,25 @@ mod sync_snapshot_helper_tests { "expected ~750 ms staleness, got {} ms", st); } + #[test] + fn fleet_role_counts_classifies_correctly() { + // Iter 37 — verify the leader/follower split that drives the + // Prometheus `wifi_densepose_mesh_node_total{state=...}` gauge. + // Local fixture rather than reaching across test modules. 
+ fn snap(is_leader: bool) -> NodeSyncSnapshot { + NodeSyncSnapshot { + offset_us: 0, is_leader, is_valid: true, smoothed: true, + sequence: 0, csi_fps_ema: 10.0, csi_fps_samples: 10, + staleness_ms: Some(0), + } + } + assert_eq!(super::fleet_role_counts(&[]), (0, 0)); + let snaps = vec![(12u8, snap(true)), (9, snap(false)), (3, snap(false))]; + assert_eq!(super::fleet_role_counts(&snaps), (1, 2)); + // Edge: all leaders (election would prevent this but gauge math must hold). + assert_eq!(super::fleet_role_counts(&[(1u8, snap(true)), (2, snap(true))]), (2, 0)); + } + #[test] fn bool_metric_returns_zero_or_one_as_text() { // Locks the Prometheus exposition convention: gauges holding a