feat(adr-110): fleet cardinality gauge wifi_densepose_mesh_node_total
Iter 37 — adds a fleet-summary gauge to the iter-36 Prometheus
exposition. Ops dashboards now answer "how many leaders / followers
/ no-sync nodes are there right now" in one scrape, without having
to scrape every per-node series and aggregate client-side.
# HELP wifi_densepose_mesh_node_total Per-state node count across the fleet
# TYPE wifi_densepose_mesh_node_total gauge
wifi_densepose_mesh_node_total{state="leader"} 1
wifi_densepose_mesh_node_total{state="follower"} 2
wifi_densepose_mesh_node_total{state="no_sync"} 0
- leader / follower split derived from snapshot.is_leader
- no_sync = total_nodes_in_state - nodes_with_snapshot
(so a node that has sent CSI frames but never a sync packet
shows up here, which is what an operator wants to alert on)
Implementation factored as a free function `fleet_role_counts` so the
math is testable without spinning up the axum handler. This is the same
pattern used by iter 18 (update_csi_fps_ema) and iter 30 (sync_snapshot).
Test added (9/9 sync_snapshot_helper_tests now green):
fleet_role_counts_classifies_correctly
Three cases:
- empty fleet → (0, 0)
- 1 leader + 2 followers → (1, 2)
- all-leaders edge case → (2, 0) (election prevents this in
practice but the gauge math must still be consistent)
Useful Grafana queries this unlocks:
- sum(wifi_densepose_mesh_node_total{state="follower"})
→ total reachable follower count
- wifi_densepose_mesh_node_total{state="no_sync"} > 0
→ alert when any node is sending CSI frames but has no sync snapshot
Co-Authored-By: claude-flow <ruv@ruv.net>
This commit is contained in:
parent
74eb09f604
commit
9c49ff1a38
|
|
@ -4179,6 +4179,18 @@ async fn mesh_metrics_endpoint(State(state): State<SharedState>) -> impl IntoRes
|
||||||
.filter_map(|(&id, ns)| ns.sync_snapshot().map(|snap| (id, snap)))
|
.filter_map(|(&id, ns)| ns.sync_snapshot().map(|snap| (id, snap)))
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
|
// Iter 37: fleet cardinality summary — Ops dashboards want the
|
||||||
|
// "how many leaders / followers / no-sync" tally at a glance
|
||||||
|
// without scraping every per-node series and counting.
|
||||||
|
let (leaders, followers) = fleet_role_counts(&snaps);
|
||||||
|
let no_sync = s.node_states.len().saturating_sub(snaps.len()) as u64;
|
||||||
|
let _ = writeln!(body,
|
||||||
|
"# HELP wifi_densepose_mesh_node_total Per-state node count across the fleet");
|
||||||
|
let _ = writeln!(body, "# TYPE wifi_densepose_mesh_node_total gauge");
|
||||||
|
let _ = writeln!(body, "wifi_densepose_mesh_node_total{{state=\"leader\"}} {leaders}");
|
||||||
|
let _ = writeln!(body, "wifi_densepose_mesh_node_total{{state=\"follower\"}} {followers}");
|
||||||
|
let _ = writeln!(body, "wifi_densepose_mesh_node_total{{state=\"no_sync\"}} {no_sync}");
|
||||||
|
|
||||||
for (name, help, kind) in metrics {
|
for (name, help, kind) in metrics {
|
||||||
let _ = writeln!(body, "# HELP {name} {help}");
|
let _ = writeln!(body, "# HELP {name} {help}");
|
||||||
let _ = writeln!(body, "# TYPE {name} {kind}");
|
let _ = writeln!(body, "# TYPE {name} {kind}");
|
||||||
|
|
@ -4203,6 +4215,14 @@ async fn mesh_metrics_endpoint(State(state): State<SharedState>) -> impl IntoRes
|
||||||
|
|
||||||
fn bool_metric(b: bool) -> String { (if b { 1 } else { 0 }).to_string() }
|
/// Renders a boolean as the text `"1"` (true) or `"0"` (false).
fn bool_metric(b: bool) -> String {
    String::from(if b { "1" } else { "0" })
}
|
||||||
|
|
||||||
|
/// ADR-110 iter 37 — count (leaders, followers) in a populated snapshot set.
|
||||||
|
/// Free function for testability — same pattern as iter 18's `update_csi_fps_ema`.
|
||||||
|
pub(crate) fn fleet_role_counts(snaps: &[(u8, NodeSyncSnapshot)]) -> (u64, u64) {
|
||||||
|
let leaders = snaps.iter().filter(|(_, s)| s.is_leader).count() as u64;
|
||||||
|
let followers = (snaps.len() as u64).saturating_sub(leaders);
|
||||||
|
(leaders, followers)
|
||||||
|
}
|
||||||
|
|
||||||
async fn mesh_endpoint(State(state): State<SharedState>) -> Json<serde_json::Value> {
|
async fn mesh_endpoint(State(state): State<SharedState>) -> Json<serde_json::Value> {
|
||||||
let s = state.read().await;
|
let s = state.read().await;
|
||||||
let mut nodes = serde_json::Map::new();
|
let mut nodes = serde_json::Map::new();
|
||||||
|
|
@ -6045,6 +6065,25 @@ mod sync_snapshot_helper_tests {
|
||||||
"expected ~750 ms staleness, got {} ms", st);
|
"expected ~750 ms staleness, got {} ms", st);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn fleet_role_counts_classifies_correctly() {
    // Iter 37 — pins the leader/follower tally behind the Prometheus
    // `wifi_densepose_mesh_node_total{state=...}` gauge.
    // The fixture is built locally so this test does not depend on
    // helpers from other test modules.
    let make = |is_leader: bool| NodeSyncSnapshot {
        offset_us: 0,
        is_leader,
        is_valid: true,
        smoothed: true,
        sequence: 0,
        csi_fps_ema: 10.0,
        csi_fps_samples: 10,
        staleness_ms: Some(0),
    };

    // Empty fleet: nothing to count.
    let empty: [(u8, NodeSyncSnapshot); 0] = [];
    assert_eq!(super::fleet_role_counts(&empty), (0, 0));

    // One leader plus two followers.
    let mixed = [(12u8, make(true)), (9, make(false)), (3, make(false))];
    assert_eq!(super::fleet_role_counts(&mixed), (1, 2));

    // Edge: all leaders (election would prevent this but gauge math must hold).
    let all_leaders = [(1u8, make(true)), (2, make(true))];
    assert_eq!(super::fleet_role_counts(&all_leaders), (2, 0));
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn bool_metric_returns_zero_or_one_as_text() {
|
fn bool_metric_returns_zero_or_one_as_text() {
|
||||||
// Locks the Prometheus exposition convention: gauges holding a
|
// Locks the Prometheus exposition convention: gauges holding a
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue