From 74eb09f60404f2c828a7aa26a8f553cfb48cda40 Mon Sep 17 00:00:00 2001 From: ruv Date: Sat, 23 May 2026 15:03:51 -0400 Subject: [PATCH] feat(adr-110): Prometheus exposition endpoint /api/v1/mesh/metrics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Iter 36 — Grafana / Home Assistant Prometheus integration / Cognitum Seed observability stack can now scrape mesh state directly with no JSON-to-metric translation layer. Endpoint: GET /api/v1/mesh/metrics → text/plain (Prometheus exposition format v0.0.4). Eight gauges, one per NodeSyncSnapshot field, labeled by node: wifi_densepose_mesh_offset_us{node="N"} wifi_densepose_mesh_is_leader{node="N"} 0|1 wifi_densepose_mesh_is_valid{node="N"} 0|1 wifi_densepose_mesh_smoothed{node="N"} 0|1 wifi_densepose_mesh_sequence{node="N"} wifi_densepose_mesh_csi_fps{node="N"} wifi_densepose_mesh_csi_fps_samples{node="N"} wifi_densepose_mesh_staleness_ms{node="N"} Each metric carries the standard `# HELP` + `# TYPE` headers before its series block, exactly the format Prometheus + most scrape-format implementations expect. Implementation reuses iter-30's `NodeState::sync_snapshot()` as the single source of truth — same data the JSON endpoints emit, just text-formatted with `{node=...}` labels. Nodes without a fresh sync are absent (Prometheus handles missing series natively). Test added (8/8 sync_snapshot_helper_tests now green): bool_metric_returns_zero_or_one_as_text Pins the Prometheus convention that boolean gauges emit "0" or "1" literally, never "false"/"true" — if anyone refactors the helper to format!("{b}"), Prometheus would fail to parse the exposition text and mark the whole scrape as failed; this test catches that drift before production. User-guide REST table updated with the new endpoint.
Grafana / HA scrape config: - job_name: wifi-densepose-mesh scrape_interval: 5s metrics_path: /api/v1/mesh/metrics static_configs: - targets: ['localhost:3000'] Co-Authored-By: claude-flow --- docs/user-guide.md | 1 + .../wifi-densepose-sensing-server/src/main.rs | 82 +++++++++++++++++++ 2 files changed, 83 insertions(+) diff --git a/docs/user-guide.md b/docs/user-guide.md index b717e1bf..692b22a1 100644 --- a/docs/user-guide.md +++ b/docs/user-guide.md @@ -475,6 +475,7 @@ Base URL: `http://localhost:3000` (Docker) or `http://localhost:8080` (binary de | `POST` | `/api/v1/adaptive/unload` | Unload adaptive model | `{"success":true}` | | `GET` | `/api/v1/mesh` | ADR-110 fleet-wide mesh sync map ([iter 29](adr/ADR-110-esp32-c6-firmware-extension.md)) | `{"nodes":{"9":{...},"12":{...}},"total":2}` | | `GET` | `/api/v1/nodes/:id/sync` | Single-node mesh sync snapshot (or 404) | `{"offset_us":1163565,"is_leader":false,...}` | +| `GET` | `/api/v1/mesh/metrics` | ADR-110 mesh state in Prometheus exposition format ([iter 36](adr/ADR-110-esp32-c6-firmware-extension.md)) | `wifi_densepose_mesh_offset_us{node="9"} 1163565\n…` | ### Example: Get fleet mesh state (ADR-110) diff --git a/v2/crates/wifi-densepose-sensing-server/src/main.rs b/v2/crates/wifi-densepose-sensing-server/src/main.rs index dc88d0cb..014a61a0 100644 --- a/v2/crates/wifi-densepose-sensing-server/src/main.rs +++ b/v2/crates/wifi-densepose-sensing-server/src/main.rs @@ -4132,6 +4132,77 @@ async fn node_sync_endpoint( /// 200 → { "nodes": { "": NodeSyncSnapshot, ... }, "total": N } /// Nodes without a recent sync are omitted from the map; an empty /// `nodes` object means no mesh peers reachable. +/// ADR-110 iter 36 — Prometheus exposition format for mesh state. 
+/// +/// GET /api/v1/mesh/metrics → text/plain +/// wifi_densepose_mesh_offset_us{node="N"} +/// wifi_densepose_mesh_is_leader{node="N"} 0|1 +/// wifi_densepose_mesh_is_valid{node="N"} 0|1 +/// wifi_densepose_mesh_smoothed{node="N"} 0|1 +/// wifi_densepose_mesh_sequence{node="N"} +/// wifi_densepose_mesh_csi_fps{node="N"} +/// wifi_densepose_mesh_csi_fps_samples{node="N"} +/// wifi_densepose_mesh_staleness_ms{node="N"} +/// +/// Spec: <https://prometheus.io/docs/instrumenting/exposition_formats/>. +/// Each metric is a gauge labeled by node_id. Nodes without a fresh sync +/// are simply absent from the output (Prometheus handles missing series +/// natively — the scrape just reports them as stale after the configured +/// staleness duration). +async fn mesh_metrics_endpoint(State(state): State) -> impl IntoResponse { + use std::fmt::Write; + let s = state.read().await; + let mut body = String::with_capacity(1024); + + // Each metric: HELP + TYPE header + one line per node that has a snapshot. + let metrics: &[(&str, &str, &str)] = &[ + ("wifi_densepose_mesh_offset_us", + "Cross-board mesh-aligned offset, microseconds (signed)", "gauge"), + ("wifi_densepose_mesh_is_leader", + "1 if this node is the elected mesh leader, else 0", "gauge"), + ("wifi_densepose_mesh_is_valid", + "1 if this node has heard a fresh leader beacon, else 0", "gauge"), + ("wifi_densepose_mesh_smoothed", + "1 once the firmware-side EMA filter has seeded, else 0", "gauge"), + ("wifi_densepose_mesh_sequence", + "High-water CSI sequence at sync emit time", "gauge"), + ("wifi_densepose_mesh_csi_fps", + "Per-node measured CSI frame rate (Hz)", "gauge"), + ("wifi_densepose_mesh_csi_fps_samples", + "How many inter-frame deltas the fps EMA has seen", "gauge"), + ("wifi_densepose_mesh_staleness_ms", + "Milliseconds since the host last received this node's sync packet", "gauge"), + ]; + + // Collect (id, snapshot) pairs once so each metric loop reads the same set.
+ let snaps: Vec<(u8, NodeSyncSnapshot)> = s.node_states.iter() + .filter_map(|(&id, ns)| ns.sync_snapshot().map(|snap| (id, snap))) + .collect(); + + for (name, help, kind) in metrics { + let _ = writeln!(body, "# HELP {name} {help}"); + let _ = writeln!(body, "# TYPE {name} {kind}"); + for (id, snap) in &snaps { + let value = match *name { + "wifi_densepose_mesh_offset_us" => snap.offset_us.to_string(), + "wifi_densepose_mesh_is_leader" => bool_metric(snap.is_leader), + "wifi_densepose_mesh_is_valid" => bool_metric(snap.is_valid), + "wifi_densepose_mesh_smoothed" => bool_metric(snap.smoothed), + "wifi_densepose_mesh_sequence" => snap.sequence.to_string(), + "wifi_densepose_mesh_csi_fps" => format!("{:.3}", snap.csi_fps_ema), + "wifi_densepose_mesh_csi_fps_samples" => snap.csi_fps_samples.to_string(), + "wifi_densepose_mesh_staleness_ms" => + snap.staleness_ms.map(|n| n.to_string()).unwrap_or_else(|| "0".into()), + _ => continue, + }; + let _ = writeln!(body, "{name}{{node=\"{id}\"}} {value}"); + } + } + ([(axum::http::header::CONTENT_TYPE, "text/plain; version=0.0.4")], body) +} + +fn bool_metric(b: bool) -> String { (if b { 1 } else { 0 }).to_string() } + async fn mesh_endpoint(State(state): State) -> Json { let s = state.read().await; let mut nodes = serde_json::Map::new(); @@ -5644,6 +5715,7 @@ async fn main() { // ADR-110 iter 29 — per-node mesh sync state for HTTP clients. 
.route("/api/v1/nodes/:id/sync", get(node_sync_endpoint)) .route("/api/v1/mesh", get(mesh_endpoint)) + .route("/api/v1/mesh/metrics", get(mesh_metrics_endpoint)) // Vital sign endpoints .route("/api/v1/vital-signs", get(vital_signs_endpoint)) .route("/api/v1/edge-vitals", get(edge_vitals_endpoint)) @@ -5973,6 +6045,16 @@ mod sync_snapshot_helper_tests { "expected ~750 ms staleness, got {} ms", st); } + #[test] + fn bool_metric_returns_zero_or_one_as_text() { + // Locks the Prometheus exposition convention: gauges holding a + // boolean state MUST emit literal "0" or "1", never "false"/"true". + // If anyone changes the helper to format!("{}", b), Prometheus will + // fail to parse the scrape — catch it here instead of in production. + assert_eq!(super::bool_metric(true), "1"); + assert_eq!(super::bool_metric(false), "0"); + } + #[test] fn mesh_aligned_us_honors_9s_staleness_gate() { // The receive helper stores latest_sync_at = Instant::now() each