feat(adr-110): Prometheus exposition endpoint /api/v1/mesh/metrics

Iter 36 — Grafana / Home Assistant Prometheus integration / Cognitum
Seed observability stack can now scrape mesh state directly with no
JSON-to-metric translation layer.

Endpoint: GET /api/v1/mesh/metrics → text/plain (Prometheus exposition
format v0.0.4). Eight gauges, one per NodeSyncSnapshot field, labeled
by node:

  wifi_densepose_mesh_offset_us{node="N"}        <signed-int>
  wifi_densepose_mesh_is_leader{node="N"}        0|1
  wifi_densepose_mesh_is_valid{node="N"}         0|1
  wifi_densepose_mesh_smoothed{node="N"}         0|1
  wifi_densepose_mesh_sequence{node="N"}         <u32>
  wifi_densepose_mesh_csi_fps{node="N"}          <float>
  wifi_densepose_mesh_csi_fps_samples{node="N"}  <u32>
  wifi_densepose_mesh_staleness_ms{node="N"}     <u64>

Each metric carries the standard `# HELP` + `# TYPE` headers before
its series block, exactly the format Prometheus + most scrape-format
implementations expect.

Implementation reuses iter-30's `NodeState::sync_snapshot()` as the
single source of truth — same data the JSON endpoints emit, just
text-formatted with `{node=...}` labels. Nodes without a fresh sync
are absent (Prometheus handles missing series natively).

Test added (8/8 sync_snapshot_helper_tests now green):
  bool_metric_returns_zero_or_one_as_text
    Pins the Prometheus convention that boolean gauges emit "0" or "1"
    literally, never "false"/"true" — if anyone refactors the helper
    to format!("{b}"), Prometheus would fail to parse the sample value
    and mark the scrape as failed; this test catches that drift before
    production.

User-guide REST table updated with the new endpoint.

Grafana / HA scrape config:
  - job_name: wifi-densepose-mesh
    scrape_interval: 5s
    metrics_path: /api/v1/mesh/metrics
    static_configs:
      - targets: ['localhost:3000']

Co-Authored-By: claude-flow <ruv@ruv.net>
This commit is contained in:
ruv 2026-05-23 15:03:51 -04:00
parent 883765150c
commit 74eb09f604
2 changed files with 83 additions and 0 deletions

View File

@ -475,6 +475,7 @@ Base URL: `http://localhost:3000` (Docker) or `http://localhost:8080` (binary de
| `POST` | `/api/v1/adaptive/unload` | Unload adaptive model | `{"success":true}` |
| `GET` | `/api/v1/mesh` | ADR-110 fleet-wide mesh sync map ([iter 29](adr/ADR-110-esp32-c6-firmware-extension.md)) | `{"nodes":{"9":{...},"12":{...}},"total":2}` |
| `GET` | `/api/v1/nodes/:id/sync` | Single-node mesh sync snapshot (or 404) | `{"offset_us":1163565,"is_leader":false,...}` |
| `GET` | `/api/v1/mesh/metrics` | ADR-110 mesh state in Prometheus exposition format ([iter 36](adr/ADR-110-esp32-c6-firmware-extension.md)) | `wifi_densepose_mesh_offset_us{node="9"} 1163565\n…` |
### Example: Get fleet mesh state (ADR-110)

View File

@ -4132,6 +4132,77 @@ async fn node_sync_endpoint(
/// 200 → { "nodes": { "<id>": NodeSyncSnapshot, ... }, "total": N }
/// Nodes without a recent sync are omitted from the map; an empty
/// `nodes` object means no mesh peers reachable.
// NOTE(review): the `///` lines immediately above this item describe a JSON
// `{ "nodes": ..., "total": N }` response, which looks like `mesh_endpoint`'s
// documentation; with this handler sitting between them and `mesh_endpoint`,
// rustdoc attaches them here instead. Confirm and move them back.
/// ADR-110 iter 36 — Prometheus exposition format for mesh state.
///
/// `GET /api/v1/mesh/metrics` → `text/plain; version=0.0.4`. One gauge per
/// `NodeSyncSnapshot` field, labeled by node id:
///
/// ```text
/// wifi_densepose_mesh_offset_us{node="N"}        <signed-int>
/// wifi_densepose_mesh_is_leader{node="N"}        0|1
/// wifi_densepose_mesh_is_valid{node="N"}         0|1
/// wifi_densepose_mesh_smoothed{node="N"}         0|1
/// wifi_densepose_mesh_sequence{node="N"}         <u32>
/// wifi_densepose_mesh_csi_fps{node="N"}          <float>
/// wifi_densepose_mesh_csi_fps_samples{node="N"}  <u32>
/// wifi_densepose_mesh_staleness_ms{node="N"}     <u64>
/// ```
///
/// Spec: <https://prometheus.io/docs/instrumenting/exposition_formats/>.
///
/// Every metric gets its `# HELP` and `# TYPE` header even when no node
/// contributes a sample. Nodes for which `sync_snapshot()` returns `None`
/// are absent from the output; Prometheus marks a series stale once it is
/// missing from a successful scrape, so no placeholder value is needed.
/// For the same reason a snapshot whose `staleness_ms` is `None` emits no
/// `wifi_densepose_mesh_staleness_ms` sample rather than a misleading `0`.
///
/// Series are emitted in ascending node-id order so consecutive scrapes of
/// an unchanged fleet produce identically ordered output.
async fn mesh_metrics_endpoint(State(state): State<SharedState>) -> impl IntoResponse {
    use std::fmt::Write;

    /// One exported metric: its exposition metadata plus the function that
    /// renders a sample value from a snapshot. Returning `None` means "emit
    /// no sample for this node". Keeping the extractor next to the name makes
    /// it impossible to declare a metric without a value source.
    struct MetricSpec {
        name: &'static str,
        help: &'static str,
        kind: &'static str,
        value: fn(&NodeSyncSnapshot) -> Option<String>,
    }

    let metrics = [
        MetricSpec {
            name: "wifi_densepose_mesh_offset_us",
            help: "Cross-board mesh-aligned offset, microseconds (signed)",
            kind: "gauge",
            value: |snap: &NodeSyncSnapshot| Some(snap.offset_us.to_string()),
        },
        MetricSpec {
            name: "wifi_densepose_mesh_is_leader",
            help: "1 if this node is the elected mesh leader, else 0",
            kind: "gauge",
            value: |snap: &NodeSyncSnapshot| Some(bool_metric(snap.is_leader)),
        },
        MetricSpec {
            name: "wifi_densepose_mesh_is_valid",
            help: "1 if this node has heard a fresh leader beacon, else 0",
            kind: "gauge",
            value: |snap: &NodeSyncSnapshot| Some(bool_metric(snap.is_valid)),
        },
        MetricSpec {
            name: "wifi_densepose_mesh_smoothed",
            help: "1 once the firmware-side EMA filter has seeded, else 0",
            kind: "gauge",
            value: |snap: &NodeSyncSnapshot| Some(bool_metric(snap.smoothed)),
        },
        MetricSpec {
            name: "wifi_densepose_mesh_sequence",
            help: "High-water CSI sequence at sync emit time",
            kind: "gauge",
            value: |snap: &NodeSyncSnapshot| Some(snap.sequence.to_string()),
        },
        MetricSpec {
            name: "wifi_densepose_mesh_csi_fps",
            help: "Per-node measured CSI frame rate (Hz)",
            kind: "gauge",
            value: |snap: &NodeSyncSnapshot| Some(format!("{:.3}", snap.csi_fps_ema)),
        },
        MetricSpec {
            name: "wifi_densepose_mesh_csi_fps_samples",
            help: "How many inter-frame deltas the fps EMA has seen",
            kind: "gauge",
            value: |snap: &NodeSyncSnapshot| Some(snap.csi_fps_samples.to_string()),
        },
        MetricSpec {
            name: "wifi_densepose_mesh_staleness_ms",
            help: "Milliseconds since the host last received this node's sync packet",
            kind: "gauge",
            // Unknown staleness must not be reported as 0 ("just received").
            value: |snap: &NodeSyncSnapshot| snap.staleness_ms.map(|ms| ms.to_string()),
        },
    ];

    // Copy the snapshots out once so every metric renders the same node set,
    // then release the read lock before doing any string formatting.
    let guard = state.read().await;
    let mut snaps: Vec<(u8, NodeSyncSnapshot)> = guard
        .node_states
        .iter()
        .filter_map(|(&id, ns)| ns.sync_snapshot().map(|snap| (id, snap)))
        .collect();
    drop(guard);
    snaps.sort_unstable_by_key(|(id, _)| *id);

    let mut body = String::with_capacity(1024);
    for spec in &metrics {
        let (name, help, kind) = (spec.name, spec.help, spec.kind);
        // `fmt::Write` for `String` never fails, so the results are ignored.
        let _ = writeln!(body, "# HELP {name} {help}");
        let _ = writeln!(body, "# TYPE {name} {kind}");
        for (id, snap) in &snaps {
            if let Some(value) = (spec.value)(snap) {
                let _ = writeln!(body, "{name}{{node=\"{id}\"}} {value}");
            }
        }
    }

    ([(axum::http::header::CONTENT_TYPE, "text/plain; version=0.0.4")], body)
}
/// Renders a boolean as a Prometheus sample value: `"1"` for `true`,
/// `"0"` for `false` (never `"true"`/`"false"`).
fn bool_metric(b: bool) -> String {
    let digit = if b { "1" } else { "0" };
    digit.to_owned()
}
async fn mesh_endpoint(State(state): State<SharedState>) -> Json<serde_json::Value> {
let s = state.read().await;
let mut nodes = serde_json::Map::new();
@ -5644,6 +5715,7 @@ async fn main() {
// ADR-110 iter 29 — per-node mesh sync state for HTTP clients.
.route("/api/v1/nodes/:id/sync", get(node_sync_endpoint))
.route("/api/v1/mesh", get(mesh_endpoint))
.route("/api/v1/mesh/metrics", get(mesh_metrics_endpoint))
// Vital sign endpoints
.route("/api/v1/vital-signs", get(vital_signs_endpoint))
.route("/api/v1/edge-vitals", get(edge_vitals_endpoint))
@ -5973,6 +6045,16 @@ mod sync_snapshot_helper_tests {
"expected ~750 ms staleness, got {} ms", st);
}
#[test]
fn bool_metric_returns_zero_or_one_as_text() {
    // Prometheus sample values must be numeric, so boolean gauges have to be
    // rendered as the literal text "1" / "0". If the helper were changed to
    // `format!("{}", b)` it would emit "true" / "false", which a Prometheus
    // scraper cannot parse as a sample value, so the scrape would fail.
    // Pin the expected text here instead of finding out in production.
    for (flag, expected) in [(true, "1"), (false, "0")] {
        assert_eq!(super::bool_metric(flag), expected);
    }
}
#[test]
fn mesh_aligned_us_honors_9s_staleness_gate() {
// The receive helper stores latest_sync_at = Instant::now() each