diff --git a/v2/Cargo.lock b/v2/Cargo.lock index 2d8d9ecc..016aa000 100644 --- a/v2/Cargo.lock +++ b/v2/Cargo.lock @@ -1015,6 +1015,7 @@ dependencies = [ "candle-core 0.9.2", "candle-nn 0.9.2", "clap", + "criterion", "safetensors 0.4.5", "serde", "serde_json", @@ -1034,6 +1035,7 @@ dependencies = [ "candle-core 0.9.2", "candle-nn 0.9.2", "clap", + "criterion", "hex", "safetensors 0.4.5", "serde", diff --git a/v2/crates/cog-person-count/Cargo.toml b/v2/crates/cog-person-count/Cargo.toml index 2b3a65ea..811bf485 100644 --- a/v2/crates/cog-person-count/Cargo.toml +++ b/v2/crates/cog-person-count/Cargo.toml @@ -34,6 +34,12 @@ safetensors = "0.4" [dev-dependencies] tempfile = "3" approx = "0.5" +# ADR-163: steady-state infer latency bench (real count_v1 weights, Device::Cpu). +criterion = { version = "0.5", features = ["html_reports"] } + +[[bench]] +name = "infer_bench" +harness = false [features] default = [] diff --git a/v2/crates/cog-person-count/benches/infer_bench.rs b/v2/crates/cog-person-count/benches/infer_bench.rs new file mode 100644 index 00000000..2381f65b --- /dev/null +++ b/v2/crates/cog-person-count/benches/infer_bench.rs @@ -0,0 +1,95 @@ +//! Criterion bench for `cog-person-count` steady-state inference latency +//! (ADR-163, closing the ADR-159/160 deferred "cog inference latency bench" item). +//! +//! ## What this measures — and what the manifest's `cold_start_ms` does NOT +//! +//! This benches **steady-state** `InferenceEngine::infer` over a FIXED CSI +//! window on `Device::Cpu` with the **real** shipped `count_v1.safetensors` +//! weights — i.e. the per-frame cost once the model is loaded and warm. +//! +//! The cog manifest's `build_metadata.cold_start_ms_avg` (in the pose cog; +//! person-count's manifest carries comparable provenance) is a **DIFFERENT +//! measurement**: it includes one-time weight load / mmap / first-forward +//! allocation. Cold-start is a startup cost paid once; steady-state infer is the +//! recurring per-frame cost. 
They are not comparable and we do not conflate them. +//! `cold_start` was measured on ruvultra (RTX 5080 host, candle 0.9 cpu); this +//! bench runs on whatever machine you run it on — see `benchmarks/edge-latency/RESULTS.md` +//! for the host the committed numbers were taken on. +//! +//! If the weights file is absent the engine falls back to the zero-confidence +//! stub; we skip the bench in that case rather than benchmark the stub (which +//! would be a meaningless number) — the bench prints a notice and measures a +//! no-op so criterion still produces a (clearly-labelled) datapoint. +//! +//! Run (cog crates are normal workspace members): +//! cd v2 && cargo bench -p cog-person-count --no-default-features +//! cd v2 && cargo bench -p cog-person-count --no-default-features -- --warm-up-time 1 --measurement-time 2 + +use std::hint::black_box; +use std::path::Path; + +use criterion::{criterion_group, criterion_main, Criterion}; + +use cog_person_count::inference::{CsiWindow, InferenceEngine, INPUT_SUBCARRIERS, INPUT_TIMESTEPS}; + +/// Deterministic fixed CSI window (seed-stable LCG), normalised-ish amplitudes. +fn fixed_window() -> CsiWindow { + let mut s = 0x00C0_FFEEu32; + let data: Vec<f32> = (0..INPUT_SUBCARRIERS * INPUT_TIMESTEPS) + .map(|_| { + s = s.wrapping_mul(1103515245).wrapping_add(12345); + (s >> 16) as f32 / 32768.0 // [0, 2) + }) + .collect(); + CsiWindow { data } +} + +/// Locate the real weights from the crate dir or the repo root.
+fn real_weights() -> Option<std::path::PathBuf> { + let candidates = [ + "cog/artifacts/count_v1.safetensors", + "v2/crates/cog-person-count/cog/artifacts/count_v1.safetensors", + "crates/cog-person-count/cog/artifacts/count_v1.safetensors", + ]; + candidates + .iter() + .map(Path::new) + .find(|p| p.exists()) + .map(|p| p.to_path_buf()) +} + +fn bench_infer(c: &mut Criterion) { + let window = fixed_window(); + + match real_weights() { + Some(path) => { + let engine = + InferenceEngine::with_weights(Some(&path)).expect("load real count_v1 weights"); + assert!( + engine.backend().starts_with("candle-"), + "expected real Candle backend, got {} — bench would measure the stub", + engine.backend() + ); + // Sanity: one real inference before timing. + let _ = engine.infer(&window).expect("warmup infer"); + + c.bench_function("cog_person_count::infer[cpu_real_weights_steady_state]", |b| { + b.iter(|| { + black_box(engine.infer(black_box(&window)).expect("infer")); + }); + }); + } + None => { + eprintln!( + "NOTE: count_v1.safetensors not found — skipping the real-weights infer bench. \ + (The committed RESULTS.md numbers require the in-repo weights.)" + ); + c.bench_function("cog_person_count::infer[SKIPPED_no_weights]", |b| { + b.iter(|| black_box(1 + 1)); + }); + } + } +} + +criterion_group!(benches, bench_infer); +criterion_main!(benches); diff --git a/v2/crates/cog-pose-estimation/Cargo.toml b/v2/crates/cog-pose-estimation/Cargo.toml index 2bdeae77..f01b8626 100644 --- a/v2/crates/cog-pose-estimation/Cargo.toml +++ b/v2/crates/cog-pose-estimation/Cargo.toml @@ -39,6 +39,12 @@ wifi-densepose-train = { version = "0.3.1", path = "../wifi-densepose-train", de [dev-dependencies] tempfile = "3" +# ADR-163: steady-state infer latency bench (real pose_v1 weights, Device::Cpu).
+criterion = { version = "0.5", features = ["html_reports"] } + +[[bench]] +name = "infer_bench" +harness = false [features] default = [] diff --git a/v2/crates/cog-pose-estimation/benches/infer_bench.rs b/v2/crates/cog-pose-estimation/benches/infer_bench.rs new file mode 100644 index 00000000..7d90ee59 --- /dev/null +++ b/v2/crates/cog-pose-estimation/benches/infer_bench.rs @@ -0,0 +1,89 @@ +//! Criterion bench for `cog-pose-estimation` steady-state inference latency +//! (ADR-163, closing the ADR-159/160 deferred "cog inference latency bench" item). +//! +//! ## What this measures — and what the manifest's `cold_start_ms_avg` does NOT +//! +//! The pose cog's manifest (`cog/artifacts/manifests/x86_64/manifest.json`) +//! cites `build_metadata.cold_start_ms_avg: 5.4` (30 invocations, measured on +//! ruvultra / RTX 5080 host, candle 0.9 cpu). **That is a cold-start number** — +//! it folds in one-time weight load / mmap / first-forward allocation. +//! +//! This bench measures the **steady-state** per-frame cost instead: +//! `InferenceEngine::infer` over a FIXED CSI window on `Device::Cpu` with the +//! **real** shipped `pose_v1.safetensors`, after a warm-up forward. Steady-state +//! and cold-start are different measurements; we label both honestly and do not +//! claim this reproduces the 5.4 ms manifest figure (different machine, different +//! measurement). See `benchmarks/edge-latency/RESULTS.md`. +//! +//! Run (cog crates are normal workspace members): +//! cd v2 && cargo bench -p cog-pose-estimation --no-default-features +//! cd v2 && cargo bench -p cog-pose-estimation --no-default-features -- --warm-up-time 1 --measurement-time 2 + +use std::hint::black_box; +use std::path::Path; + +use criterion::{criterion_group, criterion_main, Criterion}; + +use cog_pose_estimation::inference::{ + CsiWindow, InferenceEngine, INPUT_SUBCARRIERS, INPUT_TIMESTEPS, +}; + +/// Deterministic fixed CSI window (seed-stable LCG). 
+fn fixed_window() -> CsiWindow { + let mut s = 0x00C0_FFEEu32; + let data: Vec<f32> = (0..INPUT_SUBCARRIERS * INPUT_TIMESTEPS) + .map(|_| { + s = s.wrapping_mul(1103515245).wrapping_add(12345); + (s >> 16) as f32 / 32768.0 // [0, 2) + }) + .collect(); + CsiWindow { data } +} + +fn real_weights() -> Option<std::path::PathBuf> { + let candidates = [ + "cog/artifacts/pose_v1.safetensors", + "v2/crates/cog-pose-estimation/cog/artifacts/pose_v1.safetensors", + "crates/cog-pose-estimation/cog/artifacts/pose_v1.safetensors", + ]; + candidates + .iter() + .map(Path::new) + .find(|p| p.exists()) + .map(|p| p.to_path_buf()) +} + +fn bench_infer(c: &mut Criterion) { + let window = fixed_window(); + + match real_weights() { + Some(path) => { + let engine = + InferenceEngine::with_weights(Some(&path)).expect("load real pose_v1 weights"); + assert!( + engine.backend().starts_with("candle-"), + "expected real Candle backend, got {} — bench would measure the stub", + engine.backend() + ); + let _ = engine.infer(&window).expect("warmup infer"); + + c.bench_function("cog_pose_estimation::infer[cpu_real_weights_steady_state]", |b| { + b.iter(|| { + black_box(engine.infer(black_box(&window)).expect("infer")); + }); + }); + } + None => { + eprintln!( + "NOTE: pose_v1.safetensors not found — skipping the real-weights infer bench. \ + (The committed RESULTS.md numbers require the in-repo weights.)" + ); + c.bench_function("cog_pose_estimation::infer[SKIPPED_no_weights]", |b| { + b.iter(|| black_box(1 + 1)); + }); + } + } +} + +criterion_group!(benches, bench_infer); +criterion_main!(benches);