//! ADR-155 M2 §4 — native (pure-Rust) DensePose conv benchmark. //! //! `DensePoseHead::apply_conv_layer` is a pure-Rust naive 6-nested-loop //! convolution (the §8 "native-conv naive-loop" backlog item). This bench //! measures `forward()` (which runs the shared-conv + segmentation + UV conv //! stacks through that naive loop) on a representative single-layer config so a //! perf claim can be made (or refused) with a MEASURED before/after — never a //! fabricated number. //! //! Reproduce: //! cargo bench -p wifi-densepose-nn --no-default-features --bench native_conv_bench //! //! The bench is `--no-default-features` (no `onnx`/`ort` download needed): the //! conv path is pure-Rust and benchable on any host. use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; use ndarray::{Array1, Array4}; use std::hint::black_box; use wifi_densepose_nn::densepose::{ConvLayerWeights, DensePoseWeights}; use wifi_densepose_nn::{DensePoseConfig, DensePoseHead, Tensor}; /// Build a single same-padding conv layer `in_ch -> out_ch`, kernel `k`, with a /// bias (no batch-norm) — deterministic, small, representative of one stage. fn conv_layer(in_ch: usize, out_ch: usize, k: usize) -> ConvLayerWeights { let weight = Array4::from_shape_fn((out_ch, in_ch, k, k), |(o, i, kh, kw)| { // Deterministic, bounded weights. ((o + i + kh + kw) as f32 * 0.013).sin() }); ConvLayerWeights { weight, bias: Some(Array1::from_shape_fn(out_ch, |o| o as f32 * 0.01)), bn_gamma: None, bn_beta: None, bn_mean: None, bn_var: None, } } /// A head whose shared-conv stack is one `ch->ch` conv, with empty seg/uv heads, /// so the bench isolates a single conv-layer cost. fn single_conv_head(ch: usize, k: usize) -> DensePoseHead { let mut config = DensePoseConfig::new(ch, 1, 2); config.kernel_size = k; config.padding = k / 2; // same padding config.hidden_channels = vec![ch]; let weights = DensePoseWeights { shared_conv: vec![conv_layer(ch, ch, k)], segmentation_head: vec![], uv_head: vec![], }; DensePoseHead::with_weights(config, weights).expect("valid head") } fn bench_native_conv(c: &mut Criterion) { let mut group = c.benchmark_group("native_conv"); // (channels, spatial, kernel) — a modest map and a larger one. for &(ch, hw, k) in &[(16usize, 32usize, 3usize), (32, 32, 3)] { let head = single_conv_head(ch, k); let input = Tensor::Float4D(Array4::from_shape_fn((1, ch, hw, hw), |(_, c, y, x)| { ((c + y + x) as f32 * 0.001).cos() })); // Throughput in output elements processed. group.throughput(Throughput::Elements((ch * hw * hw) as u64)); group.bench_with_input( BenchmarkId::from_parameter(format!("ch{ch}_hw{hw}_k{k}")), &input, |bencher, inp| { bencher.iter(|| { let out = head.forward(black_box(inp)).expect("forward ok"); black_box(out); }); }, ); } group.finish(); } criterion_group!(benches, bench_native_conv); criterion_main!(benches);