wifi-densepose/v2/crates/wifi-densepose-nn/benches/native_conv_bench.rs

80 lines
3.2 KiB
Rust

//! ADR-155 M2 §4 — native (pure-Rust) DensePose conv benchmark.
//!
//! `DensePoseHead::apply_conv_layer` is a pure-Rust naive 6-nested-loop
//! convolution (the §8 "native-conv naive-loop" backlog item). This bench
//! measures `forward()` (which runs the shared-conv + segmentation + UV conv
//! stacks through that naive loop) on a representative single-layer config so a
//! perf claim can be made (or refused) with a MEASURED before/after — never a
//! fabricated number.
//!
//! Reproduce:
//!
//! ```text
//! cargo bench -p wifi-densepose-nn --no-default-features --bench native_conv_bench
//! ```
//!
//! The bench is `--no-default-features` (no `onnx`/`ort` download needed): the
//! conv path is pure-Rust and benchable on any host.
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use ndarray::{Array1, Array4};
use std::hint::black_box;
use wifi_densepose_nn::densepose::{ConvLayerWeights, DensePoseWeights};
use wifi_densepose_nn::{DensePoseConfig, DensePoseHead, Tensor};
/// Constructs the weights for one `in_ch -> out_ch` convolution with a square
/// `k x k` kernel and a bias term.
///
/// Every value is a pure function of its index, so each run sees identical
/// data: a weight is `sin((o + i + kh + kw) * 0.013)` (hence bounded) and the
/// bias of output channel `o` is `o * 0.01`. All batch-norm fields are `None`.
fn conv_layer(in_ch: usize, out_ch: usize, k: usize) -> ConvLayerWeights {
    // One bias value per output channel.
    let bias = Array1::from_shape_fn(out_ch, |o| o as f32 * 0.01);

    // Kernel tensor laid out as (out_ch, in_ch, k, k).
    let kernel_shape = (out_ch, in_ch, k, k);
    let weight = Array4::from_shape_fn(kernel_shape, |(o, i, kh, kw)| {
        let index_sum = o + i + kh + kw;
        (index_sum as f32 * 0.013).sin()
    });

    ConvLayerWeights {
        weight,
        bias: Some(bias),
        bn_gamma: None,
        bn_beta: None,
        bn_mean: None,
        bn_var: None,
    }
}
/// Builds a `DensePoseHead` whose shared-conv stack holds exactly one
/// `ch -> ch` layer with a `k x k` kernel, and whose segmentation and UV
/// stacks are empty, so that the benchmark isolates the cost of a single conv
/// layer.
///
/// Panics with `"valid head"` if `DensePoseHead::with_weights` rejects the
/// configuration.
fn single_conv_head(ch: usize, k: usize) -> DensePoseHead {
    let weights = DensePoseWeights {
        shared_conv: vec![conv_layer(ch, ch, k)],
        segmentation_head: Vec::new(),
        uv_head: Vec::new(),
    };

    let mut cfg = DensePoseConfig::new(ch, 1, 2);
    cfg.hidden_channels = vec![ch];
    cfg.kernel_size = k;
    // Same padding: half the kernel size on each side.
    cfg.padding = k / 2;

    DensePoseHead::with_weights(cfg, weights).expect("valid head")
}
/// Criterion benchmark body: times `DensePoseHead::forward` on a single-conv
/// head for each `(channels, spatial, kernel)` case, inside the `native_conv`
/// benchmark group.
fn bench_native_conv(c: &mut Criterion) {
    // (channels, spatial, kernel) — a modest map and a larger one.
    const CASES: [(usize, usize, usize); 2] = [(16, 32, 3), (32, 32, 3)];

    let mut group = c.benchmark_group("native_conv");

    for (ch, hw, k) in CASES {
        let label = format!("ch{ch}_hw{hw}_k{k}");
        let head = single_conv_head(ch, k);

        // Deterministic input of shape (1, ch, hw, hw).
        let pixels = Array4::from_shape_fn((1, ch, hw, hw), |(_, chan, row, col)| {
            ((chan + row + col) as f32 * 0.001).cos()
        });
        let input = Tensor::Float4D(pixels);

        // Throughput is reported in output elements processed.
        let out_elems = (ch * hw * hw) as u64;
        group.throughput(Throughput::Elements(out_elems));

        group.bench_with_input(BenchmarkId::from_parameter(label), &input, |bencher, inp| {
            bencher.iter(|| {
                // `black_box` on both sides keeps the optimizer from eliding the call.
                let out = head.forward(black_box(inp)).expect("forward ok");
                black_box(out);
            });
        });
    }

    group.finish();
}
// Register `bench_native_conv` in the `benches` group and hand that group to
// Criterion's generated benchmark entry point.
criterion_group!(benches, bench_native_conv);
criterion_main!(benches);