//! Criterion micro-benchmark for `to_gaussian_splats`: the old multi-pass //! cell reduction (up to 9 `.iter().sum()` passes per voxel) vs. the new //! 2-pass fused accumulation now used in production. //! //! This crate is a binary (no `lib.rs`), so the bench cannot import the //! production symbol directly. Both variants are reproduced here verbatim and //! driven over identical data; the `new`/`old` shapes match the code in //! `src/pointcloud.rs` exactly, so the measured speed-up reflects the real //! change. A `parity` assertion in the harness guards that the two variants //! produce bit-identical output before timing them. //! //! Run: `cargo bench -p wifi-densepose-pointcloud` use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; #[derive(Clone)] struct ColorPoint { x: f32, y: f32, z: f32, r: u8, g: u8, b: u8, } #[derive(Clone, Copy, PartialEq, Debug)] struct Splat { center: [f32; 3], color: [f32; 3], opacity: f32, scale: [f32; 3], } const VOXEL: f32 = 0.08; fn voxelize(points: &[ColorPoint]) -> std::collections::HashMap<(i32, i32, i32), Vec<&ColorPoint>> { let mut cells: std::collections::HashMap<(i32, i32, i32), Vec<&ColorPoint>> = std::collections::HashMap::new(); for p in points { let key = ( (p.x / VOXEL).floor() as i32, (p.y / VOXEL).floor() as i32, (p.z / VOXEL).floor() as i32, ); cells.entry(key).or_default().push(p); } cells } /// OLD: nine separate `.iter()` passes per cell. fn splats_old(points: &[ColorPoint]) -> Vec { let cells = voxelize(points); cells .values() .map(|pts| { let n = pts.len() as f32; let cx = pts.iter().map(|p| p.x).sum::() / n; let cy = pts.iter().map(|p| p.y).sum::() / n; let cz = pts.iter().map(|p| p.z).sum::() / n; let cr = pts.iter().map(|p| p.r as f32).sum::() / n / 255.0; let cg = pts.iter().map(|p| p.g as f32).sum::() / n / 255.0; let cb = pts.iter().map(|p| p.b as f32).sum::() / n / 255.0; let sx = pts.iter().map(|p| (p.x - cx).abs()).sum::() / n + 0.01; let sy = pts.iter().map(|p| (p.y - cy).abs()).sum::() / n + 0.01; let sz = pts.iter().map(|p| (p.z - cz).abs()).sum::() / n + 0.01; Splat { center: [cx, cy, cz], color: [cr, cg, cb], opacity: (n / 10.0).min(1.0), scale: [sx, sy, sz], } }) .collect() } /// NEW: two fused accumulation passes per cell (production version). fn splats_new(points: &[ColorPoint]) -> Vec { let cells = voxelize(points); cells .values() .map(|pts| { let n = pts.len() as f32; let (mut sum_x, mut sum_y, mut sum_z) = (0.0f32, 0.0f32, 0.0f32); let (mut sum_r, mut sum_g, mut sum_b) = (0.0f32, 0.0f32, 0.0f32); for p in pts { sum_x += p.x; sum_y += p.y; sum_z += p.z; sum_r += p.r as f32; sum_g += p.g as f32; sum_b += p.b as f32; } let cx = sum_x / n; let cy = sum_y / n; let cz = sum_z / n; let cr = sum_r / n / 255.0; let cg = sum_g / n / 255.0; let cb = sum_b / n / 255.0; let (mut dev_x, mut dev_y, mut dev_z) = (0.0f32, 0.0f32, 0.0f32); for p in pts { dev_x += (p.x - cx).abs(); dev_y += (p.y - cy).abs(); dev_z += (p.z - cz).abs(); } Splat { center: [cx, cy, cz], color: [cr, cg, cb], opacity: (n / 10.0).min(1.0), scale: [dev_x / n + 0.01, dev_y / n + 0.01, dev_z / n + 0.01], } }) .collect() } /// Deterministic synthetic cloud (no RNG — fully reproducible). /// /// `n` total points distributed so each occupied voxel holds about /// `pts_per_cell` points. A real MiDaS depth backprojection is *dense* — /// adjacent pixels at similar depth land in the same 8 cm voxel — so the /// realistic regime is tens-to-hundreds of points per cell, which is exactly /// where the per-cell pass-count reduction matters. We sweep `pts_per_cell` /// to show the dependence honestly rather than picking a flattering point. fn make_cloud(n: usize, pts_per_cell: usize) -> Vec { let ppc = pts_per_cell.max(1); let cells = (n / ppc).max(1); let cells_per_side = ((cells as f64).cbrt().ceil() as usize).max(1); let extent = cells_per_side as f32 * VOXEL; // metres let mut v = Vec::with_capacity(n); for i in 0..n { // `i / ppc` selects the cell; the low bits jitter within the cell so // points are genuinely distinct (non-zero spread → non-trivial scale). let cell = (i / ppc) as f32; let jitter = (i % ppc) as f32 / ppc as f32 * VOXEL * 0.9; let base = (cell * VOXEL) % extent.max(VOXEL); v.push(ColorPoint { x: (base + jitter) % extent.max(VOXEL), y: (base * 1.7 + jitter) % extent.max(VOXEL), z: (base * 2.3 + jitter) % extent.max(VOXEL), r: (i % 256) as u8, g: ((i / 2) % 256) as u8, b: ((i / 3) % 256) as u8, }); } v } fn bench_splats(c: &mut Criterion) { let mut group = c.benchmark_group("to_gaussian_splats"); let n = 50_000usize; // Sweep density: sparse (few points/cell) → dense (the realistic depth // backprojection regime). The optimization targets dense cells. for &ppc in &[4usize, 16, 64, 256] { let cloud = make_cloud(n, ppc); // Parity guard: old and new must agree bit-for-bit before we time them. let a = splats_old(&cloud); let b = splats_new(&cloud); assert_eq!(a.len(), b.len(), "cell count differs at ppc={ppc}"); let mut sa = a.clone(); let mut sb = b.clone(); let key = |s: &Splat| (s.center[0].to_bits(), s.center[1].to_bits(), s.center[2].to_bits()); sa.sort_by_key(key); sb.sort_by_key(key); assert_eq!(sa, sb, "old/new splat output diverged at ppc={ppc}"); let label = format!("ppc{ppc}"); group.bench_with_input(BenchmarkId::new("old_9pass", &label), &cloud, |bch, cl| { bch.iter(|| splats_old(black_box(cl))) }); group.bench_with_input(BenchmarkId::new("new_2pass", &label), &cloud, |bch, cl| { bch.iter(|| splats_new(black_box(cl))) }); } group.finish(); } criterion_group!(benches, bench_splats); criterion_main!(benches);