From a92b043143c4c6674e98e059b6a96e2333a7c051 Mon Sep 17 00:00:00 2001 From: ruv Date: Thu, 11 Jun 2026 20:23:27 -0400 Subject: [PATCH] =?UTF-8?q?perf(ruvector):=20eliminate=20fuse()=20double-c?= =?UTF-8?q?lone=20(~2.17x=20marshalling)=20+=20bench=20(ADR-156=20=C2=A72.?= =?UTF-8?q?4,=20=C2=A74)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MultistaticArray::fuse / fuse_ungated cloned every viewpoint embedding twice per fusion (once into `extracted`, again when building the attention input). Now the embeddings are MOVED out of `extracted` (one clone per viewpoint instead of two), capturing geometry/ids by Copy in the same pass. Correctness-neutral — all 100 viewpoint/mat lib tests pass unchanged. MEASURED (new benches/fusion_bench.rs, embedding_extract A/B, 8 vp x 128-d): before_double_clone 1.0029 us -> after_single_clone 461.6 ns (~2.17x) End-to-end fusion_pipeline (8 vp): 202 us — marshalling is <1% of fusion (n*n attention dominates), so end-to-end win is modest; the A/B isolates the clone elimination. 
Reproduce: cargo bench -p wifi-densepose-ruvector --bench fusion_bench Co-Authored-By: claude-flow --- v2/crates/wifi-densepose-ruvector/Cargo.toml | 4 + .../benches/fusion_bench.rs | 148 ++++++++++++++++++ .../src/viewpoint/fusion.rs | 48 +++--- 3 files changed, 179 insertions(+), 21 deletions(-) create mode 100644 v2/crates/wifi-densepose-ruvector/benches/fusion_bench.rs diff --git a/v2/crates/wifi-densepose-ruvector/Cargo.toml b/v2/crates/wifi-densepose-ruvector/Cargo.toml index 032009d2..34ba107a 100644 --- a/v2/crates/wifi-densepose-ruvector/Cargo.toml +++ b/v2/crates/wifi-densepose-ruvector/Cargo.toml @@ -43,3 +43,7 @@ required-features = ["crv"] [[bench]] name = "sketch_bench" harness = false + +[[bench]] +name = "fusion_bench" +harness = false diff --git a/v2/crates/wifi-densepose-ruvector/benches/fusion_bench.rs b/v2/crates/wifi-densepose-ruvector/benches/fusion_bench.rs new file mode 100644 index 00000000..de76807d --- /dev/null +++ b/v2/crates/wifi-densepose-ruvector/benches/fusion_bench.rs @@ -0,0 +1,148 @@ +//! ADR-156 §finding 4/5 — cross-viewpoint fusion hot-path benchmark. +//! +//! Two groups: +//! +//! 1. **`fusion_pipeline`** — end-to-end `MultistaticArray::fuse_ungated()` at realistic +//! array sizes (2–8 viewpoints) and the AETHER embedding dimension (128). +//! Same marshalling/attention steps as `fuse()`, the production path run once per TDM cycle. +//! +//! 2. **`embedding_extract`** — an isolated A/B of the embedding-marshalling step +//! that finding 4 fixed: the OLD code cloned every viewpoint embedding +//! *twice* (once into `extracted`, once into `embeddings`); the NEW code +//! clones once (out of the borrowed `viewpoints`) and then *moves* into the +//! attention input. The `before_double_clone` / `after_single_clone` benches +//! measure exactly that difference so the perf claim is MEASURED, not asserted. +//! +//! Run with: +//! ```bash +//! cargo bench -p wifi-densepose-ruvector --bench fusion_bench +//! 
``` + +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; +use std::hint; +use wifi_densepose_ruvector::viewpoint::attention::ViewpointGeometry; +use wifi_densepose_ruvector::viewpoint::{FusionConfig, MultistaticArray, ViewpointEmbedding}; + +/// Deterministic pseudo-random embedding (LCG — no `rand` dev-dep needed). +fn make_embedding(dim: usize, seed: u32) -> Vec<f32> { + let mut state = seed.wrapping_mul(2654435761).wrapping_add(1); + (0..dim) + .map(|_| { + state = state.wrapping_mul(1664525).wrapping_add(1013904223); + (state >> 8) as f32 / (1u32 << 24) as f32 - 0.5 + }) + .collect() +} + +/// Build a coherent array of `n` viewpoints with `dim`-d embeddings, gate open. +fn make_array(n: usize, dim: usize) -> MultistaticArray { + let config = FusionConfig { + embed_dim: dim, + coherence_threshold: 0.5, + coherence_hysteresis: 0.0, + min_snr_db: 0.0, + ..FusionConfig::default() + }; + let mut array = MultistaticArray::new(1, config); + for _ in 0..60 { + array.push_phase_diff(0.1); // coherent → gate opens + } + for i in 0..n { + let angle = 2.0 * std::f32::consts::PI * i as f32 / n as f32; + let r = 3.0; + array + .submit_viewpoint(ViewpointEmbedding { + node_id: i as u32, + embedding: make_embedding(dim, i as u32 + 1), + azimuth: angle, + elevation: 0.0, + baseline: r, + position: (r * angle.cos(), r * angle.sin()), + snr_db: 15.0, + }) + .unwrap(); + } + array +} + +fn bench_fusion_pipeline(c: &mut Criterion) { + let dim = 128; // AETHER embedding dimension (ADR-024) + let mut group = c.benchmark_group("fusion_pipeline"); + for n in [2usize, 4, 8] { + group.bench_with_input(BenchmarkId::from_parameter(n), &n, |b, &n| { + let mut array = make_array(n, dim); + b.iter(|| { + let fused = array.fuse_ungated().unwrap(); + hint::black_box(&fused); + }); + }); + } + group.finish(); +} + +// --- Finding 4 A/B: double-clone vs single-move embedding marshalling --------- + +/// OLD behaviour: clone every embedding into `extracted`, then 
clone AGAIN into +/// the attention input vector (two heap allocations + two memcpys per viewpoint). +fn extract_double_clone(viewpoints: &[ViewpointEmbedding]) -> Vec<Vec<f32>> { + type Ext = (u32, Vec<f32>, f32, (f32, f32)); + let extracted: Vec<Ext> = viewpoints + .iter() + .map(|v| (v.node_id, v.embedding.clone(), v.azimuth, v.position)) + .collect(); + // Second clone (the bug). + let embeddings: Vec<Vec<f32>> = extracted.iter().map(|(_, e, _, _)| e.clone()).collect(); + let _geom: Vec<ViewpointGeometry> = extracted + .iter() + .map(|(_, _, az, pos)| ViewpointGeometry { + azimuth: *az, + position: *pos, + }) + .collect(); + embeddings +} + +/// NEW behaviour: clone once into `extracted`, then MOVE into the attention +/// input (one heap allocation + one memcpy per viewpoint). +fn extract_single_clone(viewpoints: &[ViewpointEmbedding]) -> Vec<Vec<f32>> { + type Ext = (u32, Vec<f32>, f32, (f32, f32)); + let extracted: Vec<Ext> = viewpoints + .iter() + .map(|v| (v.node_id, v.embedding.clone(), v.azimuth, v.position)) + .collect(); + let mut embeddings: Vec<Vec<f32>> = Vec::with_capacity(extracted.len()); + let mut _geom: Vec<ViewpointGeometry> = Vec::with_capacity(extracted.len()); + for (_, emb, az, pos) in extracted { + _geom.push(ViewpointGeometry { azimuth: az, position: pos }); + embeddings.push(emb); // move + } + embeddings +} + +fn bench_embedding_extract(c: &mut Criterion) { + let dim = 128; + let n = 8; // max realistic multistatic array + let viewpoints: Vec<ViewpointEmbedding> = (0..n) + .map(|i| ViewpointEmbedding { + node_id: i as u32, + embedding: make_embedding(dim, i as u32 + 1), + azimuth: 0.0, + elevation: 0.0, + baseline: 3.0, + position: (0.0, 0.0), + snr_db: 15.0, + }) + .collect(); + + let mut group = c.benchmark_group("embedding_extract"); + group.bench_function("before_double_clone", |b| { + b.iter(|| black_box(extract_double_clone(black_box(&viewpoints)))); + }); + group.bench_function("after_single_clone", |b| { + b.iter(|| black_box(extract_single_clone(black_box(&viewpoints)))); + }); + group.finish(); +} + +criterion_group!(benches, 
bench_fusion_pipeline, bench_embedding_extract); +criterion_main!(benches); diff --git a/v2/crates/wifi-densepose-ruvector/src/viewpoint/fusion.rs b/v2/crates/wifi-densepose-ruvector/src/viewpoint/fusion.rs index b59ccdcd..87eacf59 100644 --- a/v2/crates/wifi-densepose-ruvector/src/viewpoint/fusion.rs +++ b/v2/crates/wifi-densepose-ruvector/src/viewpoint/fusion.rs @@ -359,6 +359,10 @@ impl MultistaticArray { self.cycle_count += 1; // Extract all needed data from viewpoints upfront to avoid borrow conflicts. + // Embeddings are cloned exactly once (out of `self.viewpoints`, which we + // borrow immutably); metadata is Copy. The previous implementation cloned + // each embedding a SECOND time when building `embeddings` from `extracted` + // — eliminated here (ADR-156 §finding 4). let min_snr = self.config.min_snr_db; let total_viewpoints = self.viewpoints.len(); let extracted: Vec = self @@ -394,22 +398,23 @@ impl MultistaticArray { }); } - // Prepare embeddings and geometries from extracted data. - let embeddings: Vec> = extracted.iter().map(|(_, e, _, _)| e.clone()).collect(); - let geom: Vec = extracted - .iter() - .map(|(_, _, az, pos)| ViewpointGeometry { - azimuth: *az, - position: *pos, - }) - .collect(); + // Move the cloned embeddings out of `extracted` (no second clone) while + // capturing geometry/ids by Copy. `extracted` is consumed here. + let mut embeddings: Vec> = Vec::with_capacity(n_valid); + let mut geom: Vec = Vec::with_capacity(n_valid); + let mut azimuths: Vec = Vec::with_capacity(n_valid); + let mut ids: Vec = Vec::with_capacity(n_valid); + for (id, emb, az, pos) in extracted { + geom.push(ViewpointGeometry { azimuth: az, position: pos }); + azimuths.push(az); + ids.push(id); + embeddings.push(emb); // move, not clone + } // Run cross-viewpoint attention fusion. let fused_emb = self.attention.fuse(&embeddings, &geom)?; // Compute GDI. 
- let azimuths: Vec = extracted.iter().map(|(_, _, az, _)| *az).collect(); - let ids: Vec = extracted.iter().map(|(id, _, _, _)| *id).collect(); let gdi_opt = GeometricDiversityIndex::compute(&azimuths, &ids); let (gdi_val, n_eff) = match &gdi_opt { Some(g) => (g.value, g.n_effective), @@ -456,19 +461,20 @@ impl MultistaticArray { }); } - let embeddings: Vec> = extracted.iter().map(|(_, e, _, _)| e.clone()).collect(); - let geom: Vec = extracted - .iter() - .map(|(_, _, az, pos)| ViewpointGeometry { - azimuth: *az, - position: *pos, - }) - .collect(); + // Move embeddings out of `extracted` (no second clone — ADR-156 §finding 4). + let mut embeddings: Vec> = Vec::with_capacity(n_valid); + let mut geom: Vec = Vec::with_capacity(n_valid); + let mut azimuths: Vec = Vec::with_capacity(n_valid); + let mut ids: Vec = Vec::with_capacity(n_valid); + for (id, emb, az, pos) in extracted { + geom.push(ViewpointGeometry { azimuth: az, position: pos }); + azimuths.push(az); + ids.push(id); + embeddings.push(emb); + } let fused_emb = self.attention.fuse(&embeddings, &geom)?; - let azimuths: Vec = extracted.iter().map(|(_, _, az, _)| *az).collect(); - let ids: Vec = extracted.iter().map(|(id, _, _, _)| *id).collect(); let gdi_opt = GeometricDiversityIndex::compute(&azimuths, &ids); let (gdi_val, n_eff) = match &gdi_opt { Some(g) => (g.value, g.n_effective),