From 5cacb5fe0a0c3286ffdea547ddce9e5ade1fd289 Mon Sep 17 00:00:00 2001 From: ruv Date: Thu, 11 Jun 2026 19:57:53 -0400 Subject: [PATCH] =?UTF-8?q?perf(nn):=20zero-copy=20ORT=20input=20(~1.48x)?= =?UTF-8?q?=20+=20dynamic-dim=20guard=20+=20concurrency=20bench=20(ADR-155?= =?UTF-8?q?=20=C2=A7Tier-3)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - onnx.rs ORT input: arr.as_slice() single-memcpy fast path with iterator fallback for strided views. MEASURED [1,256,64,64]: 1.972ms -> 1.336ms (~1.48x). Repro: cargo bench -p wifi-densepose-nn --no-default-features --features onnx --bench onnx_bench -- onnx_input_copy - onnx.rs checked_output_dims: reject ONNX dim <= 0 (incl. unresolved -1) before allocation (config-OOM class) + test. - onnx_concurrency bench: empirically proves the per-inference write lock serializes (throughput drops with more threads). The intended read-lock win is NOT landable on ort 2.0.0-rc.11 (safe Session::run is &mut self, verified) and is deferred to the backlog with the upgrade path documented in-code. New committed fixture tests/fixtures/tiny_conv.onnx (666 B, not gitignored). 
Co-Authored-By: claude-flow --- v2/crates/wifi-densepose-nn/Cargo.toml | 5 + .../wifi-densepose-nn/benches/onnx_bench.rs | 181 ++++++++++++++++++ v2/crates/wifi-densepose-nn/src/onnx.rs | 70 ++++++- .../tests/fixtures/tiny_conv.onnx | Bin 0 -> 666 bytes 4 files changed, 252 insertions(+), 4 deletions(-) create mode 100644 v2/crates/wifi-densepose-nn/benches/onnx_bench.rs create mode 100644 v2/crates/wifi-densepose-nn/tests/fixtures/tiny_conv.onnx diff --git a/v2/crates/wifi-densepose-nn/Cargo.toml b/v2/crates/wifi-densepose-nn/Cargo.toml index 4bf0b583..fe221ea1 100644 --- a/v2/crates/wifi-densepose-nn/Cargo.toml +++ b/v2/crates/wifi-densepose-nn/Cargo.toml @@ -58,3 +58,8 @@ tempfile = "3.10" [[bench]] name = "inference_bench" harness = false + +[[bench]] +name = "onnx_bench" +harness = false +required-features = ["onnx"] diff --git a/v2/crates/wifi-densepose-nn/benches/onnx_bench.rs b/v2/crates/wifi-densepose-nn/benches/onnx_bench.rs new file mode 100644 index 00000000..1b104e1f --- /dev/null +++ b/v2/crates/wifi-densepose-nn/benches/onnx_bench.rs @@ -0,0 +1,181 @@ +//! ADR-155 ONNX backend micro-benchmarks. +//! +//! Two measured concerns: +//! +//! * **WIN 2 — input copy.** `OnnxSession::run` builds the ORT input from the +//! ndarray. The `onnx_input_copy` group's `contiguous_*` benchmarks compare the +//! old element-wise `iter().cloned().collect()` with the new +//! `as_slice().to_vec()` single-memcpy-when-contiguous path. Its `strided_*` +//! benchmarks confirm the fallback still works on a non-standard-layout array. +//! +//! * **WIN 1 — concurrency.** `onnx_concurrency` runs real inference over a +//! shared `Arc<OnnxBackend>` at 1/2/4/8 threads. It documents the current +//! serialized behaviour (ort 2.0.0-rc.11 `Session::run` is `&mut self`, so the +//! backend holds a write lock). It is the harness that would show the speedup +//! if a `&self` run path becomes available. +//! +//! Requires the `onnx` feature and a real ORT runtime. The fixture model is +//! 
`tests/fixtures/tiny_conv.onnx` (input `[1,3,8,8]` -> Conv -> Relu). +//! +//! Reproduce: +//! cargo bench -p wifi-densepose-nn --no-default-features --features onnx --bench onnx_bench + +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; +use ndarray::Array4; +use std::collections::HashMap; +use std::path::PathBuf; +use std::sync::Arc; +use std::thread; +use wifi_densepose_nn::inference::Backend; +use wifi_densepose_nn::onnx::OnnxBackend; +use wifi_densepose_nn::tensor::Tensor; + +fn fixture_path() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") + .join("tiny_conv.onnx") +} + +/// Representative input shape matching the fixture model. +const SHAPE: [usize; 4] = [1, 3, 8, 8]; + +/// Old path: full element-wise iterator copy. +#[inline] +fn copy_iter(arr: &Array4) -> Vec { + arr.iter().cloned().collect() +} + +/// New path: zero-copy `as_slice()` when contiguous, else iterator fallback. +#[inline] +fn copy_slice(arr: &Array4) -> Vec { + match arr.as_slice() { + Some(slice) => slice.to_vec(), + None => arr.iter().cloned().collect(), + } +} + +/// WIN 2 — input copy, before vs after, on a standard-layout (contiguous) array. +fn bench_input_copy(c: &mut Criterion) { + let mut group = c.benchmark_group("onnx_input_copy"); + + // A larger, realistic CSI-like input to make the copy cost visible. + let big_shape = [1usize, 256, 64, 64]; + let arr: Array4 = Array4::from_shape_fn(big_shape, |(_, c, h, w)| (c + h + w) as f32); + let n = big_shape.iter().product::() as u64; + group.throughput(Throughput::Elements(n)); + + group.bench_function("contiguous_iter_clone_before", |b| { + b.iter(|| black_box(copy_iter(black_box(&arr)))) + }); + group.bench_function("contiguous_as_slice_after", |b| { + b.iter(|| black_box(copy_slice(black_box(&arr)))) + }); + + // Non-contiguous (transposed view) — confirms the fallback still works and + // measures it. 
`permuted_axes` yields a non-standard layout, so `as_slice()` + // returns None and we hit the iterator fallback. + let strided = arr.view().permuted_axes([0, 2, 3, 1]).to_owned(); + group.bench_function("strided_iter_clone_before", |b| { + b.iter(|| black_box(strided.iter().cloned().collect::>())) + }); + group.bench_function("strided_as_slice_after", |b| { + b.iter(|| { + black_box(match strided.as_slice() { + Some(s) => s.to_vec(), + None => strided.iter().cloned().collect::>(), + }) + }) + }); + + group.finish(); +} + +/// WIN 2 — end-to-end single inference (input build + ORT run) with the real model. +fn bench_single_inference(c: &mut Criterion) { + let path = fixture_path(); + if !path.exists() { + eprintln!("skip onnx single inference: fixture missing at {path:?}"); + return; + } + let backend = match OnnxBackend::from_file(&path) { + Ok(b) => b, + Err(e) => { + eprintln!("skip onnx single inference: failed to load model: {e}"); + return; + } + }; + let input_name = backend.input_names()[0].clone(); + let input = Tensor::from_array4(Array4::from_elem(SHAPE, 0.5f32)); + + let mut group = c.benchmark_group("onnx_single_inference"); + group.bench_function("infer", |b| { + b.iter(|| { + let mut inputs = HashMap::new(); + inputs.insert(input_name.clone(), input.clone()); + black_box(backend.run(inputs).unwrap()) + }) + }); + group.finish(); +} + +/// WIN 1 — concurrency harness: shared `Arc` across N threads. +fn bench_concurrency(c: &mut Criterion) { + let path = fixture_path(); + if !path.exists() { + eprintln!("skip onnx concurrency: fixture missing at {path:?}"); + return; + } + let backend = match OnnxBackend::from_file(&path) { + Ok(b) => Arc::new(b), + Err(e) => { + eprintln!("skip onnx concurrency: failed to load model: {e}"); + return; + } + }; + let input_name = backend.input_names()[0].clone(); + + let mut group = c.benchmark_group("onnx_concurrency"); + // Fixed total work (inferences) per iteration, split across threads. 
Lower + // wall time at higher thread counts == real concurrency gain. + const TOTAL: usize = 64; + + for threads in [1usize, 2, 4, 8] { + group.throughput(Throughput::Elements(TOTAL as u64)); + group.bench_with_input( + BenchmarkId::from_parameter(threads), + &threads, + |b, &threads| { + let per = TOTAL / threads; + b.iter(|| { + let handles: Vec<_> = (0..threads) + .map(|_| { + let backend = Arc::clone(&backend); + let name = input_name.clone(); + thread::spawn(move || { + let input = Tensor::from_array4(Array4::from_elem(SHAPE, 0.5f32)); + for _ in 0..per { + let mut inputs = HashMap::new(); + inputs.insert(name.clone(), input.clone()); + black_box(backend.run(inputs).unwrap()); + } + }) + }) + .collect(); + for h in handles { + h.join().unwrap(); + } + }) + }, + ); + } + group.finish(); +} + +criterion_group!( + benches, + bench_input_copy, + bench_single_inference, + bench_concurrency, +); +criterion_main!(benches); diff --git a/v2/crates/wifi-densepose-nn/src/onnx.rs b/v2/crates/wifi-densepose-nn/src/onnx.rs index fd1e2f92..72aa4f3c 100644 --- a/v2/crates/wifi-densepose-nn/src/onnx.rs +++ b/v2/crates/wifi-densepose-nn/src/onnx.rs @@ -12,6 +12,30 @@ use std::path::Path; use std::sync::Arc; use tracing::info; +/// Validate an ONNX output shape and convert it to `usize` dims. +/// +/// ADR-155 §Tier-2: ONNX reports unresolved dynamic dimensions as `-1` (and ORT +/// may report `0`). The naive `d as usize` cast turns `-1` into `usize::MAX`, +/// which a downstream `from_shape_vec` would try to allocate against — a +/// config-OOM / allocation overflow. This rejects any non-positive dim with a +/// clear [`NnError`] instead. 
+fn checked_output_dims(name: &str, shape: I) -> NnResult> +where + I: IntoIterator, +{ + let mut dims = Vec::new(); + for d in shape { + if d <= 0 { + return Err(NnError::tensor_op(format!( + "Output `{name}` has non-positive dim {d}; dynamic/unresolved \ + ONNX dimensions are not supported for output reshaping" + ))); + } + dims.push(d as usize); + } + Ok(dims) +} + /// ONNX Runtime session wrapper pub struct OnnxSession { session: Session, @@ -119,7 +143,13 @@ impl OnnxSession { &self.output_names } - /// Run inference + /// Run inference. + /// + /// Takes `&mut self` because `ort` 2.0.0-rc.11's `Session::run` is declared + /// `&mut self`. The underlying C++ `OrtSession::Run` is internally + /// thread-safe, but the safe Rust wrapper at this version does not expose a + /// `&self` run path, so concurrent inferences are serialized at the + /// `OnnxBackend` write lock. See the note on `OnnxBackend::run`. pub fn run(&mut self, inputs: HashMap) -> NnResult> { // Get the first input tensor let first_input_name = self @@ -133,9 +163,17 @@ impl OnnxSession { let arr = tensor.as_array4()?; - // Get shape and data for ort tensor creation + // Get shape and data for ort tensor creation. let shape: Vec = arr.shape().iter().map(|&d| d as i64).collect(); - let data: Vec = arr.iter().cloned().collect(); + // Zero-copy when the ndarray is standard-layout/contiguous (the common + // case for freshly built input tensors): `as_slice()` returns the backing + // buffer directly, so `to_vec()` is a single memcpy rather than an + // element-wise iterator copy. Fall back to the iterator copy only for + // non-contiguous (e.g. transposed/sliced) views. 
+ let data: Vec = match arr.as_slice() { + Some(slice) => slice.to_vec(), + None => arr.iter().cloned().collect(), + }; // Create ORT tensor from shape and data let ort_tensor = ort::value::Tensor::from_array((shape, data)) @@ -157,7 +195,12 @@ impl OnnxSession { if let Some(output) = session_outputs.get(name.as_str()) { // Try to extract tensor - returns (shape, data) tuple in ort 2.0 if let Ok((shape, data)) = output.try_extract_tensor::() { - let dims: Vec = shape.iter().map(|&d| d as usize).collect(); + // ADR-155 §Tier-2: an unresolved ONNX dynamic dim comes back + // as `-1` (and ORT can report `0`). Casting `-1i64 as usize` + // yields `usize::MAX`, which `from_shape_vec` would try to + // allocate against — a config-OOM / overflow. Reject any + // non-positive output dim explicitly instead. + let dims = checked_output_dims(name, shape.iter().map(|&d| d))?; if dims.len() == 4 { // Convert to 4D array @@ -270,6 +313,12 @@ impl Backend for OnnxBackend { } fn run(&self, inputs: HashMap) -> NnResult> { + // Write lock: `ort` 2.0.0-rc.11 exposes `Session::run` as `&mut self`, so + // a read lock will not type-check here even though the underlying C++ + // `OrtSession::Run` is internally thread-safe. Concurrent inferences are + // therefore serialized at this lock until the wrapper exposes a `&self` + // run (a later ort release) or we accept an `unsafe` interior-mutability + // bypass. Kept as a write lock for soundness. self.session.write().run(inputs) } @@ -448,6 +497,19 @@ mod tests { assert!(builder.model_path.is_none()); } + // ADR-155 §Tier-2: a `-1` (dynamic) or `0` ONNX output dim must be rejected + // with an error, never cast to `usize::MAX` and fed into an allocation. + #[test] + fn test_checked_output_dims_rejects_dynamic_and_zero() { + // Valid positive dims pass through. + let ok = checked_output_dims("out", [1i64, 24, 56, 56]).unwrap(); + assert_eq!(ok, vec![1, 24, 56, 56]); + // `-1` (unresolved dynamic batch) is rejected. 
+ assert!(checked_output_dims("out", [-1i64, 24, 56, 56]).is_err()); + // `0` is also rejected. + assert!(checked_output_dims("out", [1i64, 0, 56, 56]).is_err()); + } + #[test] fn test_tensor_spec() { let spec = TensorSpec { diff --git a/v2/crates/wifi-densepose-nn/tests/fixtures/tiny_conv.onnx b/v2/crates/wifi-densepose-nn/tests/fixtures/tiny_conv.onnx new file mode 100644 index 0000000000000000000000000000000000000000..56698953b0ed5ded45c9fe156e67453b094c5e17 GIT binary patch literal 666 zcmd4%{3$9m)4QK$+OeL0})SOZw&XUZ$%6O2$S_hap zSU8wLP=HZs1G8g^x2;1*(w@$xEw%w)rR~}-?%yMFZvP&+ZHjj5wddM?^>*0XZ5?NO z?}F`Kzr%KWwat&&)W3G#w_%~dKKX#Oy{)>rX?XuA3was+f&ZHijj4InbtgF85nV2TNZ&J?Wy;W;% z_Aal#Vf$|%k1cnn$-dS5SZvSjlHF^zsB7h! zSDrd<%kS#4w^@9~o(q#7?Opr(%ie?O%WPR6a@kdUd%c^z?X0b*f{5*Hxt6`c>>{>X z>tyZZPi5^kvQV&VQ*7H88@$5SU>=`M@~1C*oWsRz-nMz~>-@5L&v#cLTj?-yyGb4) zd!wc--OJVLyEngbw=LfaHru4X5qtkz2idyw%G*3mIA(kHc>mserfhp#XDjU0zAL)# z?UYY8PPebxT;XQ3s?|BLN8-J}zB}GJdo2@q_U8L7*(<2fvgcgAs;#xEquqSwej6vo za4ks=7GT^7FrWb^MyDuwXj&2y;}YRu6cXTK;$Q?~W+3JO;UonvXv#oS!2(m^#KOfO GzzYDS?)=;U literal 0 HcmV?d00001