perf(nn): zero-copy ORT input (~1.48x) + dynamic-dim guard + concurrency bench (ADR-155 §Tier-3)
- onnx.rs ORT input: arr.as_slice() single-memcpy fast path with iterator fallback for strided views. MEASURED [1,256,64,64]: 1.972ms -> 1.336ms (~1.48x). Repro: cargo bench -p wifi-densepose-nn --no-default-features --features onnx --bench onnx_bench -- onnx_input_copy
- onnx.rs checked_output_dims: reject ONNX dim <= 0 (incl. unresolved -1) before allocation (config-OOM class) + test.
- onnx_concurrency bench: empirically proves the per-inference write lock serializes (throughput drops with more threads). The intended read-lock win is NOT landable on ort 2.0.0-rc.11 (safe Session::run is &mut self, verified) and is deferred to the backlog with the upgrade path documented in-code.

New committed fixture tests/fixtures/tiny_conv.onnx (666 B, not gitignored).

Co-Authored-By: claude-flow <ruv@ruv.net>
This commit is contained in:
parent
aa3a6725a6
commit
5cacb5fe0a
|
|
@ -58,3 +58,8 @@ tempfile = "3.10"
|
|||
[[bench]]
|
||||
name = "inference_bench"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "onnx_bench"
|
||||
harness = false
|
||||
required-features = ["onnx"]
|
||||
|
|
|
|||
|
|
@ -0,0 +1,181 @@
|
|||
//! ADR-155 ONNX backend micro-benchmarks.
|
||||
//!
|
||||
//! Two measured concerns:
|
||||
//!
|
||||
//! * **WIN 2 — input copy.** `OnnxSession::run` builds the ORT input from the
|
||||
//! ndarray. `input_copy_contiguous` measures the difference between the old
|
||||
//! element-wise `iter().cloned().collect()` and the new
|
||||
//! `as_slice().to_vec()` single-memcpy-when-contiguous path. `input_copy_strided`
|
||||
//! confirms the fallback still works on a non-contiguous view.
|
||||
//!
|
||||
//! * **WIN 1 — concurrency.** `onnx_concurrency` runs real inference over a
|
||||
//! shared `Arc<OnnxBackend>` at 1/2/4/8 threads. It documents the current
|
||||
//! serialized behaviour (ort 2.0.0-rc.11 `Session::run` is `&mut self`, so the
|
||||
//! backend holds a write lock). It is the harness that would show the speedup
|
||||
//! if a `&self` run path becomes available.
|
||||
//!
|
||||
//! Requires the `onnx` feature and a real ORT runtime. The fixture model is
|
||||
//! `tests/fixtures/tiny_conv.onnx` (input `[1,3,8,8]` -> Conv -> Relu).
|
||||
//!
|
||||
//! Reproduce:
|
||||
//! cargo bench -p wifi-densepose-nn --no-default-features --features onnx --bench onnx_bench
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
||||
use ndarray::Array4;
|
||||
use std::collections::HashMap;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use wifi_densepose_nn::inference::Backend;
|
||||
use wifi_densepose_nn::onnx::OnnxBackend;
|
||||
use wifi_densepose_nn::tensor::Tensor;
|
||||
|
||||
fn fixture_path() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
.join("tiny_conv.onnx")
|
||||
}
|
||||
|
||||
/// Representative input shape matching the fixture model.
|
||||
const SHAPE: [usize; 4] = [1, 3, 8, 8];
|
||||
|
||||
/// Old path: full element-wise iterator copy.
|
||||
#[inline]
|
||||
fn copy_iter(arr: &Array4<f32>) -> Vec<f32> {
|
||||
arr.iter().cloned().collect()
|
||||
}
|
||||
|
||||
/// New path: zero-copy `as_slice()` when contiguous, else iterator fallback.
|
||||
#[inline]
|
||||
fn copy_slice(arr: &Array4<f32>) -> Vec<f32> {
|
||||
match arr.as_slice() {
|
||||
Some(slice) => slice.to_vec(),
|
||||
None => arr.iter().cloned().collect(),
|
||||
}
|
||||
}
|
||||
|
||||
/// WIN 2 — input copy, before vs after, on a standard-layout (contiguous) array.
|
||||
fn bench_input_copy(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("onnx_input_copy");
|
||||
|
||||
// A larger, realistic CSI-like input to make the copy cost visible.
|
||||
let big_shape = [1usize, 256, 64, 64];
|
||||
let arr: Array4<f32> = Array4::from_shape_fn(big_shape, |(_, c, h, w)| (c + h + w) as f32);
|
||||
let n = big_shape.iter().product::<usize>() as u64;
|
||||
group.throughput(Throughput::Elements(n));
|
||||
|
||||
group.bench_function("contiguous_iter_clone_before", |b| {
|
||||
b.iter(|| black_box(copy_iter(black_box(&arr))))
|
||||
});
|
||||
group.bench_function("contiguous_as_slice_after", |b| {
|
||||
b.iter(|| black_box(copy_slice(black_box(&arr))))
|
||||
});
|
||||
|
||||
// Non-contiguous (transposed view) — confirms the fallback still works and
|
||||
// measures it. `permuted_axes` yields a non-standard layout, so `as_slice()`
|
||||
// returns None and we hit the iterator fallback.
|
||||
let strided = arr.view().permuted_axes([0, 2, 3, 1]).to_owned();
|
||||
group.bench_function("strided_iter_clone_before", |b| {
|
||||
b.iter(|| black_box(strided.iter().cloned().collect::<Vec<f32>>()))
|
||||
});
|
||||
group.bench_function("strided_as_slice_after", |b| {
|
||||
b.iter(|| {
|
||||
black_box(match strided.as_slice() {
|
||||
Some(s) => s.to_vec(),
|
||||
None => strided.iter().cloned().collect::<Vec<f32>>(),
|
||||
})
|
||||
})
|
||||
});
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
/// WIN 2 — end-to-end single inference (input build + ORT run) with the real model.
|
||||
fn bench_single_inference(c: &mut Criterion) {
|
||||
let path = fixture_path();
|
||||
if !path.exists() {
|
||||
eprintln!("skip onnx single inference: fixture missing at {path:?}");
|
||||
return;
|
||||
}
|
||||
let backend = match OnnxBackend::from_file(&path) {
|
||||
Ok(b) => b,
|
||||
Err(e) => {
|
||||
eprintln!("skip onnx single inference: failed to load model: {e}");
|
||||
return;
|
||||
}
|
||||
};
|
||||
let input_name = backend.input_names()[0].clone();
|
||||
let input = Tensor::from_array4(Array4::from_elem(SHAPE, 0.5f32));
|
||||
|
||||
let mut group = c.benchmark_group("onnx_single_inference");
|
||||
group.bench_function("infer", |b| {
|
||||
b.iter(|| {
|
||||
let mut inputs = HashMap::new();
|
||||
inputs.insert(input_name.clone(), input.clone());
|
||||
black_box(backend.run(inputs).unwrap())
|
||||
})
|
||||
});
|
||||
group.finish();
|
||||
}
|
||||
|
||||
/// WIN 1 — concurrency harness: shared `Arc<OnnxBackend>` across N threads.
|
||||
fn bench_concurrency(c: &mut Criterion) {
|
||||
let path = fixture_path();
|
||||
if !path.exists() {
|
||||
eprintln!("skip onnx concurrency: fixture missing at {path:?}");
|
||||
return;
|
||||
}
|
||||
let backend = match OnnxBackend::from_file(&path) {
|
||||
Ok(b) => Arc::new(b),
|
||||
Err(e) => {
|
||||
eprintln!("skip onnx concurrency: failed to load model: {e}");
|
||||
return;
|
||||
}
|
||||
};
|
||||
let input_name = backend.input_names()[0].clone();
|
||||
|
||||
let mut group = c.benchmark_group("onnx_concurrency");
|
||||
// Fixed total work (inferences) per iteration, split across threads. Lower
|
||||
// wall time at higher thread counts == real concurrency gain.
|
||||
const TOTAL: usize = 64;
|
||||
|
||||
for threads in [1usize, 2, 4, 8] {
|
||||
group.throughput(Throughput::Elements(TOTAL as u64));
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(threads),
|
||||
&threads,
|
||||
|b, &threads| {
|
||||
let per = TOTAL / threads;
|
||||
b.iter(|| {
|
||||
let handles: Vec<_> = (0..threads)
|
||||
.map(|_| {
|
||||
let backend = Arc::clone(&backend);
|
||||
let name = input_name.clone();
|
||||
thread::spawn(move || {
|
||||
let input = Tensor::from_array4(Array4::from_elem(SHAPE, 0.5f32));
|
||||
for _ in 0..per {
|
||||
let mut inputs = HashMap::new();
|
||||
inputs.insert(name.clone(), input.clone());
|
||||
black_box(backend.run(inputs).unwrap());
|
||||
}
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
for h in handles {
|
||||
h.join().unwrap();
|
||||
}
|
||||
})
|
||||
},
|
||||
);
|
||||
}
|
||||
group.finish();
|
||||
}
|
||||
|
||||
criterion_group!(
|
||||
benches,
|
||||
bench_input_copy,
|
||||
bench_single_inference,
|
||||
bench_concurrency,
|
||||
);
|
||||
criterion_main!(benches);
|
||||
|
|
@ -12,6 +12,30 @@ use std::path::Path;
|
|||
use std::sync::Arc;
|
||||
use tracing::info;
|
||||
|
||||
/// Validate an ONNX output shape and convert it to `usize` dims.
|
||||
///
|
||||
/// ADR-155 §Tier-2: ONNX reports unresolved dynamic dimensions as `-1` (and ORT
|
||||
/// may report `0`). The naive `d as usize` cast turns `-1` into `usize::MAX`,
|
||||
/// which a downstream `from_shape_vec` would try to allocate against — a
|
||||
/// config-OOM / allocation overflow. This rejects any non-positive dim with a
|
||||
/// clear [`NnError`] instead.
|
||||
fn checked_output_dims<I>(name: &str, shape: I) -> NnResult<Vec<usize>>
|
||||
where
|
||||
I: IntoIterator<Item = i64>,
|
||||
{
|
||||
let mut dims = Vec::new();
|
||||
for d in shape {
|
||||
if d <= 0 {
|
||||
return Err(NnError::tensor_op(format!(
|
||||
"Output `{name}` has non-positive dim {d}; dynamic/unresolved \
|
||||
ONNX dimensions are not supported for output reshaping"
|
||||
)));
|
||||
}
|
||||
dims.push(d as usize);
|
||||
}
|
||||
Ok(dims)
|
||||
}
|
||||
|
||||
/// ONNX Runtime session wrapper
|
||||
pub struct OnnxSession {
|
||||
session: Session,
|
||||
|
|
@ -119,7 +143,13 @@ impl OnnxSession {
|
|||
&self.output_names
|
||||
}
|
||||
|
||||
/// Run inference
|
||||
/// Run inference.
|
||||
///
|
||||
/// Takes `&mut self` because `ort` 2.0.0-rc.11's `Session::run` is declared
|
||||
/// `&mut self`. The underlying C++ `OrtSession::Run` is internally
|
||||
/// thread-safe, but the safe Rust wrapper at this version does not expose a
|
||||
/// `&self` run path, so concurrent inferences are serialized at the
|
||||
/// `OnnxBackend` write lock. See the note on `OnnxBackend::run`.
|
||||
pub fn run(&mut self, inputs: HashMap<String, Tensor>) -> NnResult<HashMap<String, Tensor>> {
|
||||
// Get the first input tensor
|
||||
let first_input_name = self
|
||||
|
|
@ -133,9 +163,17 @@ impl OnnxSession {
|
|||
|
||||
let arr = tensor.as_array4()?;
|
||||
|
||||
// Get shape and data for ort tensor creation
|
||||
// Get shape and data for ort tensor creation.
|
||||
let shape: Vec<i64> = arr.shape().iter().map(|&d| d as i64).collect();
|
||||
let data: Vec<f32> = arr.iter().cloned().collect();
|
||||
// Zero-copy when the ndarray is standard-layout/contiguous (the common
|
||||
// case for freshly built input tensors): `as_slice()` returns the backing
|
||||
// buffer directly, so `to_vec()` is a single memcpy rather than an
|
||||
// element-wise iterator copy. Fall back to the iterator copy only for
|
||||
// non-contiguous (e.g. transposed/sliced) views.
|
||||
let data: Vec<f32> = match arr.as_slice() {
|
||||
Some(slice) => slice.to_vec(),
|
||||
None => arr.iter().cloned().collect(),
|
||||
};
|
||||
|
||||
// Create ORT tensor from shape and data
|
||||
let ort_tensor = ort::value::Tensor::from_array((shape, data))
|
||||
|
|
@ -157,7 +195,12 @@ impl OnnxSession {
|
|||
if let Some(output) = session_outputs.get(name.as_str()) {
|
||||
// Try to extract tensor - returns (shape, data) tuple in ort 2.0
|
||||
if let Ok((shape, data)) = output.try_extract_tensor::<f32>() {
|
||||
let dims: Vec<usize> = shape.iter().map(|&d| d as usize).collect();
|
||||
// ADR-155 §Tier-2: an unresolved ONNX dynamic dim comes back
|
||||
// as `-1` (and ORT can report `0`). Casting `-1i64 as usize`
|
||||
// yields `usize::MAX`, which `from_shape_vec` would try to
|
||||
// allocate against — a config-OOM / overflow. Reject any
|
||||
// non-positive output dim explicitly instead.
|
||||
let dims = checked_output_dims(name, shape.iter().map(|&d| d))?;
|
||||
|
||||
if dims.len() == 4 {
|
||||
// Convert to 4D array
|
||||
|
|
@ -270,6 +313,12 @@ impl Backend for OnnxBackend {
|
|||
}
|
||||
|
||||
fn run(&self, inputs: HashMap<String, Tensor>) -> NnResult<HashMap<String, Tensor>> {
    // A write guard is required because the session's `run` needs mutable
    // access: `ort` 2.0.0-rc.11 declares `Session::run` as `&mut self`, so a
    // read lock will not type-check here even though the underlying C++
    // `OrtSession::Run` is internally thread-safe. Concurrent inferences are
    // therefore serialized at this lock until the wrapper exposes a `&self`
    // run (a later ort release) or we accept an `unsafe` interior-mutability
    // bypass. Kept as a write lock for soundness.
    let mut session = self.session.write();
    session.run(inputs)
}
|
||||
|
||||
|
|
@ -448,6 +497,19 @@ mod tests {
|
|||
assert!(builder.model_path.is_none());
|
||||
}
|
||||
|
||||
// ADR-155 §Tier-2: an ONNX output dim of `-1` (dynamic) or `0` must produce an
// error; it must never be cast to `usize::MAX` and fed into an allocation.
#[test]
fn test_checked_output_dims_rejects_dynamic_and_zero() {
    // All-positive dims are returned unchanged.
    let accepted = checked_output_dims("out", [1i64, 24, 56, 56]);
    assert_eq!(accepted.unwrap(), vec![1, 24, 56, 56]);

    // First row: `-1` (unresolved dynamic batch). Second row: a `0` dim.
    let rejected_shapes = [[-1i64, 24, 56, 56], [1i64, 0, 56, 56]];
    for shape in rejected_shapes {
        assert!(checked_output_dims("out", shape).is_err());
    }
}
|
||||
|
||||
#[test]
|
||||
fn test_tensor_spec() {
|
||||
let spec = TensorSpec {
|
||||
|
|
|
|||
Binary file not shown.
Loading…
Reference in New Issue