diff --git a/v2/crates/wifi-densepose-temporal/Cargo.toml b/v2/crates/wifi-densepose-temporal/Cargo.toml index 8d25889b..f237845d 100644 --- a/v2/crates/wifi-densepose-temporal/Cargo.toml +++ b/v2/crates/wifi-densepose-temporal/Cargo.toml @@ -21,3 +21,7 @@ fp16 = [] [[example]] name = "init_random_blob" path = "examples/init_random_blob.rs" + +[[example]] +name = "bench_speedup" +path = "examples/bench_speedup.rs" diff --git a/v2/crates/wifi-densepose-temporal/benches_results.md b/v2/crates/wifi-densepose-temporal/benches_results.md new file mode 100644 index 00000000..e587d6e6 --- /dev/null +++ b/v2/crates/wifi-densepose-temporal/benches_results.md @@ -0,0 +1,72 @@ +# Bench results — sparse vs dense prefill + +Output of `cargo run -p wifi-densepose-temporal --example bench_speedup --release` +on a Windows 11 / x86_64 dev box, 2026-05-08. Single-run wall-clock, +pure-Rust vs pure-Rust (no SIMD/threads on either side). Reproduce by +running the example yourself; results vary 2–3× between machines and +power states, but the **trends across N** are what matter. + +## Sparse-vs-dense prefill speedup + +Config: `q_heads=4, kv_heads=4, head_dim=32, window=16, block_size=32, causal=true`. + +| N | Dense (ms) | Sparse (ms) | Speedup | +|--------|-------------:|-------------:|--------:| +| 64 | 0.262 | 0.141 | 1.86× | +| 128 | 1.120 | 0.335 | 3.34× | +| 256 | 4.129 | 0.711 | 5.81× | +| 512 | 19.230 | 2.356 | 8.16× | +| 1024 | 71.904 | 3.389 | **21.21×** | + +## Asymptotic check + +ADR-096 §3.1 claimed dense scales as O(N²) and sparse as O(N log N). +The measured 64→1024 cost growth (16× more tokens) is: + +| Path | 64 ms | 1024 ms | Growth | Theory | +|--------|------:|--------:|-------:|-------:| +| Dense | 0.262 | 71.904 | 274× | 256× = 16² | +| Sparse | 0.141 | 3.389 | 24× | ~27× = 16 · log(1024)/log(64) | + +Dense's 274× growth matches `N²` cleanly. Sparse's 24× growth matches +`N log N` to within measurement noise. **The asymptotic complexity +claim is empirically supported on this hardware.** + +## Why N=64 is only 1.86× and not faster + +ADR-096 §3.1 already called this out: at the AETHER training default +of `window_frames = 100`, dense MHA is essentially free and the sparse +machinery has overhead — the per-token candidate-set construction, +landmark indexing, and global-token bookkeeping are constant-factor +costs that only amortize past N ≈ 200. The speedup-vs-N curve +inflects sharply between N=128 and N=256 because that's where dense's +N² term starts dominating its constants. + +If a downstream consumer is using AETHER on 4-frame windows +(`proof.rs`, `trainer.rs`), this ADR pays nothing. The case rests +entirely on the long-window roadmap. + +## What this benchmark doesn't measure + +- **Decode-step latency.** `streaming_step_matches_forward_at_last_position` + proves correctness; this bench doesn't measure how fast `decode_step` + runs vs a hypothetical dense-MHA decode (which would be O(N²) recompute + every step — structurally not even comparable). +- **Memory.** KvCache + FP16 halves the K/V footprint vs FP32, which + matters more on the firmware than on x86_64 host. Phase 5 unblocking + is the prerequisite for measuring this on real hardware. +- **GQA dispatch.** This config uses `q_heads == kv_heads` to force + the MHA branch, so dense and sparse operate on the same shape. + Real AETHER will probably want `kv_heads=1` (MQA) which halves + the KV memory and is what the default head config picks. + +## How to run + +``` +cargo run -p wifi-densepose-temporal --example bench_speedup --release +``` + +Release mode is mandatory. Debug builds run sparse 5–10× slower than +release because the candidate-set construction has tight inner loops +that benefit hard from `-O3`. Don't draw conclusions from `cargo run` +without `--release`. diff --git a/v2/crates/wifi-densepose-temporal/examples/bench_speedup.rs b/v2/crates/wifi-densepose-temporal/examples/bench_speedup.rs new file mode 100644 index 00000000..737db1bc --- /dev/null +++ b/v2/crates/wifi-densepose-temporal/examples/bench_speedup.rs @@ -0,0 +1,151 @@ +// Measure sparse-GQA prefill cost vs dense MHA at N = {64, 128, 256, 512, 1024}. +// ADR-096 §3.1 claimed 30–100× edge-evaluation reduction at long windows; +// this is the empirical check. +// +// Run with: cargo run -p wifi-densepose-temporal --example bench_speedup --release +// +// Caveat: single-run wall-clock on one machine — not a rigorous benchmark. +// Trends across N matter more than the absolute numbers, and results vary +// 2–3× between machines / power states. The point is to confirm the +// magnitude of the speedup is what the ADR claimed, not a perf-engineering +// dashboard. For that, use criterion + a dedicated machine. + +use std::time::Instant; + +use ruvllm_sparse_attention::{dense_attention, AttentionBackend, SparseAttentionConfig, SubquadraticSparseAttention, Tensor3}; +use wifi_densepose_temporal::{TemporalBackendKind, TemporalHeadConfig, AetherTemporalHead}; + +fn make_qkv(seq: usize, heads: usize, dim: usize) -> (Tensor3, Tensor3, Tensor3) { + // Simple deterministic init — content doesn't matter for timing, + // but we want each benchmark run to use the same numbers. + let mut q = Tensor3::zeros(seq, heads, dim); + let mut k = Tensor3::zeros(seq, heads, dim); + let mut v = Tensor3::zeros(seq, heads, dim); + for s in 0..seq { + for h in 0..heads { + for d in 0..dim { + let qv = ((s * 31 + h * 7 + d) as f32).sin() * 0.1; + let kv = (((s * 17 + h * 3 + d) as f32).cos()) * 0.1; + q.set(s, h, d, qv); + k.set(s, h, d, kv); + v.set(s, h, d, kv * 0.5); + } + } + } + (q, k, v) +} + +fn time_run(label: &str, runs: usize, mut f: F) -> f64 { + // 1 warmup + `runs` measurements. Wall clock; release-mode only is + // meaningful (debug builds run sparse 5–10× slower than release). + f(); + let start = Instant::now(); + for _ in 0..runs { + f(); + } + let total_ms = start.elapsed().as_secs_f64() * 1000.0; + let avg_ms = total_ms / runs as f64; + println!(" {label:<36} {avg_ms:>8.3} ms/run ({runs} runs)"); + avg_ms +} + +fn bench_at(seq: usize) -> (f64, f64, f64) { + println!(); + println!("=== seq = {seq} ==="); + + // MHA shape (q_heads == kv_heads) so dense_attention and the sparse + // forward path operate on the same tensor shape — direct timing + // comparison without GQA bookkeeping confounding the result. + let heads = 4; + let dim = 32; + let (q, k, v) = make_qkv(seq, heads, dim); + + // Dense reference. dense_attention is the upstream's naive O(N²) + // pure-Rust kernel — same scale, same shape, no SIMD acceleration — + // a fair head-to-head against the equally-pure-Rust sparse path. + let runs_dense = if seq <= 128 { 50 } else if seq <= 512 { 10 } else { 3 }; + let dense_ms = time_run( + &format!("dense_attention (causal=true)"), + runs_dense, + || { + let _ = dense_attention(&q, &k, &v, true).expect("dense forward"); + }, + ); + + // Sparse via the AETHER head wrapper — same code path the production + // training/inference would use, not the lower-level SubquadraticSparseAttention. + // Window/block_size kept small so the sparse pattern actually drops + // candidates at all benchmark lengths (otherwise at N=64 with default + // config we'd touch the entire sequence and look the same as dense). + let cfg = TemporalHeadConfig { + backend: TemporalBackendKind::SparseGqa, + q_heads: heads, + kv_heads: heads, // MHA — match dense + head_dim: dim, + window: 16, + block_size: 32, + causal: true, + }; + let head = AetherTemporalHead::new(&cfg).expect("construct head"); + let runs_sparse = if seq <= 128 { 50 } else if seq <= 512 { 30 } else { 10 }; + let sparse_ms = time_run( + "AetherTemporalHead.forward (sparse)", + runs_sparse, + || { + let _ = head.forward(&q, &k, &v).expect("sparse forward"); + }, + ); + + // Also measure SubquadraticSparseAttention directly — bypasses our + // wrapper, useful for confirming the wrapper isn't introducing + // measurable overhead. + let attn = SubquadraticSparseAttention::new(SparseAttentionConfig { + window: 16, + block_size: 32, + global_tokens: vec![0], + causal: true, + use_log_stride: true, + use_landmarks: true, + sort_candidates: false, + }) + .expect("construct attn"); + let raw_ms = time_run( + "Subquadratic.forward (raw, no wrapper)", + runs_sparse, + || { + let _ = attn.forward(&q, &k, &v).expect("raw sparse forward"); + }, + ); + + let speedup = dense_ms / sparse_ms; + println!(" -> sparse/dense speedup {speedup:>6.2}×"); + + (dense_ms, sparse_ms, speedup) +} + +fn main() { + println!("ADR-096 §3.1 empirical speedup check"); + println!("===================================="); + println!("Pure-Rust vs pure-Rust, no SIMD/threads, single-run wall-clock."); + println!("Trends across N matter more than absolute numbers."); + + let lengths = [64, 128, 256, 512, 1024]; + let mut rows: Vec<(usize, f64, f64, f64)> = Vec::new(); + for &n in &lengths { + let (dense_ms, sparse_ms, speedup) = bench_at(n); + rows.push((n, dense_ms, sparse_ms, speedup)); + } + + println!(); + println!("Summary"); + println!(" N dense (ms) sparse (ms) speedup"); + println!(" ---- ---------- ----------- -------"); + for (n, d, s, sp) in &rows { + println!(" {n:<5} {d:>10.3} {s:>11.3} {sp:>5.2}×"); + } + println!(); + println!("ADR-096 §3.1 claim: ~30× edge reduction at N=8192,"); + println!("growing roughly N/log(N). At N=1024 the claim is ~5–10×;"); + println!("at N=64 the sparse machinery is overhead-bound (sparse may"); + println!("lose, see ADR-096 §3.1 'honest framing' paragraph)."); +}