wifi-densepose/vendor/sublinear-time-solver/crates/neural-network-implementation/benches/latency_benchmark.rs

464 lines
18 KiB
Rust

//! Latency benchmark for System A and System B
//!
//! This benchmark measures end-to-end prediction latency with high precision,
//! targeting the critical P99.9 < 0.9ms performance requirement.
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
use std::time::{Duration, Instant};
use temporal_neural_net::prelude::*;
use nalgebra::{DMatrix, DVector};
use std::collections::HashMap;
/// How many forward passes each system performs before any sample is recorded.
const WARMUP_ITERATIONS: usize = 10_000;
/// How many latency samples are collected per system for the distribution.
const MEASUREMENT_SAMPLES: usize = 100_000;
/// Row count of every generated input matrix.
const SEQUENCE_LENGTH: usize = 64;
/// Column count of every generated input matrix.
const FEATURE_DIM: usize = 4;
/// Output width; not referenced by the code visible in this file.
const OUTPUT_DIM: usize = 2;
/// Record of one timed prediction attempt against a single system.
///
/// Instances are produced by `measure_system_a_latency` and
/// `measure_system_b_latency`, one per system per sample.
#[derive(Debug, Clone)]
struct LatencyMeasurement {
/// Label of the measured system; the measure methods set "SystemA" or "SystemB".
system_type: String,
/// Index of the sample, as passed in by the caller of the measure method.
sample_id: usize,
/// Wall-clock time for the whole attempt (all phases), in nanoseconds.
latency_ns: u64,
/// Per-phase timings recorded during the same attempt.
phase_breakdown: PhaseBreakdown,
/// `true` when the system's `forward` call returned `Ok`.
success: bool,
/// Stringified error from `forward` when it failed, otherwise `None`.
error: Option<String>,
}
/// Breakdown of latency by processing phase, all values in nanoseconds.
///
/// `Default` yields all zeros; phases a system does not run stay at zero.
#[derive(Debug, Clone, Default)]
struct PhaseBreakdown {
/// Data ingestion time (the measure methods time a clone of the input matrix)
ingest_ns: u64,
/// Kalman prior computation (System B only; currently a simulated delay)
prior_ns: u64,
/// Neural network forward pass (the system's `forward` call)
network_ns: u64,
/// Solver gate verification (System B only; currently a simulated delay)
gate_ns: u64,
/// Output finalization (turning the `forward` result into success/error fields)
finalization_ns: u64,
}
/// Statistical summary of latency measurements.
///
/// Produced by `calculate_statistics` from the successful measurements only.
/// Every field is zero when no measurement succeeded.
#[derive(Debug, Clone)]
struct LatencyStatistics {
/// Number of successful measurements that went into the summary.
count: usize,
/// Arithmetic mean of the latencies, in nanoseconds.
mean_ns: f64,
/// Standard deviation of the latencies (variance divided by `count`), in nanoseconds.
std_dev_ns: f64,
/// Smallest observed latency, in nanoseconds.
min_ns: u64,
/// Largest observed latency, in nanoseconds.
max_ns: u64,
/// 50th percentile latency, in nanoseconds.
p50_ns: u64,
/// 90th percentile latency, in nanoseconds.
p90_ns: u64,
/// 95th percentile latency, in nanoseconds.
p95_ns: u64,
/// 99th percentile latency, in nanoseconds.
p99_ns: u64,
/// 99.9th percentile latency, in nanoseconds; the report compares this against 0.9ms.
p99_9_ns: u64,
/// 99.99th percentile latency, in nanoseconds.
p99_99_ns: u64,
/// Fraction (0.0 to 1.0) of all measurements that succeeded.
success_rate: f64,
}
/// High-precision latency measurement context.
///
/// Owns both systems under test, the pre-generated inputs they are fed, and
/// the measurements collected so far for each system.
struct LatencyBenchmarkContext {
/// Baseline system under test.
system_a: SystemA,
/// Temporal-solver system under test.
system_b: SystemB,
/// Pre-generated input matrices shared by warmup and measurement.
test_inputs: Vec<DMatrix<f64>>,
/// Measurements recorded for System A by `run_measurements`.
measurements_a: Vec<LatencyMeasurement>,
/// Measurements recorded for System B by `run_measurements`.
measurements_b: Vec<LatencyMeasurement>,
}
impl LatencyBenchmarkContext {
/// Create new benchmark context with pre-initialized systems
fn new() -> Result<Self> {
// Create configurations for both systems
let config_a = Config::default();
let mut config_b = config_a.clone();
// Enable temporal solver features for System B
config_b.system = crate::config::SystemConfig::TemporalSolver(
crate::config::TemporalSolverConfig::default()
);
// Initialize systems
let system_a = SystemA::new(&config_a.model)?;
let system_b = SystemB::new(&config_b.model)?;
// Pre-generate test inputs for consistent benchmarking
let test_inputs = Self::generate_test_inputs(MEASUREMENT_SAMPLES);
Ok(Self {
system_a,
system_b,
test_inputs,
measurements_a: Vec::with_capacity(MEASUREMENT_SAMPLES),
measurements_b: Vec::with_capacity(MEASUREMENT_SAMPLES),
})
}
/// Build `count` input matrices of shape `SEQUENCE_LENGTH` x `FEATURE_DIM`
/// filled with values drawn from `-1.0..1.0`.
///
/// The generator is seeded with a fixed value so repeated runs see the same
/// sequence of inputs.
fn generate_test_inputs(count: usize) -> Vec<DMatrix<f64>> {
    use rand::prelude::*;
    // Fixed seed: the inputs must not vary between benchmark runs.
    let mut rng = StdRng::seed_from_u64(42);
    let mut inputs = Vec::with_capacity(count);
    for _ in 0..count {
        let sample = DMatrix::from_fn(SEQUENCE_LENGTH, FEATURE_DIM, |_, _| {
            rng.gen_range(-1.0..1.0)
        });
        inputs.push(sample);
    }
    inputs
}
/// Run `WARMUP_ITERATIONS` untimed forward passes through both systems so
/// that later measurements are taken in a steady state.
///
/// Inputs are taken from `test_inputs` in order, wrapping around when the
/// iteration count exceeds the number of inputs.
///
/// # Errors
/// Returns the first error produced by either system's `forward` call.
fn warmup(&mut self) -> Result<()> {
    // Progress is reported once per this many iterations.
    const PROGRESS_INTERVAL: usize = 1000;

    println!("Performing warmup with {} iterations...", WARMUP_ITERATIONS);
    let pool_size = self.test_inputs.len();
    for step in 0..WARMUP_ITERATIONS {
        let input = &self.test_inputs[step % pool_size];
        // Results are discarded; only the side effect of running matters here.
        let _ = self.system_a.forward(input)?;
        let _ = self.system_b.forward(input)?;
        if step % PROGRESS_INTERVAL == 0 {
            println!("Warmup progress: {}/{}", step, WARMUP_ITERATIONS);
        }
    }
    println!("Warmup completed");
    Ok(())
}
/// Measure latency for System A with detailed phase breakdown
fn measure_system_a_latency(&mut self, input: &DMatrix<f64>, sample_id: usize) -> LatencyMeasurement {
let mut breakdown = PhaseBreakdown::default();
let start_time = Instant::now();
// Phase 1: Data ingestion
let ingest_start = Instant::now();
let validated_input = input.clone(); // Simulate input validation/copying
breakdown.ingest_ns = ingest_start.elapsed().as_nanos() as u64;
// Phase 2: Neural network forward pass
let network_start = Instant::now();
let result = self.system_a.forward(&validated_input);
breakdown.network_ns = network_start.elapsed().as_nanos() as u64;
// Phase 3: Output finalization
let finalization_start = Instant::now();
let success = result.is_ok();
let error = result.err().map(|e| e.to_string());
breakdown.finalization_ns = finalization_start.elapsed().as_nanos() as u64;
let total_latency_ns = start_time.elapsed().as_nanos() as u64;
LatencyMeasurement {
system_type: "SystemA".to_string(),
sample_id,
latency_ns: total_latency_ns,
phase_breakdown: breakdown,
success,
error,
}
}
/// Measure latency for System B with detailed phase breakdown
fn measure_system_b_latency(&mut self, input: &DMatrix<f64>, sample_id: usize) -> LatencyMeasurement {
let mut breakdown = PhaseBreakdown::default();
let start_time = Instant::now();
// Phase 1: Data ingestion
let ingest_start = Instant::now();
let validated_input = input.clone();
breakdown.ingest_ns = ingest_start.elapsed().as_nanos() as u64;
// Phase 2: Kalman prior computation
let prior_start = Instant::now();
// Note: This would call into the Kalman filter component
// For now, we simulate the prior computation time
std::thread::sleep(Duration::from_nanos(100000)); // 0.1ms simulated prior
breakdown.prior_ns = prior_start.elapsed().as_nanos() as u64;
// Phase 3: Neural network forward pass (residual prediction)
let network_start = Instant::now();
let result = self.system_b.forward(&validated_input);
breakdown.network_ns = network_start.elapsed().as_nanos() as u64;
// Phase 4: Solver gate verification
let gate_start = Instant::now();
// Note: This would call into the solver gate component
// For now, we simulate the gate verification time
std::thread::sleep(Duration::from_nanos(200000)); // 0.2ms simulated gate
breakdown.gate_ns = gate_start.elapsed().as_nanos() as u64;
// Phase 5: Output finalization
let finalization_start = Instant::now();
let success = result.is_ok();
let error = result.err().map(|e| e.to_string());
breakdown.finalization_ns = finalization_start.elapsed().as_nanos() as u64;
let total_latency_ns = start_time.elapsed().as_nanos() as u64;
LatencyMeasurement {
system_type: "SystemB".to_string(),
sample_id,
latency_ns: total_latency_ns,
phase_breakdown: breakdown,
success,
error,
}
}
/// Collect up to `MEASUREMENT_SAMPLES` latency measurements for each system.
///
/// Previously stored measurements are discarded first. For every input,
/// System A is measured and then System B, so both systems see the same
/// data in the same order. If fewer than `MEASUREMENT_SAMPLES` inputs are
/// available, every available input is measured once.
///
/// # Errors
/// Currently always returns `Ok(())`; failures of individual predictions
/// are recorded inside the measurements instead of being propagated.
fn run_measurements(&mut self) -> Result<()> {
    // Progress is reported once per this many samples.
    const PROGRESS_INTERVAL: usize = 10000;

    println!("Running {} latency measurements for each system...", MEASUREMENT_SAMPLES);
    self.measurements_a.clear();
    self.measurements_b.clear();

    // The measure_* methods take `&mut self`, so an input still borrowed from
    // `self.test_inputs` cannot be passed to them. Move the inputs out for the
    // duration of the loop (no copying) and put them back afterwards.
    let inputs = std::mem::take(&mut self.test_inputs);
    for (i, input) in inputs.iter().enumerate().take(MEASUREMENT_SAMPLES) {
        // Measure System A
        let measurement_a = self.measure_system_a_latency(input, i);
        self.measurements_a.push(measurement_a);
        // Measure System B
        let measurement_b = self.measure_system_b_latency(input, i);
        self.measurements_b.push(measurement_b);
        if i % PROGRESS_INTERVAL == 0 {
            println!("Measurement progress: {}/{}", i, MEASUREMENT_SAMPLES);
        }
    }
    self.test_inputs = inputs;

    println!("Measurements completed");
    Ok(())
}
/// Summarize the latencies of the successful entries in `measurements`.
///
/// Failed measurements contribute only to `success_rate`. When no
/// measurement succeeded (including an empty slice), every field of the
/// result is zero.
///
/// Percentiles use the nearest-rank rule on the sorted latencies: the value
/// at 1-based rank `ceil(p / 100 * count)`, clamped to the valid range.
fn calculate_statistics(&self, measurements: &[LatencyMeasurement]) -> LatencyStatistics {
    // Collect straight into the vector that gets sorted; no intermediate
    // copies of the data are needed.
    let mut sorted_latencies: Vec<u64> = measurements
        .iter()
        .filter(|m| m.success)
        .map(|m| m.latency_ns)
        .collect();
    if sorted_latencies.is_empty() {
        return LatencyStatistics {
            count: 0,
            mean_ns: 0.0,
            std_dev_ns: 0.0,
            min_ns: 0,
            max_ns: 0,
            p50_ns: 0,
            p90_ns: 0,
            p95_ns: 0,
            p99_ns: 0,
            p99_9_ns: 0,
            p99_99_ns: 0,
            success_rate: 0.0,
        };
    }
    sorted_latencies.sort_unstable();
    let count = sorted_latencies.len();
    let mean_ns = sorted_latencies.iter().sum::<u64>() as f64 / count as f64;
    // Variance over the whole sample (divides by `count`, not `count - 1`).
    let variance = sorted_latencies
        .iter()
        .map(|&x| {
            let diff = x as f64 - mean_ns;
            diff * diff
        })
        .sum::<f64>()
        / count as f64;
    let std_dev_ns = variance.sqrt();
    let percentile = |p: f64| -> u64 {
        let rank = ((count as f64) * p / 100.0).ceil() as usize;
        // `saturating_sub` keeps a rank of 0 (p <= 0) from underflowing;
        // `min` keeps p > 100 inside the slice.
        sorted_latencies[rank.saturating_sub(1).min(count - 1)]
    };
    LatencyStatistics {
        count,
        mean_ns,
        std_dev_ns,
        min_ns: sorted_latencies[0],
        max_ns: sorted_latencies[count - 1],
        p50_ns: percentile(50.0),
        p90_ns: percentile(90.0),
        p95_ns: percentile(95.0),
        p99_ns: percentile(99.0),
        p99_9_ns: percentile(99.9),
        p99_99_ns: percentile(99.99),
        // `count` is the number of successes; `measurements` is non-empty here.
        success_rate: count as f64 / measurements.len() as f64,
    }
}
/// Generate comprehensive latency report
fn generate_report(&self) -> String {
let stats_a = self.calculate_statistics(&self.measurements_a);
let stats_b = self.calculate_statistics(&self.measurements_b);
let mut report = String::new();
report.push_str("# Latency Benchmark Report\n\n");
report.push_str(&format!("**Measurement Configuration:**\n"));
report.push_str(&format!("- Sample size: {}\n", MEASUREMENT_SAMPLES));
report.push_str(&format!("- Warmup iterations: {}\n", WARMUP_ITERATIONS));
report.push_str(&format!("- Input dimensions: {}x{}\n", SEQUENCE_LENGTH, FEATURE_DIM));
report.push_str(&format!("- Target P99.9 latency: <0.9ms\n\n"));
// System A Results
report.push_str("## System A (Traditional Micro-Net) Results\n\n");
report.push_str(&format!("| Metric | Value |\n"));
report.push_str(&format!("|--------|-------|\n"));
report.push_str(&format!("| Sample Count | {} |\n", stats_a.count));
report.push_str(&format!("| Success Rate | {:.2}% |\n", stats_a.success_rate * 100.0));
report.push_str(&format!("| Mean Latency | {:.3}ms |\n", stats_a.mean_ns as f64 / 1_000_000.0));
report.push_str(&format!("| Std Dev | {:.3}ms |\n", stats_a.std_dev_ns / 1_000_000.0));
report.push_str(&format!("| Min Latency | {:.3}ms |\n", stats_a.min_ns as f64 / 1_000_000.0));
report.push_str(&format!("| P50 Latency | {:.3}ms |\n", stats_a.p50_ns as f64 / 1_000_000.0));
report.push_str(&format!("| P90 Latency | {:.3}ms |\n", stats_a.p90_ns as f64 / 1_000_000.0));
report.push_str(&format!("| P95 Latency | {:.3}ms |\n", stats_a.p95_ns as f64 / 1_000_000.0));
report.push_str(&format!("| P99 Latency | {:.3}ms |\n", stats_a.p99_ns as f64 / 1_000_000.0));
report.push_str(&format!("| **P99.9 Latency** | **{:.3}ms** |\n", stats_a.p99_9_ns as f64 / 1_000_000.0));
report.push_str(&format!("| P99.99 Latency | {:.3}ms |\n", stats_a.p99_99_ns as f64 / 1_000_000.0));
report.push_str(&format!("| Max Latency | {:.3}ms |\n\n", stats_a.max_ns as f64 / 1_000_000.0));
// System B Results
report.push_str("## System B (Temporal Solver Net) Results\n\n");
report.push_str(&format!("| Metric | Value |\n"));
report.push_str(&format!("|--------|-------|\n"));
report.push_str(&format!("| Sample Count | {} |\n", stats_b.count));
report.push_str(&format!("| Success Rate | {:.2}% |\n", stats_b.success_rate * 100.0));
report.push_str(&format!("| Mean Latency | {:.3}ms |\n", stats_b.mean_ns as f64 / 1_000_000.0));
report.push_str(&format!("| Std Dev | {:.3}ms |\n", stats_b.std_dev_ns / 1_000_000.0));
report.push_str(&format!("| Min Latency | {:.3}ms |\n", stats_b.min_ns as f64 / 1_000_000.0));
report.push_str(&format!("| P50 Latency | {:.3}ms |\n", stats_b.p50_ns as f64 / 1_000_000.0));
report.push_str(&format!("| P90 Latency | {:.3}ms |\n", stats_b.p90_ns as f64 / 1_000_000.0));
report.push_str(&format!("| P95 Latency | {:.3}ms |\n", stats_b.p95_ns as f64 / 1_000_000.0));
report.push_str(&format!("| P99 Latency | {:.3}ms |\n", stats_b.p99_ns as f64 / 1_000_000.0));
report.push_str(&format!("| **P99.9 Latency** | **{:.3}ms** |\n", stats_b.p99_9_ns as f64 / 1_000_000.0));
report.push_str(&format!("| P99.99 Latency | {:.3}ms |\n", stats_b.p99_99_ns as f64 / 1_000_000.0));
report.push_str(&format!("| Max Latency | {:.3}ms |\n\n", stats_b.max_ns as f64 / 1_000_000.0));
// Comparison Analysis
report.push_str("## Comparative Analysis\n\n");
let p99_9_improvement = (stats_a.p99_9_ns as f64 - stats_b.p99_9_ns as f64) / stats_a.p99_9_ns as f64 * 100.0;
let mean_improvement = (stats_a.mean_ns - stats_b.mean_ns) / stats_a.mean_ns * 100.0;
report.push_str(&format!("| Comparison Metric | Value |\n"));
report.push_str(&format!("|-------------------|-------|\n"));
report.push_str(&format!("| P99.9 Latency Improvement | {:.1}% |\n", p99_9_improvement));
report.push_str(&format!("| Mean Latency Improvement | {:.1}% |\n", mean_improvement));
report.push_str(&format!("| System B P99.9 Target (<0.9ms) | {} |\n",
if stats_b.p99_9_ns < 900_000 { "✅ ACHIEVED" } else { "❌ NOT ACHIEVED" }));
report.push_str(&format!("| 20% Improvement Target | {} |\n",
if p99_9_improvement >= 20.0 { "✅ ACHIEVED" } else { "❌ NOT ACHIEVED" }));
// Success Criteria Validation
report.push_str("\n## Success Criteria Validation\n\n");
let criterion_1 = stats_b.p99_9_ns < 900_000; // <0.9ms
let criterion_2 = p99_9_improvement >= 20.0; // ≥20% improvement
report.push_str(&format!("1. **System B P99.9 latency <0.9ms**: {} ({:.3}ms)\n",
if criterion_1 { "✅ PASSED" } else { "❌ FAILED" },
stats_b.p99_9_ns as f64 / 1_000_000.0));
report.push_str(&format!("2. **≥20% latency improvement**: {} ({:.1}%)\n",
if criterion_2 { "✅ PASSED" } else { "❌ FAILED" },
p99_9_improvement));
let overall_success = criterion_1 || criterion_2;
report.push_str(&format!("\n**Overall Result**: {}\n",
if overall_success { "🎉 SUCCESS - Breakthrough achieved!" } else { "⚠️ Needs improvement" }));
report
}
}
/// Criterion benchmark: repeated System A forward passes on one fixed input.
///
/// The context is built and warmed up first, each inside the Tokio runtime's
/// `block_on`, before Criterion starts timing.
fn bench_system_a_latency(c: &mut Criterion) {
    let runtime = tokio::runtime::Runtime::new().unwrap();
    let mut context = runtime.block_on(async {
        LatencyBenchmarkContext::new().expect("Failed to create benchmark context")
    });
    runtime.block_on(async {
        context.warmup().expect("Warmup failed");
    });
    // Every element is 0.5, so each iteration does identical work.
    let constant_input = DMatrix::from_element(SEQUENCE_LENGTH, FEATURE_DIM, 0.5);
    c.bench_function("system_a_forward_pass", |bencher| {
        bencher.iter(|| {
            let prediction = context
                .system_a
                .forward(black_box(&constant_input))
                .unwrap();
            black_box(prediction)
        })
    });
}
/// Criterion benchmark: repeated System B forward passes on one fixed input.
///
/// The context is built and warmed up first, each inside the Tokio runtime's
/// `block_on`, before Criterion starts timing.
fn bench_system_b_latency(c: &mut Criterion) {
    let runtime = tokio::runtime::Runtime::new().unwrap();
    let mut context = runtime.block_on(async {
        LatencyBenchmarkContext::new().expect("Failed to create benchmark context")
    });
    runtime.block_on(async {
        context.warmup().expect("Warmup failed");
    });
    // Every element is 0.5, so each iteration does identical work.
    let constant_input = DMatrix::from_element(SEQUENCE_LENGTH, FEATURE_DIM, 0.5);
    c.bench_function("system_b_forward_pass", |bencher| {
        bencher.iter(|| {
            let prediction = context
                .system_b
                .forward(black_box(&constant_input))
                .unwrap();
            black_box(prediction)
        })
    });
}
/// Full latency distribution run: warm up, measure both systems, and write
/// the Markdown report to `latency_benchmark_report.md` in the working
/// directory.
///
/// This target does not register anything with Criterion; `c` is accepted
/// only so the function can be listed in the benchmark group.
fn bench_latency_distribution(c: &mut Criterion) {
    let runtime = tokio::runtime::Runtime::new().unwrap();
    let mut context = runtime.block_on(async {
        LatencyBenchmarkContext::new().expect("Failed to create benchmark context")
    });
    runtime.block_on(async {
        context.warmup().expect("Warmup failed");
        context.run_measurements().expect("Measurements failed");
        // Render the report and persist it next to the benchmark run.
        let report_path = "latency_benchmark_report.md";
        let markdown = context.generate_report();
        std::fs::write(report_path, markdown).expect("Failed to save report");
        println!("✅ Latency benchmark completed!");
        println!("📊 Report saved to: latency_benchmark_report.md");
    });
}
// Register the three benchmark targets as one Criterion group. The group
// replaces Criterion's default settings with 1000 samples, a 10 second
// measurement window and a 3 second warm-up per benchmark.
criterion_group!(
name = latency_benches;
config = Criterion::default()
.sample_size(1000)
.measurement_time(Duration::from_secs(10))
.warm_up_time(Duration::from_secs(3));
targets = bench_system_a_latency, bench_system_b_latency, bench_latency_distribution
);
// Generate the benchmark binary's entry point, which runs `latency_benches`.
criterion_main!(latency_benches);