433 lines
16 KiB
Rust
433 lines
16 KiB
Rust
//! Throughput benchmark for System A and System B
|
|
//!
|
|
//! This benchmark measures prediction throughput (predictions per second)
|
|
//! under various load conditions and batch sizes.
|
|
|
|
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
|
use std::time::{Duration, Instant};
|
|
use temporal_neural_net::prelude::*;
|
|
use nalgebra::{DMatrix, DVector};
|
|
use rayon::prelude::*;
|
|
use std::sync::Arc;
|
|
|
|
/// Test configuration
|
|
const SEQUENCE_LENGTH: usize = 64;
|
|
const FEATURE_DIM: usize = 4;
|
|
const THROUGHPUT_TEST_DURATION_SEC: u64 = 30;
|
|
const BATCH_SIZES: &[usize] = &[1, 4, 8, 16, 32, 64, 128];
|
|
const CONCURRENT_THREADS: &[usize] = &[1, 2, 4, 8];
|
|
|
|
/// Throughput measurement result
|
|
#[derive(Debug, Clone)]
|
|
struct ThroughputMeasurement {
|
|
system_type: String,
|
|
batch_size: usize,
|
|
thread_count: usize,
|
|
duration_ms: u64,
|
|
total_predictions: usize,
|
|
throughput_pred_per_sec: f64,
|
|
avg_latency_ms: f64,
|
|
memory_usage_mb: f64,
|
|
cpu_utilization: f64,
|
|
error_rate: f64,
|
|
}
|
|
|
|
/// Throughput benchmark context
|
|
struct ThroughputBenchmarkContext {
|
|
system_a: Arc<SystemA>,
|
|
system_b: Arc<SystemB>,
|
|
test_batches: Vec<Vec<DMatrix<f64>>>,
|
|
}
|
|
|
|
impl ThroughputBenchmarkContext {
|
|
/// Create new throughput benchmark context
|
|
fn new() -> Result<Self> {
|
|
let config_a = Config::default();
|
|
let mut config_b = config_a.clone();
|
|
config_b.system = crate::config::SystemConfig::TemporalSolver(
|
|
crate::config::TemporalSolverConfig::default()
|
|
);
|
|
|
|
let system_a = Arc::new(SystemA::new(&config_a.model)?);
|
|
let system_b = Arc::new(SystemB::new(&config_b.model)?);
|
|
|
|
// Pre-generate test batches
|
|
let test_batches = Self::generate_test_batches();
|
|
|
|
Ok(Self {
|
|
system_a,
|
|
system_b,
|
|
test_batches,
|
|
})
|
|
}
|
|
|
|
/// Generate test batches for different batch sizes
|
|
fn generate_test_batches() -> Vec<Vec<DMatrix<f64>>> {
|
|
use rand::prelude::*;
|
|
let mut rng = StdRng::seed_from_u64(42);
|
|
|
|
BATCH_SIZES
|
|
.iter()
|
|
.map(|&batch_size| {
|
|
(0..1000) // Generate 1000 batches for each size
|
|
.map(|_| {
|
|
DMatrix::from_fn(SEQUENCE_LENGTH, FEATURE_DIM, |_, _| {
|
|
rng.gen_range(-1.0..1.0)
|
|
})
|
|
})
|
|
.collect()
|
|
})
|
|
.collect()
|
|
}
|
|
|
|
/// Measure single-threaded throughput
|
|
fn measure_single_threaded_throughput(
|
|
&self,
|
|
system_type: &str,
|
|
batch_size: usize,
|
|
duration_sec: u64,
|
|
) -> ThroughputMeasurement {
|
|
let start_time = Instant::now();
|
|
let duration = Duration::from_secs(duration_sec);
|
|
let batch_index = BATCH_SIZES.iter().position(|&x| x == batch_size).unwrap_or(0);
|
|
let test_batch = &self.test_batches[batch_index];
|
|
|
|
let mut total_predictions = 0;
|
|
let mut total_latency_ms = 0.0;
|
|
let mut errors = 0;
|
|
let mut batch_iter = test_batch.iter().cycle();
|
|
|
|
let memory_start = Self::get_memory_usage_mb();
|
|
|
|
while start_time.elapsed() < duration {
|
|
let input = batch_iter.next().unwrap();
|
|
let prediction_start = Instant::now();
|
|
|
|
let result = match system_type {
|
|
"SystemA" => self.system_a.forward(input),
|
|
"SystemB" => self.system_b.forward(input),
|
|
_ => panic!("Unknown system type"),
|
|
};
|
|
|
|
let prediction_latency = prediction_start.elapsed().as_millis() as f64;
|
|
total_latency_ms += prediction_latency;
|
|
|
|
if result.is_ok() {
|
|
total_predictions += batch_size;
|
|
} else {
|
|
errors += 1;
|
|
}
|
|
}
|
|
|
|
let actual_duration_ms = start_time.elapsed().as_millis() as u64;
|
|
let memory_end = Self::get_memory_usage_mb();
|
|
let throughput_pred_per_sec = (total_predictions as f64) / (actual_duration_ms as f64 / 1000.0);
|
|
let avg_latency_ms = total_latency_ms / (total_predictions as f64 / batch_size as f64);
|
|
let error_rate = errors as f64 / (total_predictions as f64 / batch_size as f64);
|
|
|
|
ThroughputMeasurement {
|
|
system_type: system_type.to_string(),
|
|
batch_size,
|
|
thread_count: 1,
|
|
duration_ms: actual_duration_ms,
|
|
total_predictions,
|
|
throughput_pred_per_sec,
|
|
avg_latency_ms,
|
|
memory_usage_mb: memory_end - memory_start,
|
|
cpu_utilization: Self::get_cpu_utilization(),
|
|
error_rate,
|
|
}
|
|
}
|
|
|
|
/// Measure multi-threaded throughput
|
|
fn measure_multi_threaded_throughput(
|
|
&self,
|
|
system_type: &str,
|
|
batch_size: usize,
|
|
thread_count: usize,
|
|
duration_sec: u64,
|
|
) -> ThroughputMeasurement {
|
|
let start_time = Instant::now();
|
|
let duration = Duration::from_secs(duration_sec);
|
|
let batch_index = BATCH_SIZES.iter().position(|&x| x == batch_size).unwrap_or(0);
|
|
let test_batch = &self.test_batches[batch_index];
|
|
|
|
let memory_start = Self::get_memory_usage_mb();
|
|
|
|
// Create thread pool for parallel execution
|
|
let pool = rayon::ThreadPoolBuilder::new()
|
|
.num_threads(thread_count)
|
|
.build()
|
|
.unwrap();
|
|
|
|
let results = pool.install(|| {
|
|
let chunk_size = test_batch.len() / thread_count;
|
|
let chunks: Vec<_> = test_batch.chunks(chunk_size).collect();
|
|
|
|
chunks
|
|
.par_iter()
|
|
.map(|chunk| {
|
|
let mut thread_predictions = 0;
|
|
let mut thread_latency_ms = 0.0;
|
|
let mut thread_errors = 0;
|
|
let mut chunk_iter = chunk.iter().cycle();
|
|
|
|
while start_time.elapsed() < duration {
|
|
let input = chunk_iter.next().unwrap();
|
|
let prediction_start = Instant::now();
|
|
|
|
let result = match system_type {
|
|
"SystemA" => self.system_a.forward(input),
|
|
"SystemB" => self.system_b.forward(input),
|
|
_ => panic!("Unknown system type"),
|
|
};
|
|
|
|
let prediction_latency = prediction_start.elapsed().as_millis() as f64;
|
|
thread_latency_ms += prediction_latency;
|
|
|
|
if result.is_ok() {
|
|
thread_predictions += batch_size;
|
|
} else {
|
|
thread_errors += 1;
|
|
}
|
|
}
|
|
|
|
(thread_predictions, thread_latency_ms, thread_errors)
|
|
})
|
|
.collect::<Vec<_>>()
|
|
});
|
|
|
|
// Aggregate results from all threads
|
|
let total_predictions: usize = results.iter().map(|(p, _, _)| p).sum();
|
|
let total_latency_ms: f64 = results.iter().map(|(_, l, _)| l).sum();
|
|
let total_errors: usize = results.iter().map(|(_, _, e)| e).sum();
|
|
|
|
let actual_duration_ms = start_time.elapsed().as_millis() as u64;
|
|
let memory_end = Self::get_memory_usage_mb();
|
|
let throughput_pred_per_sec = (total_predictions as f64) / (actual_duration_ms as f64 / 1000.0);
|
|
let avg_latency_ms = total_latency_ms / (total_predictions as f64 / batch_size as f64);
|
|
let error_rate = total_errors as f64 / (total_predictions as f64 / batch_size as f64);
|
|
|
|
ThroughputMeasurement {
|
|
system_type: system_type.to_string(),
|
|
batch_size,
|
|
thread_count,
|
|
duration_ms: actual_duration_ms,
|
|
total_predictions,
|
|
throughput_pred_per_sec,
|
|
avg_latency_ms,
|
|
memory_usage_mb: memory_end - memory_start,
|
|
cpu_utilization: Self::get_cpu_utilization(),
|
|
error_rate,
|
|
}
|
|
}
|
|
|
|
/// Get current memory usage (simplified)
|
|
fn get_memory_usage_mb() -> f64 {
|
|
// This is a placeholder - in a real implementation, you'd use
|
|
// a proper memory profiling library
|
|
42.0 // MB
|
|
}
|
|
|
|
/// Get current CPU utilization (simplified)
|
|
fn get_cpu_utilization() -> f64 {
|
|
// This is a placeholder - in a real implementation, you'd use
|
|
// a proper CPU monitoring library
|
|
85.0 // Percentage
|
|
}
|
|
|
|
/// Generate comprehensive throughput report
|
|
fn generate_throughput_report(&self, measurements: &[ThroughputMeasurement]) -> String {
|
|
let mut report = String::new();
|
|
report.push_str("# Throughput Benchmark Report\n\n");
|
|
|
|
// Group measurements by system type
|
|
let system_a_measurements: Vec<_> = measurements
|
|
.iter()
|
|
.filter(|m| m.system_type == "SystemA")
|
|
.collect();
|
|
let system_b_measurements: Vec<_> = measurements
|
|
.iter()
|
|
.filter(|m| m.system_type == "SystemB")
|
|
.collect();
|
|
|
|
// System A Results
|
|
report.push_str("## System A (Traditional Micro-Net) Throughput Results\n\n");
|
|
report.push_str("| Batch Size | Threads | Throughput (pred/sec) | Avg Latency (ms) | Memory (MB) | Error Rate |\n");
|
|
report.push_str("|------------|---------|----------------------|------------------|-------------|------------|\n");
|
|
|
|
for measurement in &system_a_measurements {
|
|
report.push_str(&format!(
|
|
"| {} | {} | {:.1} | {:.3} | {:.1} | {:.2}% |\n",
|
|
measurement.batch_size,
|
|
measurement.thread_count,
|
|
measurement.throughput_pred_per_sec,
|
|
measurement.avg_latency_ms,
|
|
measurement.memory_usage_mb,
|
|
measurement.error_rate * 100.0
|
|
));
|
|
}
|
|
|
|
// System B Results
|
|
report.push_str("\n## System B (Temporal Solver Net) Throughput Results\n\n");
|
|
report.push_str("| Batch Size | Threads | Throughput (pred/sec) | Avg Latency (ms) | Memory (MB) | Error Rate |\n");
|
|
report.push_str("|------------|---------|----------------------|------------------|-------------|------------|\n");
|
|
|
|
for measurement in &system_b_measurements {
|
|
report.push_str(&format!(
|
|
"| {} | {} | {:.1} | {:.3} | {:.1} | {:.2}% |\n",
|
|
measurement.batch_size,
|
|
measurement.thread_count,
|
|
measurement.throughput_pred_per_sec,
|
|
measurement.avg_latency_ms,
|
|
measurement.memory_usage_mb,
|
|
measurement.error_rate * 100.0
|
|
));
|
|
}
|
|
|
|
// Peak Performance Analysis
|
|
report.push_str("\n## Peak Performance Analysis\n\n");
|
|
|
|
let peak_a = system_a_measurements
|
|
.iter()
|
|
.max_by(|a, b| a.throughput_pred_per_sec.partial_cmp(&b.throughput_pred_per_sec).unwrap())
|
|
.unwrap();
|
|
|
|
let peak_b = system_b_measurements
|
|
.iter()
|
|
.max_by(|a, b| a.throughput_pred_per_sec.partial_cmp(&b.throughput_pred_per_sec).unwrap())
|
|
.unwrap();
|
|
|
|
report.push_str(&format!("**System A Peak Performance:**\n"));
|
|
report.push_str(&format!("- Throughput: {:.1} predictions/sec\n", peak_a.throughput_pred_per_sec));
|
|
report.push_str(&format!("- Configuration: Batch size {}, {} threads\n", peak_a.batch_size, peak_a.thread_count));
|
|
report.push_str(&format!("- Latency: {:.3}ms\n\n", peak_a.avg_latency_ms));
|
|
|
|
report.push_str(&format!("**System B Peak Performance:**\n"));
|
|
report.push_str(&format!("- Throughput: {:.1} predictions/sec\n", peak_b.throughput_pred_per_sec));
|
|
report.push_str(&format!("- Configuration: Batch size {}, {} threads\n", peak_b.batch_size, peak_b.thread_count));
|
|
report.push_str(&format!("- Latency: {:.3}ms\n\n", peak_b.avg_latency_ms));
|
|
|
|
// Throughput Improvement Analysis
|
|
let throughput_improvement = (peak_b.throughput_pred_per_sec - peak_a.throughput_pred_per_sec)
|
|
/ peak_a.throughput_pred_per_sec * 100.0;
|
|
|
|
report.push_str("## Comparative Analysis\n\n");
|
|
report.push_str(&format!("| Metric | System A | System B | Improvement |\n"));
|
|
report.push_str(&format!("|--------|----------|----------|-------------|\n"));
|
|
report.push_str(&format!("| Peak Throughput | {:.1} pred/sec | {:.1} pred/sec | {:.1}% |\n",
|
|
peak_a.throughput_pred_per_sec, peak_b.throughput_pred_per_sec, throughput_improvement));
|
|
report.push_str(&format!("| Best Latency | {:.3}ms | {:.3}ms | {:.1}% |\n",
|
|
peak_a.avg_latency_ms, peak_b.avg_latency_ms,
|
|
(peak_a.avg_latency_ms - peak_b.avg_latency_ms) / peak_a.avg_latency_ms * 100.0));
|
|
|
|
report
|
|
}
|
|
}
|
|
|
|
/// Benchmark throughput for different batch sizes
|
|
fn bench_batch_size_throughput(c: &mut Criterion) {
|
|
let rt = tokio::runtime::Runtime::new().unwrap();
|
|
let context = rt.block_on(async {
|
|
ThroughputBenchmarkContext::new().expect("Failed to create context")
|
|
});
|
|
|
|
let mut group = c.benchmark_group("batch_throughput");
|
|
|
|
for &batch_size in BATCH_SIZES {
|
|
group.throughput(Throughput::Elements(batch_size as u64));
|
|
|
|
group.bench_with_input(
|
|
BenchmarkId::new("SystemA", batch_size),
|
|
&batch_size,
|
|
|b, &batch_size| {
|
|
let test_input = DMatrix::from_fn(SEQUENCE_LENGTH, FEATURE_DIM, |_, _| 0.5);
|
|
b.iter(|| {
|
|
for _ in 0..batch_size {
|
|
black_box(context.system_a.forward(black_box(&test_input)).unwrap());
|
|
}
|
|
});
|
|
},
|
|
);
|
|
|
|
group.bench_with_input(
|
|
BenchmarkId::new("SystemB", batch_size),
|
|
&batch_size,
|
|
|b, &batch_size| {
|
|
let test_input = DMatrix::from_fn(SEQUENCE_LENGTH, FEATURE_DIM, |_, _| 0.5);
|
|
b.iter(|| {
|
|
for _ in 0..batch_size {
|
|
black_box(context.system_b.forward(black_box(&test_input)).unwrap());
|
|
}
|
|
});
|
|
},
|
|
);
|
|
}
|
|
|
|
group.finish();
|
|
}
|
|
|
|
/// Comprehensive throughput analysis
|
|
fn bench_comprehensive_throughput(c: &mut Criterion) {
|
|
let rt = tokio::runtime::Runtime::new().unwrap();
|
|
|
|
rt.block_on(async {
|
|
let context = ThroughputBenchmarkContext::new()
|
|
.expect("Failed to create benchmark context");
|
|
|
|
let mut measurements = Vec::new();
|
|
|
|
println!("Running comprehensive throughput benchmarks...");
|
|
|
|
// Test all combinations of batch sizes and thread counts
|
|
for &batch_size in BATCH_SIZES {
|
|
for &thread_count in CONCURRENT_THREADS {
|
|
println!("Testing batch_size={}, threads={}", batch_size, thread_count);
|
|
|
|
// Test System A
|
|
if thread_count == 1 {
|
|
let measurement_a = context.measure_single_threaded_throughput(
|
|
"SystemA", batch_size, 5
|
|
);
|
|
measurements.push(measurement_a);
|
|
} else {
|
|
let measurement_a = context.measure_multi_threaded_throughput(
|
|
"SystemA", batch_size, thread_count, 5
|
|
);
|
|
measurements.push(measurement_a);
|
|
}
|
|
|
|
// Test System B
|
|
if thread_count == 1 {
|
|
let measurement_b = context.measure_single_threaded_throughput(
|
|
"SystemB", batch_size, 5
|
|
);
|
|
measurements.push(measurement_b);
|
|
} else {
|
|
let measurement_b = context.measure_multi_threaded_throughput(
|
|
"SystemB", batch_size, thread_count, 5
|
|
);
|
|
measurements.push(measurement_b);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Generate and save report
|
|
let report = context.generate_throughput_report(&measurements);
|
|
std::fs::write("throughput_benchmark_report.md", report)
|
|
.expect("Failed to save throughput report");
|
|
|
|
println!("✅ Throughput benchmark completed!");
|
|
println!("📊 Report saved to: throughput_benchmark_report.md");
|
|
});
|
|
}
|
|
|
|
criterion_group!(
|
|
name = throughput_benches;
|
|
config = Criterion::default()
|
|
.sample_size(100)
|
|
.measurement_time(Duration::from_secs(30))
|
|
.warm_up_time(Duration::from_secs(5));
|
|
targets = bench_batch_size_throughput, bench_comprehensive_throughput
|
|
);
|
|
criterion_main!(throughput_benches); |