630 lines
24 KiB
Rust
630 lines
24 KiB
Rust
//! Statistical analysis benchmark for System A vs System B comparison
//!
//! This benchmark performs rigorous statistical tests to validate the
//! performance differences between systems, including effect size calculations.
|
||
|
||
use criterion::{criterion_group, criterion_main, Criterion};
|
||
use std::time::Duration;
|
||
use temporal_neural_net::prelude::*;
|
||
use nalgebra::{DMatrix, DVector};
|
||
|
||
/// How many random input matrices are generated for the analysis; one latency
/// measurement per system is taken for each of them.
const STATISTICAL_SAMPLES: usize = 10_000;
|
||
|
||
/// Outcome of a single hypothesis test, in the shape consumed by the report generator.
#[derive(Clone, Debug)]
struct StatisticalTestResult {
    /// Human-readable name of the test; used as a heading in the report.
    test_name: String,
    /// Value of the test statistic (t for the paired t-test, U for Mann-Whitney).
    test_statistic: f64,
    /// Two-tailed p-value.
    p_value: f64,
    /// 95% confidence interval as `(lower, upper)`. The report omits the row
    /// when both bounds are exactly `0.0`, which is how "not computed" is encoded.
    confidence_interval: (f64, f64),
    /// Effect size reported alongside the test.
    effect_size: f64,
    /// One-line textual summary of the result.
    interpretation: String,
    /// Whether `p_value` is below the 0.05 significance level.
    significant: bool,
}
|
||
|
||
/// Qualitative magnitude of a standardized effect size.
///
/// The thresholds noted on each variant are the ones applied by
/// `classify_effect_size` to the absolute value of the effect.
///
/// The type is a plain field-less enum, so it is `Copy` and comparable; this
/// lets callers pass magnitudes by value and compare them with `==`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum EffectSize {
    /// Magnitude below 0.2.
    Negligible,
    /// Magnitude in `[0.2, 0.5)`.
    Small,
    /// Magnitude in `[0.5, 0.8)`.
    Medium,
    /// Magnitude of 0.8 or more.
    Large,
}
|
||
|
||
/// Comprehensive statistical analysis context
|
||
struct StatisticalAnalysisContext {
|
||
system_a: SystemA,
|
||
system_b: SystemB,
|
||
test_inputs: Vec<DMatrix<f64>>,
|
||
}
|
||
|
||
impl StatisticalAnalysisContext {
|
||
/// Create new statistical analysis context
|
||
fn new() -> Result<Self> {
|
||
let config_a = Config::default();
|
||
let mut config_b = config_a.clone();
|
||
config_b.system = crate::config::SystemConfig::TemporalSolver(
|
||
crate::config::TemporalSolverConfig::default()
|
||
);
|
||
|
||
let system_a = SystemA::new(&config_a.model)?;
|
||
let system_b = SystemB::new(&config_b.model)?;
|
||
|
||
// Generate test inputs
|
||
let test_inputs = Self::generate_test_inputs();
|
||
|
||
Ok(Self {
|
||
system_a,
|
||
system_b,
|
||
test_inputs,
|
||
})
|
||
}
|
||
|
||
/// Generate test inputs for statistical analysis
|
||
fn generate_test_inputs() -> Vec<DMatrix<f64>> {
|
||
use rand::prelude::*;
|
||
let mut rng = StdRng::seed_from_u64(12345);
|
||
|
||
(0..STATISTICAL_SAMPLES)
|
||
.map(|_| {
|
||
DMatrix::from_fn(64, 4, |_, _| {
|
||
rng.gen_range(-1.0..1.0)
|
||
})
|
||
})
|
||
.collect()
|
||
}
|
||
|
||
/// Collect latency samples for both systems
|
||
fn collect_latency_samples(&mut self) -> Result<(Vec<f64>, Vec<f64>)> {
|
||
use std::time::Instant;
|
||
|
||
let mut system_a_latencies = Vec::new();
|
||
let mut system_b_latencies = Vec::new();
|
||
|
||
println!("Collecting {} latency samples for statistical analysis...", STATISTICAL_SAMPLES);
|
||
|
||
for (i, input) in self.test_inputs.iter().enumerate() {
|
||
// Measure System A
|
||
let start_a = Instant::now();
|
||
let _ = self.system_a.forward(input)?;
|
||
let latency_a = start_a.elapsed().as_micros() as f64; // microseconds
|
||
system_a_latencies.push(latency_a);
|
||
|
||
// Measure System B
|
||
let start_b = Instant::now();
|
||
let _ = self.system_b.forward(input)?;
|
||
let latency_b = start_b.elapsed().as_micros() as f64; // microseconds
|
||
system_b_latencies.push(latency_b);
|
||
|
||
if i % 1000 == 0 {
|
||
println!("Progress: {}/{}", i, STATISTICAL_SAMPLES);
|
||
}
|
||
}
|
||
|
||
Ok((system_a_latencies, system_b_latencies))
|
||
}
|
||
|
||
/// Perform paired t-test
|
||
fn paired_t_test(&self, sample_a: &[f64], sample_b: &[f64]) -> StatisticalTestResult {
|
||
let n = sample_a.len() as f64;
|
||
|
||
// Calculate differences
|
||
let differences: Vec<f64> = sample_a.iter()
|
||
.zip(sample_b.iter())
|
||
.map(|(a, b)| a - b)
|
||
.collect();
|
||
|
||
// Calculate mean difference
|
||
let mean_diff = differences.iter().sum::<f64>() / n;
|
||
|
||
// Calculate standard deviation of differences
|
||
let variance = differences.iter()
|
||
.map(|d| (d - mean_diff).powi(2))
|
||
.sum::<f64>() / (n - 1.0);
|
||
let std_dev = variance.sqrt();
|
||
|
||
// Calculate t-statistic
|
||
let t_statistic = mean_diff / (std_dev / n.sqrt());
|
||
|
||
// Degrees of freedom
|
||
let df = n - 1.0;
|
||
|
||
// Calculate p-value (simplified - using t-distribution approximation)
|
||
let p_value = self.t_distribution_p_value(t_statistic, df);
|
||
|
||
// Calculate confidence interval (95%)
|
||
let t_critical = 1.96; // Approximate for large samples
|
||
let margin_error = t_critical * (std_dev / n.sqrt());
|
||
let confidence_interval = (mean_diff - margin_error, mean_diff + margin_error);
|
||
|
||
// Calculate effect size (Cohen's d for paired samples)
|
||
let effect_size = mean_diff / std_dev;
|
||
|
||
// Interpret results
|
||
let significant = p_value < 0.05;
|
||
let interpretation = format!(
|
||
"Mean difference: {:.3}μs, 95% CI: ({:.3}, {:.3}), Cohen's d: {:.3}",
|
||
mean_diff, confidence_interval.0, confidence_interval.1, effect_size
|
||
);
|
||
|
||
StatisticalTestResult {
|
||
test_name: "Paired t-test".to_string(),
|
||
test_statistic: t_statistic,
|
||
p_value,
|
||
confidence_interval,
|
||
effect_size,
|
||
interpretation,
|
||
significant,
|
||
}
|
||
}
|
||
|
||
/// Perform Mann-Whitney U test (Wilcoxon rank-sum test)
|
||
fn mann_whitney_u_test(&self, sample_a: &[f64], sample_b: &[f64]) -> StatisticalTestResult {
|
||
let n1 = sample_a.len();
|
||
let n2 = sample_b.len();
|
||
|
||
// Combine and rank all observations
|
||
let mut combined: Vec<(f64, usize)> = Vec::new();
|
||
for (i, &val) in sample_a.iter().enumerate() {
|
||
combined.push((val, 0)); // 0 for group A
|
||
}
|
||
for (i, &val) in sample_b.iter().enumerate() {
|
||
combined.push((val, 1)); // 1 for group B
|
||
}
|
||
|
||
// Sort by value
|
||
combined.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
|
||
|
||
// Assign ranks (handling ties with average ranks)
|
||
let mut ranks = vec![0.0; combined.len()];
|
||
let mut i = 0;
|
||
while i < combined.len() {
|
||
let mut j = i;
|
||
while j < combined.len() && combined[j].0 == combined[i].0 {
|
||
j += 1;
|
||
}
|
||
let avg_rank = (i + j + 1) as f64 / 2.0;
|
||
for k in i..j {
|
||
ranks[k] = avg_rank;
|
||
}
|
||
i = j;
|
||
}
|
||
|
||
// Calculate rank sums
|
||
let mut rank_sum_a = 0.0;
|
||
let mut rank_sum_b = 0.0;
|
||
for (i, (_, group)) in combined.iter().enumerate() {
|
||
if *group == 0 {
|
||
rank_sum_a += ranks[i];
|
||
} else {
|
||
rank_sum_b += ranks[i];
|
||
}
|
||
}
|
||
|
||
// Calculate U statistics
|
||
let u1 = rank_sum_a - (n1 * (n1 + 1)) as f64 / 2.0;
|
||
let u2 = rank_sum_b - (n2 * (n2 + 1)) as f64 / 2.0;
|
||
let u_statistic = u1.min(u2);
|
||
|
||
// Calculate z-score for normal approximation
|
||
let mean_u = (n1 * n2) as f64 / 2.0;
|
||
let std_u = ((n1 * n2 * (n1 + n2 + 1)) as f64 / 12.0).sqrt();
|
||
let z_score = (u_statistic - mean_u) / std_u;
|
||
|
||
// Calculate p-value (two-tailed)
|
||
let p_value = 2.0 * (1.0 - self.standard_normal_cdf(z_score.abs()));
|
||
|
||
// Effect size (rank-biserial correlation)
|
||
let effect_size = 1.0 - (2.0 * u_statistic) / (n1 * n2) as f64;
|
||
|
||
let significant = p_value < 0.05;
|
||
let interpretation = format!(
|
||
"U statistic: {:.1}, Z-score: {:.3}, Effect size (r): {:.3}",
|
||
u_statistic, z_score, effect_size
|
||
);
|
||
|
||
StatisticalTestResult {
|
||
test_name: "Mann-Whitney U test".to_string(),
|
||
test_statistic: u_statistic,
|
||
p_value,
|
||
confidence_interval: (0.0, 0.0), // Not typically calculated for U test
|
||
effect_size,
|
||
interpretation,
|
||
significant,
|
||
}
|
||
}
|
||
|
||
/// Calculate bootstrap confidence interval for difference in means
|
||
fn bootstrap_confidence_interval(&self, sample_a: &[f64], sample_b: &[f64], n_bootstrap: usize) -> (f64, f64) {
|
||
use rand::prelude::*;
|
||
let mut rng = StdRng::seed_from_u64(42);
|
||
|
||
let mut bootstrap_diffs = Vec::new();
|
||
|
||
for _ in 0..n_bootstrap {
|
||
// Bootstrap resample both groups
|
||
let bootstrap_a: Vec<f64> = (0..sample_a.len())
|
||
.map(|_| sample_a[rng.gen_range(0..sample_a.len())])
|
||
.collect();
|
||
let bootstrap_b: Vec<f64> = (0..sample_b.len())
|
||
.map(|_| sample_b[rng.gen_range(0..sample_b.len())])
|
||
.collect();
|
||
|
||
// Calculate means
|
||
let mean_a = bootstrap_a.iter().sum::<f64>() / bootstrap_a.len() as f64;
|
||
let mean_b = bootstrap_b.iter().sum::<f64>() / bootstrap_b.len() as f64;
|
||
|
||
bootstrap_diffs.push(mean_a - mean_b);
|
||
}
|
||
|
||
bootstrap_diffs.sort_by(|a, b| a.partial_cmp(b).unwrap());
|
||
|
||
// 95% confidence interval
|
||
let lower_idx = ((n_bootstrap as f64) * 0.025) as usize;
|
||
let upper_idx = ((n_bootstrap as f64) * 0.975) as usize;
|
||
|
||
(bootstrap_diffs[lower_idx], bootstrap_diffs[upper_idx])
|
||
}
|
||
|
||
/// Calculate various effect size measures
|
||
fn calculate_effect_sizes(&self, sample_a: &[f64], sample_b: &[f64]) -> Vec<(String, f64, EffectSize)> {
|
||
let mean_a = sample_a.iter().sum::<f64>() / sample_a.len() as f64;
|
||
let mean_b = sample_b.iter().sum::<f64>() / sample_b.len() as f64;
|
||
|
||
let var_a = sample_a.iter()
|
||
.map(|x| (x - mean_a).powi(2))
|
||
.sum::<f64>() / (sample_a.len() - 1) as f64;
|
||
let var_b = sample_b.iter()
|
||
.map(|x| (x - mean_b).powi(2))
|
||
.sum::<f64>() / (sample_b.len() - 1) as f64;
|
||
|
||
let pooled_std = ((var_a + var_b) / 2.0).sqrt();
|
||
|
||
let mut effect_sizes = Vec::new();
|
||
|
||
// Cohen's d
|
||
let cohens_d = (mean_a - mean_b) / pooled_std;
|
||
effect_sizes.push(("Cohen's d".to_string(), cohens_d, self.classify_effect_size(cohens_d.abs())));
|
||
|
||
// Glass's Δ (using sample_a as control)
|
||
let glass_delta = (mean_a - mean_b) / var_a.sqrt();
|
||
effect_sizes.push(("Glass's Δ".to_string(), glass_delta, self.classify_effect_size(glass_delta.abs())));
|
||
|
||
// Hedge's g (bias-corrected Cohen's d)
|
||
let n = sample_a.len() + sample_b.len();
|
||
let correction = 1.0 - 3.0 / (4.0 * n as f64 - 9.0);
|
||
let hedges_g = cohens_d * correction;
|
||
effect_sizes.push(("Hedge's g".to_string(), hedges_g, self.classify_effect_size(hedges_g.abs())));
|
||
|
||
// Common Language Effect Size (probability of superiority)
|
||
let cles = self.calculate_cles(sample_a, sample_b);
|
||
effect_sizes.push(("CLES".to_string(), cles, self.classify_effect_size((cles - 0.5).abs() * 2.0)));
|
||
|
||
effect_sizes
|
||
}
|
||
|
||
/// Calculate Common Language Effect Size
|
||
fn calculate_cles(&self, sample_a: &[f64], sample_b: &[f64]) -> f64 {
|
||
let mut count = 0;
|
||
let mut total = 0;
|
||
|
||
for &a in sample_a {
|
||
for &b in sample_b {
|
||
total += 1;
|
||
if a > b {
|
||
count += 1;
|
||
}
|
||
}
|
||
}
|
||
|
||
count as f64 / total as f64
|
||
}
|
||
|
||
/// Classify effect size magnitude
|
||
fn classify_effect_size(&self, effect_size: f64) -> EffectSize {
|
||
if effect_size < 0.2 {
|
||
EffectSize::Negligible
|
||
} else if effect_size < 0.5 {
|
||
EffectSize::Small
|
||
} else if effect_size < 0.8 {
|
||
EffectSize::Medium
|
||
} else {
|
||
EffectSize::Large
|
||
}
|
||
}
|
||
|
||
/// Simplified t-distribution p-value calculation
|
||
fn t_distribution_p_value(&self, t: f64, df: f64) -> f64 {
|
||
// Simplified approximation - in practice, use a proper statistical library
|
||
let z = t / (1.0 + t.powi(2) / (4.0 * df)).sqrt();
|
||
2.0 * (1.0 - self.standard_normal_cdf(z.abs()))
|
||
}
|
||
|
||
/// Standard normal CDF approximation
|
||
fn standard_normal_cdf(&self, x: f64) -> f64 {
|
||
// Abramowitz and Stegun approximation
|
||
let a1 = 0.254829592;
|
||
let a2 = -0.284496736;
|
||
let a3 = 1.421413741;
|
||
let a4 = -1.453152027;
|
||
let a5 = 1.061405429;
|
||
let p = 0.3275911;
|
||
|
||
let sign = if x < 0.0 { -1.0 } else { 1.0 };
|
||
let x = x.abs();
|
||
|
||
let t = 1.0 / (1.0 + p * x);
|
||
let y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * (-x * x / 2.0).exp();
|
||
|
||
0.5 * (1.0 + sign * y)
|
||
}
|
||
|
||
/// Perform power analysis
|
||
fn power_analysis(&self, sample_a: &[f64], sample_b: &[f64], alpha: f64) -> f64 {
|
||
let mean_a = sample_a.iter().sum::<f64>() / sample_a.len() as f64;
|
||
let mean_b = sample_b.iter().sum::<f64>() / sample_b.len() as f64;
|
||
|
||
let var_a = sample_a.iter()
|
||
.map(|x| (x - mean_a).powi(2))
|
||
.sum::<f64>() / (sample_a.len() - 1) as f64;
|
||
let var_b = sample_b.iter()
|
||
.map(|x| (x - mean_b).powi(2))
|
||
.sum::<f64>() / (sample_b.len() - 1) as f64;
|
||
|
||
let pooled_var = (var_a + var_b) / 2.0;
|
||
let effect_size = (mean_a - mean_b).abs() / pooled_var.sqrt();
|
||
|
||
let n = sample_a.len().min(sample_b.len()) as f64;
|
||
let delta = effect_size * (n / 2.0).sqrt();
|
||
|
||
// Critical value for two-tailed test
|
||
let z_alpha = self.inverse_normal_cdf(1.0 - alpha / 2.0);
|
||
let z_beta = delta - z_alpha;
|
||
|
||
self.standard_normal_cdf(z_beta)
|
||
}
|
||
|
||
/// Inverse normal CDF (simplified)
|
||
fn inverse_normal_cdf(&self, p: f64) -> f64 {
|
||
// Simplified approximation - Beasley-Springer-Moro algorithm
|
||
let a = vec![
|
||
-3.969683028665376e+01,
|
||
2.209460984245205e+02,
|
||
-2.759285104469687e+02,
|
||
1.383577518672690e+02,
|
||
-3.066479806614716e+01,
|
||
2.506628277459239e+00,
|
||
];
|
||
|
||
let b = vec![
|
||
-5.447609879822406e+01,
|
||
1.615858368580409e+02,
|
||
-1.556989798598866e+02,
|
||
6.680131188771972e+01,
|
||
-1.328068155288572e+01,
|
||
];
|
||
|
||
let c = vec![
|
||
-7.784894002430293e-03,
|
||
-3.223964580411365e-01,
|
||
-2.400758277161838e+00,
|
||
-2.549732539343734e+00,
|
||
4.374664141464968e+00,
|
||
2.938163982698783e+00,
|
||
];
|
||
|
||
let d = vec![
|
||
7.784695709041462e-03,
|
||
3.224671290700398e-01,
|
||
2.445134137142996e+00,
|
||
3.754408661907416e+00,
|
||
];
|
||
|
||
let p_low = 0.02425;
|
||
let p_high = 1.0 - p_low;
|
||
|
||
if p < p_low {
|
||
let q = (-2.0 * p.ln()).sqrt();
|
||
return (((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) /
|
||
((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1.0);
|
||
} else if p <= p_high {
|
||
let q = p - 0.5;
|
||
let r = q * q;
|
||
return (((((a[0] * r + a[1]) * r + a[2]) * r + a[3]) * r + a[4]) * r + a[5]) * q /
|
||
(((((b[0] * r + b[1]) * r + b[2]) * r + b[3]) * r + b[4]) * r + 1.0);
|
||
} else {
|
||
let q = (-2.0 * (1.0 - p).ln()).sqrt();
|
||
return -(((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) /
|
||
((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1.0);
|
||
}
|
||
}
|
||
|
||
/// Generate comprehensive statistical report
|
||
fn generate_statistical_report(
|
||
&self,
|
||
sample_a: &[f64],
|
||
sample_b: &[f64],
|
||
tests: &[StatisticalTestResult],
|
||
effect_sizes: &[(String, f64, EffectSize)],
|
||
bootstrap_ci: (f64, f64),
|
||
power: f64,
|
||
) -> String {
|
||
let mut report = String::new();
|
||
report.push_str("# Statistical Analysis Report: System A vs System B\n\n");
|
||
|
||
// Sample statistics
|
||
let mean_a = sample_a.iter().sum::<f64>() / sample_a.len() as f64;
|
||
let mean_b = sample_b.iter().sum::<f64>() / sample_b.len() as f64;
|
||
let std_a = (sample_a.iter().map(|x| (x - mean_a).powi(2)).sum::<f64>() / (sample_a.len() - 1) as f64).sqrt();
|
||
let std_b = (sample_b.iter().map(|x| (x - mean_b).powi(2)).sum::<f64>() / (sample_b.len() - 1) as f64).sqrt();
|
||
|
||
report.push_str("## Descriptive Statistics\n\n");
|
||
report.push_str("| System | N | Mean (μs) | Std Dev (μs) | Min (μs) | Max (μs) |\n");
|
||
report.push_str("|--------|---|-----------|--------------|----------|----------|\n");
|
||
report.push_str(&format!("| System A | {} | {:.3} | {:.3} | {:.3} | {:.3} |\n",
|
||
sample_a.len(), mean_a, std_a,
|
||
sample_a.iter().fold(f64::INFINITY, |acc, &x| acc.min(x)),
|
||
sample_a.iter().fold(f64::NEG_INFINITY, |acc, &x| acc.max(x))
|
||
));
|
||
report.push_str(&format!("| System B | {} | {:.3} | {:.3} | {:.3} | {:.3} |\n\n",
|
||
sample_b.len(), mean_b, std_b,
|
||
sample_b.iter().fold(f64::INFINITY, |acc, &x| acc.min(x)),
|
||
sample_b.iter().fold(f64::NEG_INFINITY, |acc, &x| acc.max(x))
|
||
));
|
||
|
||
// Statistical tests
|
||
report.push_str("## Statistical Tests\n\n");
|
||
for test in tests {
|
||
report.push_str(&format!("### {}\n\n", test.test_name));
|
||
report.push_str("| Metric | Value |\n|--------|-------|\n");
|
||
report.push_str(&format!("| Test Statistic | {:.4} |\n", test.test_statistic));
|
||
report.push_str(&format!("| p-value | {:.6} |\n", test.p_value));
|
||
report.push_str(&format!("| Significant (α=0.05) | {} |\n", if test.significant { "Yes ✅" } else { "No ❌" }));
|
||
if test.confidence_interval.0 != 0.0 || test.confidence_interval.1 != 0.0 {
|
||
report.push_str(&format!("| 95% CI | ({:.3}, {:.3}) |\n", test.confidence_interval.0, test.confidence_interval.1));
|
||
}
|
||
report.push_str(&format!("| Interpretation | {} |\n\n", test.interpretation));
|
||
}
|
||
|
||
// Bootstrap confidence interval
|
||
report.push_str("## Bootstrap Analysis\n\n");
|
||
report.push_str(&format!("**Bootstrap 95% Confidence Interval for Difference in Means:**\n"));
|
||
report.push_str(&format!("({:.3}, {:.3}) μs\n\n", bootstrap_ci.0, bootstrap_ci.1));
|
||
|
||
// Effect sizes
|
||
report.push_str("## Effect Size Analysis\n\n");
|
||
report.push_str("| Measure | Value | Magnitude | Interpretation |\n");
|
||
report.push_str("|---------|-------|-----------|----------------|\n");
|
||
for (name, value, magnitude) in effect_sizes {
|
||
let magnitude_str = match magnitude {
|
||
EffectSize::Negligible => "Negligible",
|
||
EffectSize::Small => "Small",
|
||
EffectSize::Medium => "Medium",
|
||
EffectSize::Large => "Large",
|
||
};
|
||
let interpretation = match name.as_str() {
|
||
"Cohen's d" => "Standardized mean difference",
|
||
"Glass's Δ" => "Mean difference in control group SD units",
|
||
"Hedge's g" => "Bias-corrected Cohen's d",
|
||
"CLES" => "Probability that System A > System B",
|
||
_ => "Effect size measure",
|
||
};
|
||
report.push_str(&format!("| {} | {:.4} | {} | {} |\n", name, value, magnitude_str, interpretation));
|
||
}
|
||
|
||
// Power analysis
|
||
report.push_str(&format!("\n## Power Analysis\n\n"));
|
||
report.push_str(&format!("**Statistical Power:** {:.3} ({:.1}%)\n\n", power, power * 100.0));
|
||
|
||
if power < 0.8 {
|
||
report.push_str("⚠️ **Warning:** Statistical power is below the conventional threshold of 0.8. Consider increasing sample size for more reliable results.\n\n");
|
||
} else {
|
||
report.push_str("✅ **Good:** Statistical power exceeds the conventional threshold of 0.8.\n\n");
|
||
}
|
||
|
||
// Summary and conclusions
|
||
report.push_str("## Summary and Conclusions\n\n");
|
||
|
||
let significant_tests = tests.iter().filter(|t| t.significant).count();
|
||
let total_tests = tests.len();
|
||
|
||
report.push_str(&format!("**Statistical Significance:** {}/{} tests show significant differences (p < 0.05)\n\n", significant_tests, total_tests));
|
||
|
||
let largest_effect = effect_sizes.iter()
|
||
.filter(|(name, _, _)| name == "Cohen's d")
|
||
.map(|(_, value, _)| value.abs())
|
||
.next()
|
||
.unwrap_or(0.0);
|
||
|
||
if significant_tests > 0 && largest_effect > 0.5 {
|
||
report.push_str("🎉 **Conclusion:** The statistical analysis provides strong evidence for a significant and meaningful performance difference between System A and System B. The effect size is moderate to large, indicating practical significance beyond statistical significance.\n\n");
|
||
} else if significant_tests > 0 {
|
||
report.push_str("📊 **Conclusion:** There is statistical evidence for a difference between systems, but the effect size suggests the practical impact may be limited.\n\n");
|
||
} else {
|
||
report.push_str("📈 **Conclusion:** No statistically significant differences were detected between the systems at the α = 0.05 level.\n\n");
|
||
}
|
||
|
||
// Recommendations
|
||
report.push_str("## Recommendations\n\n");
|
||
|
||
if power < 0.8 {
|
||
let recommended_n = ((1.96 + 0.84).powi(2) * (std_a.powi(2) + std_b.powi(2)) / (mean_a - mean_b).powi(2)).ceil() as usize;
|
||
report.push_str(&format!("1. **Sample Size:** Consider increasing sample size to approximately {} per group for 80% power.\n", recommended_n));
|
||
}
|
||
|
||
if largest_effect > 0.8 {
|
||
report.push_str("2. **Effect Size:** The large effect size suggests this is a practically meaningful difference worth investigating further.\n");
|
||
}
|
||
|
||
if significant_tests == total_tests {
|
||
report.push_str("3. **Consistency:** All statistical tests agree on significance, providing strong evidence for the observed difference.\n");
|
||
}
|
||
|
||
report.push_str(&format!("\n---\n*Generated from {} samples per system using rigorous statistical methods.*", sample_a.len()));
|
||
|
||
report
|
||
}
|
||
}
|
||
|
||
/// Main statistical analysis benchmark
|
||
fn bench_statistical_analysis(c: &mut Criterion) {
|
||
let rt = tokio::runtime::Runtime::new().unwrap();
|
||
|
||
rt.block_on(async {
|
||
let mut context = StatisticalAnalysisContext::new()
|
||
.expect("Failed to create statistical analysis context");
|
||
|
||
println!("Running comprehensive statistical analysis...");
|
||
|
||
// Collect latency samples
|
||
let (system_a_latencies, system_b_latencies) = context.collect_latency_samples()
|
||
.expect("Failed to collect samples");
|
||
|
||
// Perform statistical tests
|
||
let mut tests = Vec::new();
|
||
|
||
// Paired t-test
|
||
let t_test = context.paired_t_test(&system_a_latencies, &system_b_latencies);
|
||
tests.push(t_test);
|
||
|
||
// Mann-Whitney U test
|
||
let u_test = context.mann_whitney_u_test(&system_a_latencies, &system_b_latencies);
|
||
tests.push(u_test);
|
||
|
||
// Bootstrap confidence interval
|
||
let bootstrap_ci = context.bootstrap_confidence_interval(&system_a_latencies, &system_b_latencies, 10000);
|
||
|
||
// Effect size calculations
|
||
let effect_sizes = context.calculate_effect_sizes(&system_a_latencies, &system_b_latencies);
|
||
|
||
// Power analysis
|
||
let power = context.power_analysis(&system_a_latencies, &system_b_latencies, 0.05);
|
||
|
||
// Generate comprehensive report
|
||
let report = context.generate_statistical_report(
|
||
&system_a_latencies,
|
||
&system_b_latencies,
|
||
&tests,
|
||
&effect_sizes,
|
||
bootstrap_ci,
|
||
power,
|
||
);
|
||
|
||
std::fs::write("statistical_analysis_report.md", report)
|
||
.expect("Failed to save statistical analysis report");
|
||
|
||
println!("✅ Statistical analysis completed!");
|
||
println!("📊 Report saved to: statistical_analysis_report.md");
|
||
});
|
||
}
|
||
|
||
criterion_group!(
|
||
name = statistical_benches;
|
||
config = Criterion::default()
|
||
.sample_size(10)
|
||
.measurement_time(Duration::from_secs(180)) // 3 minutes for comprehensive analysis
|
||
.warm_up_time(Duration::from_secs(30));
|
||
targets = bench_statistical_analysis
|
||
);
|
||
criterion_main!(statistical_benches); |