wifi-densepose/vendor/ruvector/examples/refrag-pipeline/src/benchmark.rs

258 lines
8.2 KiB
Rust

//! REFRAG Pipeline Benchmark
//!
//! Measures performance of the Compress-Sense-Expand pipeline.
//!
//! Run with: cargo run --bin refrag-benchmark --release
use refrag_pipeline_example::{
compress::{CompressionStrategy, TensorCompressor},
expand::{ExpandLayer, Projector, ProjectorRegistry},
sense::{LinearPolicy, MLPPolicy, PolicyModel, PolicyNetwork, ThresholdPolicy},
store::RefragStoreBuilder,
types::RefragEntry,
};
use rand::Rng;
use std::time::{Duration, Instant};
fn main() -> anyhow::Result<()> {
println!("=================================================");
println!(" REFRAG Pipeline Benchmark ");
println!("=================================================\n");
// Run all benchmarks
benchmark_compression()?;
benchmark_policy()?;
benchmark_projection()?;
benchmark_end_to_end()?;
Ok(())
}
fn benchmark_compression() -> anyhow::Result<()> {
println!("--- Compression Layer Benchmark ---\n");
let dimensions = [384, 768, 1024, 2048, 4096];
let iterations = 10000;
println!(
"{:>8} | {:>12} | {:>12} | {:>12} | {:>12}",
"Dims", "None (us)", "Float16 (us)", "Int8 (us)", "Binary (us)"
);
println!("{}", "-".repeat(70));
for dim in dimensions {
let mut rng = rand::thread_rng();
let vector: Vec<f32> = (0..dim).map(|_| rng.gen_range(-1.0..1.0)).collect();
let strategies = [
CompressionStrategy::None,
CompressionStrategy::Float16,
CompressionStrategy::Int8,
CompressionStrategy::Binary,
];
let mut times = Vec::new();
for strategy in strategies {
let compressor = TensorCompressor::new(dim).with_strategy(strategy);
let start = Instant::now();
for _ in 0..iterations {
let _ = compressor.compress(&vector);
}
let elapsed = start.elapsed();
times.push(elapsed.as_nanos() as f64 / iterations as f64 / 1000.0);
}
println!(
"{:>8} | {:>12.2} | {:>12.2} | {:>12.2} | {:>12.2}",
dim, times[0], times[1], times[2], times[3]
);
}
println!();
Ok(())
}
fn benchmark_policy() -> anyhow::Result<()> {
println!("--- Sense Layer (Policy) Benchmark ---\n");
let dimensions = [384, 768, 1024];
let iterations = 100000;
println!(
"{:>8} | {:>15} | {:>15} | {:>15}",
"Dims", "Threshold (us)", "Linear (us)", "MLP-32 (us)"
);
println!("{}", "-".repeat(60));
for dim in dimensions {
let mut rng = rand::thread_rng();
let chunk: Vec<f32> = (0..dim).map(|_| rng.gen_range(-1.0..1.0)).collect();
let query: Vec<f32> = (0..dim).map(|_| rng.gen_range(-1.0..1.0)).collect();
// Threshold policy
let threshold_policy = ThresholdPolicy::new(0.5);
let start = Instant::now();
for _ in 0..iterations {
let _ = threshold_policy.decide(&chunk, &query);
}
let threshold_time = start.elapsed().as_nanos() as f64 / iterations as f64 / 1000.0;
// Linear policy
let linear_policy = LinearPolicy::new(dim, 0.5);
let start = Instant::now();
for _ in 0..iterations {
let _ = linear_policy.decide(&chunk, &query);
}
let linear_time = start.elapsed().as_nanos() as f64 / iterations as f64 / 1000.0;
// MLP policy
let mlp_policy = MLPPolicy::new(dim, 32, 0.5);
let start = Instant::now();
for _ in 0..iterations {
let _ = mlp_policy.decide(&chunk, &query);
}
let mlp_time = start.elapsed().as_nanos() as f64 / iterations as f64 / 1000.0;
println!(
"{:>8} | {:>15.3} | {:>15.3} | {:>15.3}",
dim, threshold_time, linear_time, mlp_time
);
}
println!();
Ok(())
}
fn benchmark_projection() -> anyhow::Result<()> {
println!("--- Expand Layer (Projection) Benchmark ---\n");
let projections = [
(768, 4096, "RoBERTa -> LLaMA-8B"),
(768, 8192, "RoBERTa -> LLaMA-70B"),
(1536, 8192, "OpenAI -> GPT-4"),
(4096, 4096, "Identity"),
];
let iterations = 10000;
println!(
"{:>25} | {:>12} | {:>15}",
"Projection", "Time (us)", "Throughput"
);
println!("{}", "-".repeat(60));
for (source, target, name) in projections {
let mut rng = rand::thread_rng();
let input: Vec<f32> = (0..source).map(|_| rng.gen_range(-1.0..1.0)).collect();
let projector = if source == target {
Projector::identity(source, "test")
} else {
Projector::new(source, target, "test")
};
let start = Instant::now();
for _ in 0..iterations {
let _ = projector.project(&input);
}
let elapsed = start.elapsed();
let time_us = elapsed.as_nanos() as f64 / iterations as f64 / 1000.0;
let throughput = iterations as f64 / elapsed.as_secs_f64();
println!("{:>25} | {:>12.2} | {:>12.0}/s", name, time_us, throughput);
}
println!();
Ok(())
}
fn benchmark_end_to_end() -> anyhow::Result<()> {
println!("--- End-to-End Pipeline Benchmark ---\n");
let configs = [
(100, 10, "Small (100 docs, k=10)"),
(1000, 10, "Medium (1K docs, k=10)"),
(10000, 10, "Large (10K docs, k=10)"),
(10000, 100, "Large (10K docs, k=100)"),
];
let search_dim = 384;
let tensor_dim = 768;
let num_queries = 100;
println!(
"{:>30} | {:>12} | {:>12} | {:>10}",
"Configuration", "Avg (us)", "P99 (us)", "QPS"
);
println!("{}", "-".repeat(75));
for (num_docs, k, name) in configs {
let store = RefragStoreBuilder::new()
.search_dimensions(search_dim)
.tensor_dimensions(tensor_dim)
.compress_threshold(0.5)
.auto_project(false)
.build()?;
// Insert documents
let mut rng = rand::thread_rng();
for i in 0..num_docs {
let search_vec: Vec<f32> = (0..search_dim).map(|_| rng.gen_range(-1.0..1.0)).collect();
let tensor_vec: Vec<f32> = (0..tensor_dim).map(|_| rng.gen_range(-1.0..1.0)).collect();
let tensor_bytes: Vec<u8> = tensor_vec.iter().flat_map(|f| f.to_le_bytes()).collect();
let entry = RefragEntry::new(format!("doc_{}", i), search_vec, format!("Text {}", i))
.with_tensor(tensor_bytes, "llama3-8b");
store.insert(entry)?;
}
// Run queries and collect latencies
let mut latencies = Vec::with_capacity(num_queries);
for _ in 0..num_queries {
let query: Vec<f32> = (0..search_dim).map(|_| rng.gen_range(-1.0..1.0)).collect();
let start = Instant::now();
let _ = store.search_hybrid(&query, k, None)?;
latencies.push(start.elapsed());
}
// Calculate statistics
latencies.sort();
let avg_us =
latencies.iter().map(|d| d.as_micros()).sum::<u128>() as f64 / num_queries as f64;
let p99_idx = (num_queries as f64 * 0.99) as usize;
let p99_us = latencies[p99_idx.min(num_queries - 1)].as_micros();
let total_time: Duration = latencies.iter().sum();
let qps = num_queries as f64 / total_time.as_secs_f64();
println!(
"{:>30} | {:>12.1} | {:>12} | {:>10.0}",
name, avg_us, p99_us, qps
);
}
println!();
// Comparison summary
println!("--- Performance Summary ---\n");
println!("REFRAG Pipeline Latency Breakdown:");
println!(" 1. Vector search (HNSW): ~100-500us");
println!(" 2. Policy decision: ~1-50us");
println!(" 3. Tensor decompression: ~1-10us");
println!(" 4. Projection (optional): ~10-100us");
println!(" ----------------------------------------");
println!(" Total per query: ~150-700us");
println!();
println!("Compared to traditional RAG:");
println!(" - Text tokenization: ~1-5ms");
println!(" - LLM context preparation: ~5-20ms");
println!(" - Network transfer (text): ~10-50ms");
println!(" ----------------------------------------");
println!(" Potential speedup: 10-30x\n");
Ok(())
}