//! REFRAG Pipeline Benchmark //! //! Measures performance of the Compress-Sense-Expand pipeline. //! //! Run with: cargo run --bin refrag-benchmark --release use refrag_pipeline_example::{ compress::{CompressionStrategy, TensorCompressor}, expand::{ExpandLayer, Projector, ProjectorRegistry}, sense::{LinearPolicy, MLPPolicy, PolicyModel, PolicyNetwork, ThresholdPolicy}, store::RefragStoreBuilder, types::RefragEntry, }; use rand::Rng; use std::time::{Duration, Instant}; fn main() -> anyhow::Result<()> { println!("================================================="); println!(" REFRAG Pipeline Benchmark "); println!("=================================================\n"); // Run all benchmarks benchmark_compression()?; benchmark_policy()?; benchmark_projection()?; benchmark_end_to_end()?; Ok(()) } fn benchmark_compression() -> anyhow::Result<()> { println!("--- Compression Layer Benchmark ---\n"); let dimensions = [384, 768, 1024, 2048, 4096]; let iterations = 10000; println!( "{:>8} | {:>12} | {:>12} | {:>12} | {:>12}", "Dims", "None (us)", "Float16 (us)", "Int8 (us)", "Binary (us)" ); println!("{}", "-".repeat(70)); for dim in dimensions { let mut rng = rand::thread_rng(); let vector: Vec = (0..dim).map(|_| rng.gen_range(-1.0..1.0)).collect(); let strategies = [ CompressionStrategy::None, CompressionStrategy::Float16, CompressionStrategy::Int8, CompressionStrategy::Binary, ]; let mut times = Vec::new(); for strategy in strategies { let compressor = TensorCompressor::new(dim).with_strategy(strategy); let start = Instant::now(); for _ in 0..iterations { let _ = compressor.compress(&vector); } let elapsed = start.elapsed(); times.push(elapsed.as_nanos() as f64 / iterations as f64 / 1000.0); } println!( "{:>8} | {:>12.2} | {:>12.2} | {:>12.2} | {:>12.2}", dim, times[0], times[1], times[2], times[3] ); } println!(); Ok(()) } fn benchmark_policy() -> anyhow::Result<()> { println!("--- Sense Layer (Policy) Benchmark ---\n"); let dimensions = [384, 768, 1024]; let iterations = 100000; println!( "{:>8} | {:>15} | {:>15} | {:>15}", "Dims", "Threshold (us)", "Linear (us)", "MLP-32 (us)" ); println!("{}", "-".repeat(60)); for dim in dimensions { let mut rng = rand::thread_rng(); let chunk: Vec = (0..dim).map(|_| rng.gen_range(-1.0..1.0)).collect(); let query: Vec = (0..dim).map(|_| rng.gen_range(-1.0..1.0)).collect(); // Threshold policy let threshold_policy = ThresholdPolicy::new(0.5); let start = Instant::now(); for _ in 0..iterations { let _ = threshold_policy.decide(&chunk, &query); } let threshold_time = start.elapsed().as_nanos() as f64 / iterations as f64 / 1000.0; // Linear policy let linear_policy = LinearPolicy::new(dim, 0.5); let start = Instant::now(); for _ in 0..iterations { let _ = linear_policy.decide(&chunk, &query); } let linear_time = start.elapsed().as_nanos() as f64 / iterations as f64 / 1000.0; // MLP policy let mlp_policy = MLPPolicy::new(dim, 32, 0.5); let start = Instant::now(); for _ in 0..iterations { let _ = mlp_policy.decide(&chunk, &query); } let mlp_time = start.elapsed().as_nanos() as f64 / iterations as f64 / 1000.0; println!( "{:>8} | {:>15.3} | {:>15.3} | {:>15.3}", dim, threshold_time, linear_time, mlp_time ); } println!(); Ok(()) } fn benchmark_projection() -> anyhow::Result<()> { println!("--- Expand Layer (Projection) Benchmark ---\n"); let projections = [ (768, 4096, "RoBERTa -> LLaMA-8B"), (768, 8192, "RoBERTa -> LLaMA-70B"), (1536, 8192, "OpenAI -> GPT-4"), (4096, 4096, "Identity"), ]; let iterations = 10000; println!( "{:>25} | {:>12} | {:>15}", "Projection", "Time (us)", "Throughput" ); println!("{}", "-".repeat(60)); for (source, target, name) in projections { let mut rng = rand::thread_rng(); let input: Vec = (0..source).map(|_| rng.gen_range(-1.0..1.0)).collect(); let projector = if source == target { Projector::identity(source, "test") } else { Projector::new(source, target, "test") }; let start = Instant::now(); for _ in 0..iterations { let _ = projector.project(&input); } let elapsed = start.elapsed(); let time_us = elapsed.as_nanos() as f64 / iterations as f64 / 1000.0; let throughput = iterations as f64 / elapsed.as_secs_f64(); println!("{:>25} | {:>12.2} | {:>12.0}/s", name, time_us, throughput); } println!(); Ok(()) } fn benchmark_end_to_end() -> anyhow::Result<()> { println!("--- End-to-End Pipeline Benchmark ---\n"); let configs = [ (100, 10, "Small (100 docs, k=10)"), (1000, 10, "Medium (1K docs, k=10)"), (10000, 10, "Large (10K docs, k=10)"), (10000, 100, "Large (10K docs, k=100)"), ]; let search_dim = 384; let tensor_dim = 768; let num_queries = 100; println!( "{:>30} | {:>12} | {:>12} | {:>10}", "Configuration", "Avg (us)", "P99 (us)", "QPS" ); println!("{}", "-".repeat(75)); for (num_docs, k, name) in configs { let store = RefragStoreBuilder::new() .search_dimensions(search_dim) .tensor_dimensions(tensor_dim) .compress_threshold(0.5) .auto_project(false) .build()?; // Insert documents let mut rng = rand::thread_rng(); for i in 0..num_docs { let search_vec: Vec = (0..search_dim).map(|_| rng.gen_range(-1.0..1.0)).collect(); let tensor_vec: Vec = (0..tensor_dim).map(|_| rng.gen_range(-1.0..1.0)).collect(); let tensor_bytes: Vec = tensor_vec.iter().flat_map(|f| f.to_le_bytes()).collect(); let entry = RefragEntry::new(format!("doc_{}", i), search_vec, format!("Text {}", i)) .with_tensor(tensor_bytes, "llama3-8b"); store.insert(entry)?; } // Run queries and collect latencies let mut latencies = Vec::with_capacity(num_queries); for _ in 0..num_queries { let query: Vec = (0..search_dim).map(|_| rng.gen_range(-1.0..1.0)).collect(); let start = Instant::now(); let _ = store.search_hybrid(&query, k, None)?; latencies.push(start.elapsed()); } // Calculate statistics latencies.sort(); let avg_us = latencies.iter().map(|d| d.as_micros()).sum::() as f64 / num_queries as f64; let p99_idx = (num_queries as f64 * 0.99) as usize; let p99_us = latencies[p99_idx.min(num_queries - 1)].as_micros(); let total_time: Duration = latencies.iter().sum(); let qps = num_queries as f64 / total_time.as_secs_f64(); println!( "{:>30} | {:>12.1} | {:>12} | {:>10.0}", name, avg_us, p99_us, qps ); } println!(); // Comparison summary println!("--- Performance Summary ---\n"); println!("REFRAG Pipeline Latency Breakdown:"); println!(" 1. Vector search (HNSW): ~100-500us"); println!(" 2. Policy decision: ~1-50us"); println!(" 3. Tensor decompression: ~1-10us"); println!(" 4. Projection (optional): ~10-100us"); println!(" ----------------------------------------"); println!(" Total per query: ~150-700us"); println!(); println!("Compared to traditional RAG:"); println!(" - Text tokenization: ~1-5ms"); println!(" - LLM context preparation: ~5-20ms"); println!(" - Network transfer (text): ~10-50ms"); println!(" ----------------------------------------"); println!(" Potential speedup: 10-30x\n"); Ok(()) }