//! Demonstration of performance optimizations in ruvector-scipix //! //! This example shows how to use various optimization features: //! - SIMD operations for image processing //! - Parallel batch processing //! - Memory pooling //! - Model quantization //! - Dynamic batching use ruvector_scipix::optimize::*; use std::sync::Arc; use std::time::Instant; fn main() { println!("=== Ruvector-Scipix Optimization Demo ===\n"); // 1. Feature Detection demo_feature_detection(); // 2. SIMD Operations demo_simd_operations(); // 3. Parallel Processing demo_parallel_processing(); // 4. Memory Optimizations demo_memory_optimizations(); // 5. Model Quantization demo_quantization(); println!("\n=== Demo Complete ==="); } fn demo_feature_detection() { println!("1. CPU Feature Detection"); println!("------------------------"); let features = detect_features(); println!("AVX2 Support: {}", if features.avx2 { "✓" } else { "✗" }); println!( "AVX-512 Support: {}", if features.avx512f { "✓" } else { "✗" } ); println!("NEON Support: {}", if features.neon { "✓" } else { "✗" }); println!( "SSE4.2 Support: {}", if features.sse4_2 { "✓" } else { "✗" } ); let opt_level = get_opt_level(); println!("Optimization Level: {:?}", opt_level); println!(); } fn demo_simd_operations() { println!("2. SIMD Operations"); println!("------------------"); // Create test image (512x512 RGBA) let size = 512; let rgba: Vec = (0..size * size * 4).map(|i| (i % 256) as u8).collect(); let mut gray = vec![0u8; size * size]; // Benchmark grayscale conversion let iterations = 100; let start = Instant::now(); for _ in 0..iterations { simd::simd_grayscale(&rgba, &mut gray); } let simd_time = start.elapsed(); println!("Grayscale conversion ({} iterations):", iterations); println!( " SIMD: {:?} ({:.2} MP/s)", simd_time, (iterations as f64 * size as f64 * size as f64 / 1_000_000.0) / simd_time.as_secs_f64() ); // Benchmark threshold let mut binary = vec![0u8; size * size]; let start = Instant::now(); for _ in 0..iterations { simd::simd_threshold(&gray, 128, &mut binary); } let threshold_time = start.elapsed(); println!("Threshold operation ({} iterations):", iterations); println!( " SIMD: {:?} ({:.2} MP/s)", threshold_time, (iterations as f64 * size as f64 * size as f64 / 1_000_000.0) / threshold_time.as_secs_f64() ); // Benchmark normalization let mut data: Vec = (0..8192).map(|i| i as f32).collect(); let start = Instant::now(); for _ in 0..iterations { simd::simd_normalize(&mut data); } let normalize_time = start.elapsed(); println!("Normalization ({} iterations):", iterations); println!(" SIMD: {:?}", normalize_time); println!(); } fn demo_parallel_processing() { println!("3. Parallel Processing"); println!("----------------------"); let data: Vec = (0..10000).collect(); // Sequential processing let start = Instant::now(); let _seq_result: Vec = data.iter().map(|&x| expensive_computation(x)).collect(); let seq_time = start.elapsed(); // Parallel processing let start = Instant::now(); let _par_result = parallel::parallel_map_chunked(data.clone(), 100, |x| expensive_computation(x)); let par_time = start.elapsed(); println!("Processing 10,000 items:"); println!(" Sequential: {:?}", seq_time); println!(" Parallel: {:?}", par_time); println!( " Speedup: {:.2}x", seq_time.as_secs_f64() / par_time.as_secs_f64() ); let threads = parallel::optimal_thread_count(); println!(" Using {} threads", threads); println!(); } fn expensive_computation(x: i32) -> i32 { // Simulate some work (0..100).fold(x, |acc, i| acc.wrapping_add(i)) } fn demo_memory_optimizations() { println!("4. Memory Optimizations"); println!("-----------------------"); let pools = memory::GlobalPools::get(); // Benchmark buffer pool vs direct allocation let iterations = 10000; // Pooled allocation let start = Instant::now(); for _ in 0..iterations { let mut buf = pools.acquire_small(); buf.extend_from_slice(&[0u8; 512]); } let pooled_time = start.elapsed(); // Direct allocation let start = Instant::now(); for _ in 0..iterations { let mut buf = Vec::with_capacity(1024); buf.extend_from_slice(&[0u8; 512]); } let direct_time = start.elapsed(); println!("Buffer allocation ({} iterations):", iterations); println!(" Pooled: {:?}", pooled_time); println!(" Direct: {:?}", direct_time); println!( " Speedup: {:.2}x", direct_time.as_secs_f64() / pooled_time.as_secs_f64() ); // Arena allocation let mut arena = memory::Arena::with_capacity(1024 * 1024); let start = Instant::now(); for _ in 0..iterations { arena.reset(); for _ in 0..10 { let _slice = arena.alloc(1024, 8); } } let arena_time = start.elapsed(); println!( "\nArena allocation ({} iterations, 10 allocs each):", iterations ); println!(" Time: {:?}", arena_time); println!(); } fn demo_quantization() { println!("5. Model Quantization"); println!("---------------------"); // Create model weights let size = 100_000; let weights: Vec = (0..size) .map(|i| ((i as f32 / size as f32) * 2.0 - 1.0)) .collect(); println!( "Original model: {} weights ({:.2} MB)", weights.len(), (weights.len() * std::mem::size_of::()) as f64 / 1_048_576.0 ); // Quantize let start = Instant::now(); let (quantized, params) = quantize::quantize_weights(&weights); let quant_time = start.elapsed(); println!( "Quantized: {} weights ({:.2} MB)", quantized.len(), (quantized.len() * std::mem::size_of::()) as f64 / 1_048_576.0 ); println!( "Compression: {:.2}x", (weights.len() * std::mem::size_of::()) as f64 / (quantized.len() * std::mem::size_of::()) as f64 ); println!("Quantization time: {:?}", quant_time); // Check quality let error = quantize::quantization_error(&weights, &quantized, params); let snr = quantize::sqnr(&weights, &quantized, params); println!("Quality metrics:"); println!(" MSE: {:.6}", error); println!(" SQNR: {:.2} dB", snr); // Benchmark dequantization let iterations = 100; let start = Instant::now(); for _ in 0..iterations { let _restored = quantize::dequantize(&quantized, params); } let dequant_time = start.elapsed(); println!( "Dequantization ({} iterations): {:?}", iterations, dequant_time ); // Per-channel quantization let weights_2d: Vec = (0..10_000).map(|i| i as f32).collect(); let shape = vec![100, 100]; // 100 channels, 100 values each let start = Instant::now(); let per_channel = quantize::PerChannelQuant::from_f32(&weights_2d, shape); let per_channel_time = start.elapsed(); println!("\nPer-channel quantization:"); println!(" Channels: {}", per_channel.params.len()); println!(" Time: {:?}", per_channel_time); println!(); } // Async batching demo (would need tokio runtime) #[allow(dead_code)] async fn demo_batching() { println!("6. Dynamic Batching"); println!("-------------------"); use batch::{BatchConfig, DynamicBatcher}; let config = BatchConfig { max_batch_size: 32, max_wait_ms: 50, max_queue_size: 1000, preferred_batch_size: 16, }; let batcher = Arc::new(DynamicBatcher::new(config, |items: Vec| { // Simulate batch processing items.into_iter().map(|x| Ok(x * 2)).collect() })); // Start processing loop let batcher_clone = batcher.clone(); tokio::spawn(async move { batcher_clone.run().await; }); // Add items let mut handles = vec![]; for i in 0..100 { let batcher = batcher.clone(); handles.push(tokio::spawn(async move { batcher.add(i).await })); } // Wait for results for handle in handles { let _ = handle.await; } let stats = batcher.stats().await; println!("Queue size: {}", stats.queue_size); println!("Max wait: {:?}", stats.max_wait_time); batcher.shutdown().await; }