//! Parallel Processing for 256-Tile Fabric
//!
//! This module provides rayon-based parallel processing for the tile fabric,
//! achieving 4-8× throughput improvement on multi-core systems.
//!
//! ## Architecture
//!
//! ```text
//! Syndromes ──┬──► Tile 0-63   ──┐
//!             ├──► Tile 64-127 ──┼──► TileZero Merge ──► Decision
//!             ├──► Tile 128-191 ─┤
//!             └──► Tile 192-255 ─┘
//!                  (parallel)         (parallel reduce)
//! ```
//!
//! ## Usage
//!
//! ```rust,ignore
//! use ruqu::parallel::{ParallelFabric, ParallelConfig};
//!
//! let config = ParallelConfig::default(); // Auto-detect cores
//! let mut fabric = ParallelFabric::new(config)?;
//!
//! // Process syndromes in parallel
//! let decision = fabric.process_parallel(&syndrome_data)?;
//! ```

use std::time::Instant;

#[cfg(feature = "parallel")]
use rayon::prelude::*;

use crate::error::{Result, RuQuError};
use crate::tile::{GateDecision, GateThresholds, SyndromeDelta, TileReport, TileZero, WorkerTile};

/// Configuration for parallel processing
|
||
#[derive(Clone, Debug)]
|
||
pub struct ParallelConfig {
|
||
/// Number of worker threads (0 = auto-detect)
|
||
pub num_threads: usize,
|
||
/// Chunk size for parallel iteration
|
||
pub chunk_size: usize,
|
||
/// Enable work-stealing scheduler
|
||
pub work_stealing: bool,
|
||
/// Tile thresholds
|
||
pub thresholds: GateThresholds,
|
||
}
|
||
|
||
impl Default for ParallelConfig {
|
||
fn default() -> Self {
|
||
Self {
|
||
num_threads: 0, // Auto-detect
|
||
chunk_size: 16, // Process 16 tiles per chunk
|
||
work_stealing: true,
|
||
thresholds: GateThresholds::default(),
|
||
}
|
||
}
|
||
}
|
||
|
||
impl ParallelConfig {
|
||
/// Create config optimized for low latency
|
||
pub fn low_latency() -> Self {
|
||
Self {
|
||
num_threads: 4,
|
||
chunk_size: 64, // Larger chunks = less overhead
|
||
work_stealing: false, // Predictable scheduling
|
||
thresholds: GateThresholds::default(),
|
||
}
|
||
}
|
||
|
||
/// Create config optimized for high throughput
|
||
pub fn high_throughput() -> Self {
|
||
Self {
|
||
num_threads: 0, // Use all cores
|
||
chunk_size: 8, // Smaller chunks = better load balancing
|
||
work_stealing: true,
|
||
thresholds: GateThresholds::default(),
|
||
}
|
||
}
|
||
}
|
||
|
||
/// Parallel fabric for multi-threaded syndrome processing
|
||
pub struct ParallelFabric {
|
||
/// Worker tiles (256 total, indices 1-255 are workers)
|
||
workers: Vec<WorkerTile>,
|
||
/// TileZero coordinator
|
||
coordinator: TileZero,
|
||
/// Configuration
|
||
config: ParallelConfig,
|
||
/// Statistics
|
||
stats: ParallelStats,
|
||
}
|
||
|
||
/// Statistics for parallel processing.
///
/// All counters start at zero (`Default`) and are updated by
/// `ParallelFabric::process_parallel`.
#[derive(Clone, Copy, Debug, Default)]
pub struct ParallelStats {
    /// Total syndromes processed.
    ///
    /// Incremented by 255 (one per worker tile) for every processed batch.
    pub total_processed: u64,
    /// Total parallel batches (one per `process_parallel` call).
    pub batches: u64,
    /// Average batch time (nanoseconds), kept as an integer running mean.
    pub avg_batch_time_ns: u64,
    /// Peak throughput (syndromes/sec) observed for a single batch.
    pub peak_throughput: f64,
}

impl ParallelFabric {
|
||
/// Create a new parallel fabric
|
||
pub fn new(config: ParallelConfig) -> Result<Self> {
|
||
#[cfg(feature = "parallel")]
|
||
{
|
||
// Configure rayon thread pool if needed
|
||
if config.num_threads > 0 {
|
||
rayon::ThreadPoolBuilder::new()
|
||
.num_threads(config.num_threads)
|
||
.build_global()
|
||
.ok(); // Ignore if already initialized
|
||
}
|
||
}
|
||
|
||
// Create 255 worker tiles (1-255)
|
||
let workers: Vec<WorkerTile> = (1..=255u8).map(WorkerTile::new).collect();
|
||
|
||
let coordinator = TileZero::with_random_key(config.thresholds.clone());
|
||
|
||
Ok(Self {
|
||
workers,
|
||
coordinator,
|
||
config,
|
||
stats: ParallelStats::default(),
|
||
})
|
||
}
|
||
|
||
/// Process a syndrome batch in parallel
|
||
#[cfg(feature = "parallel")]
|
||
pub fn process_parallel(&mut self, syndrome: &SyndromeDelta) -> Result<GateDecision> {
|
||
use std::time::Instant;
|
||
let start = Instant::now();
|
||
|
||
// Process all workers in parallel
|
||
let reports: Vec<TileReport> = self
|
||
.workers
|
||
.par_iter_mut()
|
||
.with_min_len(self.config.chunk_size)
|
||
.map(|worker| worker.tick(syndrome))
|
||
.collect();
|
||
|
||
// Merge reports (single-threaded at coordinator)
|
||
let decision = self.coordinator.merge_reports(reports);
|
||
|
||
// Update stats
|
||
let elapsed_ns = start.elapsed().as_nanos() as u64;
|
||
self.stats.total_processed += 255;
|
||
self.stats.batches += 1;
|
||
self.stats.avg_batch_time_ns = (self.stats.avg_batch_time_ns * (self.stats.batches - 1)
|
||
+ elapsed_ns)
|
||
/ self.stats.batches;
|
||
|
||
let throughput = 255.0 / (elapsed_ns as f64 / 1_000_000_000.0);
|
||
if throughput > self.stats.peak_throughput {
|
||
self.stats.peak_throughput = throughput;
|
||
}
|
||
|
||
Ok(decision)
|
||
}
|
||
|
||
/// Process a syndrome batch (fallback for non-parallel builds)
|
||
#[cfg(not(feature = "parallel"))]
|
||
pub fn process_parallel(&mut self, syndrome: &SyndromeDelta) -> Result<GateDecision> {
|
||
use std::time::Instant;
|
||
let start = Instant::now();
|
||
|
||
// Process all workers sequentially
|
||
let reports: Vec<TileReport> = self
|
||
.workers
|
||
.iter_mut()
|
||
.map(|worker| worker.tick(syndrome))
|
||
.collect();
|
||
|
||
// Merge reports
|
||
let decision = self.coordinator.merge_reports(reports);
|
||
|
||
// Update stats
|
||
let elapsed_ns = start.elapsed().as_nanos() as u64;
|
||
self.stats.total_processed += 255;
|
||
self.stats.batches += 1;
|
||
self.stats.avg_batch_time_ns = (self.stats.avg_batch_time_ns * (self.stats.batches - 1)
|
||
+ elapsed_ns)
|
||
/ self.stats.batches;
|
||
|
||
Ok(decision)
|
||
}
|
||
|
||
/// Process multiple syndromes in parallel (batch mode)
|
||
#[cfg(feature = "parallel")]
|
||
pub fn process_batch(&mut self, syndromes: &[SyndromeDelta]) -> Result<Vec<GateDecision>> {
|
||
// Process each syndrome, parallelizing across tiles within each
|
||
let decisions: Vec<GateDecision> = syndromes
|
||
.iter()
|
||
.map(|s| self.process_parallel(s).unwrap_or(GateDecision::Defer))
|
||
.collect();
|
||
|
||
Ok(decisions)
|
||
}
|
||
|
||
/// Process multiple syndromes (fallback)
|
||
#[cfg(not(feature = "parallel"))]
|
||
pub fn process_batch(&mut self, syndromes: &[SyndromeDelta]) -> Result<Vec<GateDecision>> {
|
||
let decisions: Vec<GateDecision> = syndromes
|
||
.iter()
|
||
.map(|s| self.process_parallel(s).unwrap_or(GateDecision::Defer))
|
||
.collect();
|
||
|
||
Ok(decisions)
|
||
}
|
||
|
||
/// Get processing statistics
|
||
pub fn stats(&self) -> &ParallelStats {
|
||
&self.stats
|
||
}
|
||
|
||
/// Reset statistics
|
||
pub fn reset_stats(&mut self) {
|
||
self.stats = ParallelStats::default();
|
||
}
|
||
|
||
/// Get the coordinator for direct access
|
||
pub fn coordinator(&self) -> &TileZero {
|
||
&self.coordinator
|
||
}
|
||
|
||
/// Get mutable coordinator
|
||
pub fn coordinator_mut(&mut self) -> &mut TileZero {
|
||
&mut self.coordinator
|
||
}
|
||
}
|
||
|
||
/// Parallel reduce for aggregating tile reports.
///
/// Returns `(min_cut, max_shift, e_aggregate)`:
///
/// * `min_cut` — smallest strictly positive `local_cut`; `f64::MAX` when no
///   report has a positive cut.
/// * `max_shift` — largest `shift_score`, never below `0.0`.
/// * `e_aggregate` — geometric mean of the `e_value`s (each clamped to at
///   least `1e-10`), computed through a sum of base-2 logarithms.
///
/// An empty slice yields `(f64::MAX, 0.0, 1.0)`.
#[cfg(feature = "parallel")]
pub fn parallel_aggregate(reports: &[TileReport]) -> (f64, f64, f64) {
    use rayon::prelude::*;

    if reports.is_empty() {
        return (f64::MAX, 0.0, 1.0);
    }

    // One traversal computes all three partial results instead of three
    // separate parallel passes over the same slice.
    let (min_cut, max_shift, log_sum) = reports
        .par_iter()
        .map(|r| {
            // Non-positive cuts carry no information and must not win the min.
            let cut = if r.local_cut > 0.0 {
                r.local_cut
            } else {
                f64::MAX
            };
            (cut, r.shift_score, f64::log2(r.e_value.max(1e-10)))
        })
        .reduce(
            || (f64::MAX, 0.0, 0.0),
            |a, b| (a.0.min(b.0), a.1.max(b.1), a.2 + b.2),
        );

    let e_aggregate = f64::exp2(log_sum / reports.len() as f64);

    (min_cut, max_shift, e_aggregate)
}

/// Sequential aggregate (fallback)
|
||
#[cfg(not(feature = "parallel"))]
|
||
pub fn parallel_aggregate(reports: &[TileReport]) -> (f64, f64, f64) {
|
||
if reports.is_empty() {
|
||
return (f64::MAX, 0.0, 1.0);
|
||
}
|
||
|
||
let mut min_cut = f64::MAX;
|
||
let mut max_shift = 0.0;
|
||
let mut log_sum = 0.0;
|
||
|
||
for r in reports {
|
||
if r.local_cut > 0.0 && r.local_cut < min_cut {
|
||
min_cut = r.local_cut;
|
||
}
|
||
if r.shift_score > max_shift {
|
||
max_shift = r.shift_score;
|
||
}
|
||
log_sum += f64::log2(r.e_value.max(1e-10));
|
||
}
|
||
|
||
let e_aggregate = f64::exp2(log_sum / reports.len() as f64);
|
||
(min_cut, max_shift, e_aggregate)
|
||
}
|
||
|
||
#[cfg(test)]
mod tests {
    use super::*;

    /// The default config auto-detects threads and enables work stealing.
    #[test]
    fn test_parallel_config_default() {
        let config = ParallelConfig::default();
        assert_eq!(config.num_threads, 0);
        assert!(config.work_stealing);
    }

    /// A freshly built fabric owns one worker per tile id 1..=255.
    #[test]
    fn test_parallel_fabric_creation() {
        let config = ParallelConfig::default();
        let fabric = ParallelFabric::new(config);
        assert!(fabric.is_ok());

        let fabric = fabric.unwrap();
        assert_eq!(fabric.workers.len(), 255);
    }

    /// Processing a single syndrome succeeds and is counted as one batch.
    #[test]
    fn test_parallel_process() {
        let config = ParallelConfig::default();
        let mut fabric = ParallelFabric::new(config).unwrap();

        let syndrome = SyndromeDelta::new(1, 2, 100);
        let decision = fabric.process_parallel(&syndrome);

        assert!(decision.is_ok());
        assert_eq!(fabric.stats().batches, 1);
        assert_eq!(fabric.stats().total_processed, 255);
    }

    /// Batch mode yields one decision per input syndrome.
    #[test]
    fn test_process_batch_yields_one_decision_per_syndrome() {
        let config = ParallelConfig::default();
        let mut fabric = ParallelFabric::new(config).unwrap();

        let syndromes = [SyndromeDelta::new(1, 2, 100), SyndromeDelta::new(1, 2, 100)];
        let decisions = fabric.process_batch(&syndromes).unwrap();

        assert_eq!(decisions.len(), 2);
        assert_eq!(fabric.stats().batches, 2);
    }

    /// Aggregation picks the minimum cut, maximum shift and geometric-mean e-value.
    #[test]
    fn test_parallel_aggregate() {
        let reports: Vec<TileReport> = (1..=10)
            .map(|i| {
                let mut r = TileReport::new(i);
                r.local_cut = i as f64 * 2.0;
                r.shift_score = i as f64 * 0.05;
                r.e_value = 100.0;
                r
            })
            .collect();

        let (min_cut, max_shift, e_agg) = parallel_aggregate(&reports);

        assert_eq!(min_cut, 2.0); // First report has 2.0
        assert!((max_shift - 0.5).abs() < 0.001); // Last report has 0.5
        assert!((e_agg - 100.0).abs() < 0.001); // All have 100.0
    }

    /// With no reports the aggregate falls back to its neutral values.
    #[test]
    fn test_parallel_aggregate_empty() {
        let (min_cut, max_shift, e_agg) = parallel_aggregate(&[]);

        assert_eq!(min_cut, f64::MAX);
        assert_eq!(max_shift, 0.0);
        assert_eq!(e_agg, 1.0);
    }
}