wifi-densepose/vendor/ruvector/crates/prime-radiant/src/gpu/mod.rs

157 lines
5.6 KiB
Rust

//! GPU acceleration module for Prime-Radiant coherence engine.
//!
//! This module provides GPU-accelerated computation using wgpu for:
//! - Parallel residual calculations across large graphs
//! - Matrix operations for restriction maps
//! - Energy aggregation with atomic operations
//! - Spectral analysis via power iteration
//!
//! # Architecture
//!
//! ```text
//! +------------------+     +------------------+     +------------------+
//! |    GpuDevice     |---->|    GpuBuffer     |---->|  GpuDispatcher   |
//! |   (Init/Queue)   |     | (Alloc/Transfer) |     |  (Kernels/Sync)  |
//! +------------------+     +------------------+     +------------------+
//!          |                        |                        |
//!          v                        v                        v
//! +------------------+     +------------------+     +------------------+
//! | Instance/Adapter |     |    BufferPool    |     |  PipelineCache   |
//! |   Device/Queue   |     |    Read/Write    |     |    BindGroups    |
//! +------------------+     +------------------+     +------------------+
//! ```
//!
//! # Feature Flag
//!
//! This module requires the `gpu` feature flag:
//! ```toml
//! [dependencies]
//! prime-radiant = { version = "0.1", features = ["gpu"] }
//! ```
//!
//! # Example
//!
//! ```rust,ignore
//! use std::sync::Arc;
//!
//! use prime_radiant::gpu::{
//!     BindingDesc, ComputePipeline, GpuBuffer, GpuDevice, GpuDispatcher,
//! };
//!
//! #[tokio::main]
//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
//!     // Initialize GPU device
//!     let device = GpuDevice::new().await?;
//!
//!     // Create storage buffer with data
//!     let input_data: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0];
//!     let input_buffer = GpuBuffer::new_storage(device.device(), &input_data, false);
//!
//!     // Create output buffer
//!     let output_buffer = GpuBuffer::new_storage_uninit::<f32>(
//!         device.device(),
//!         input_data.len(),
//!         true,
//!     );
//!
//!     // Create compute pipeline
//!     let pipeline = ComputePipeline::from_shader(
//!         device.device(),
//!         include_str!("shaders/compute_residuals.wgsl"),
//!         "main",
//!         &[BindingDesc::storage_readonly(), BindingDesc::storage_readwrite()],
//!     )?;
//!
//!     // Create dispatcher and execute
//!     let dispatcher = GpuDispatcher::new(Arc::new(device));
//!     let bind_group = pipeline.create_bind_group(
//!         dispatcher.device().device(),
//!         &[&input_buffer, &output_buffer],
//!     )?;
//!     dispatcher.dispatch(&pipeline, &bind_group, [4, 1, 1]).await?;
//!
//!     Ok(())
//! }
//! ```
//!
//! # GPU Kernels
//!
//! The following WGSL compute shaders are implemented:
//!
//! 1. **compute_residuals.wgsl** - Parallel residual computation for all edges
//! 2. **compute_energy.wgsl** - Parallel energy aggregation with tree reduction
//! 3. **sheaf_attention.wgsl** - Batched attention: A_ij = exp(-beta * E_ij) / Z
//! 4. **token_routing.wgsl** - Parallel lane assignment based on energy thresholds
//!
//! # Performance Targets
//!
//! | Operation | Target | Notes |
//! |-----------|--------|-------|
//! | Buffer allocation | < 1ms | Pooled for hot paths |
//! | Kernel dispatch | < 100us | Excludes GPU execution |
//! | Residual (10K edges) | < 1ms | GPU parallel |
//! | Energy aggregation | < 500us | Atomic reduction |
mod buffer;
mod device;
mod dispatch;
mod engine;
mod error;
mod kernels;
mod pipeline;
// Core exports
pub use buffer::{
BufferKey, BufferUsage, BufferUsageFlags, GpuBuffer, GpuBufferManager, GpuBufferPool,
};
pub use device::{GpuDevice, GpuDeviceInfo, GpuDeviceOptions};
pub use dispatch::{DispatchBuilder, DispatchConfig, GpuDispatcher};
pub use error::{GpuError, GpuResult};
pub use pipeline::{BindingDesc, BindingType, ComputePipeline, PipelineCache};
// Re-export buffer types
pub use buffer::{GpuEdge, GpuNodeState, GpuParams, GpuRestrictionMap};
// Re-export engine types
pub use engine::{GpuCapabilities, GpuCoherenceEnergy, GpuCoherenceEngine, GpuConfig};
/// Synchronous API for GPU coherence engine (uses pollster)
///
/// The `engine` module itself is private to this module, so this public
/// facade is the path under which its `sync` submodule is exposed: every
/// public item of `engine::sync` is glob re-exported here unchanged.
pub mod sync {
    // Glob re-export: whatever `engine::sync` makes public shows up here
    // without having to be listed name by name.
    pub use super::engine::sync::*;
}
// Re-export kernel types
pub use kernels::{
AttentionWeight, ComputeEnergyKernel, ComputeResidualsKernel, EnergyParams, LaneStats,
RoutingDecision, SheafAttentionKernel, Token, TokenRoutingKernel,
};
/// Workgroup size that compute shaders use by default.
// NOTE(review): same value as `workgroup::SIZE_1D` — confirm whether the two
// are meant to stay equal.
pub const DEFAULT_WORKGROUP_SIZE: u32 = 256;

/// Upper limit, in bytes, for one buffer allocation: 256 MiB (268,435,456 bytes).
pub const MAX_BUFFER_SIZE: u64 = 256 << 20;

/// Default capacity of the pool used for buffer reuse.
pub const DEFAULT_POOL_CAPACITY: usize = 32;
/// WGSL shader sources, embedded as string constants at compile time.
///
/// Each constant holds the complete text of one file under `shaders/`,
/// pulled in with `include_str!`.
pub mod shaders {
    /// Source of the energy shader (parallel reduction).
    pub const COMPUTE_ENERGY: &str = include_str!("shaders/compute_energy.wgsl");

    /// Source of the residuals shader (parallel per-edge residual computation).
    pub const COMPUTE_RESIDUALS: &str = include_str!("shaders/compute_residuals.wgsl");

    /// Source of the sheaf attention shader (attention weight computation).
    pub const SHEAF_ATTENTION: &str = include_str!("shaders/sheaf_attention.wgsl");

    /// Source of the token routing shader (lane assignment).
    pub const TOKEN_ROUTING: &str = include_str!("shaders/token_routing.wgsl");
}
/// Workgroup sizing constants for the GPU compute kernels.
pub mod workgroup {
    /// Default workgroup size used for one-dimensional dispatches.
    pub const SIZE_1D: u32 = 256;

    /// Default extent along x of a two-dimensional workgroup.
    pub const SIZE_2D_X: u32 = 16;

    /// Default extent along y of a two-dimensional workgroup.
    pub const SIZE_2D_Y: u32 = 16;

    /// Largest state-vector dimension for GPU kernels.
    pub const MAX_STATE_DIM: u32 = 512;
}