wifi-densepose/vendor/ruvector/crates/ruvllm/src/quantize/mod.rs

69 lines
1.9 KiB
Rust

//! Quantization Pipeline for RuvLTRA Models
//!
//! This module provides quantization capabilities for converting full-precision
//! models to optimized quantized formats suitable for edge inference on Apple Silicon.
//!
//! ## Supported Quantization Formats
//!
//! | Format | Bits | Memory (0.5B) | Quality | Use Case |
//! |--------|------|---------------|---------|----------|
//! | Q4_K_M | 4.5 | ~300 MB | Good | Best quality/size tradeoff |
//! | Q5_K_M | 5.5 | ~375 MB | Better | Higher quality, still compact |
//! | Q8_0 | 8.5 | ~500 MB | Best | Near-lossless quantization |
//!
//! ## Apple Neural Engine (ANE) Optimization
//!
//! The quantization pipeline produces weights optimized for ANE inference:
//! - 16-byte aligned weight layouts
//! - Blocked quantization compatible with ANE tile operations
//! - Optimized memory access patterns for M4 Pro's unified memory
//!
//! ## Example
//!
//! ```rust,ignore
//! use ruvllm::quantize::{RuvltraQuantizer, QuantConfig, TargetFormat};
//! use std::path::Path;
//!
//! // Create quantizer for Q4_K_M format
//! let config = QuantConfig::default()
//!     .with_format(TargetFormat::Q4_K_M)
//!     .with_ane_optimization(true);
//!
//! let quantizer = RuvltraQuantizer::new(config)?;
//!
//! // Quantize a model
//! quantizer.quantize_model(
//!     Path::new("qwen-0.5b.safetensors"),
//!     Path::new("ruvltra-small-q4.gguf"),
//! )?;
//! ```
mod ruvltra_quant;

// The implementation module above stays private; everything callers need is
// re-exported here, one `pub use` per category so that import sorting cannot
// separate an item from the heading that describes it.

// Core quantizer and its configuration
pub use ruvltra_quant::{QuantConfig, RuvltraQuantizer, TargetFormat};

// Quantization and dequantization functions
pub use ruvltra_quant::{
    dequantize_for_ane, quantize_ruvltra_q4, quantize_ruvltra_q5, quantize_ruvltra_q8,
};

// Block types
pub use ruvltra_quant::{Q4KMBlock, Q5KMBlock, Q8Block};

// Memory estimation
pub use ruvltra_quant::{
    estimate_memory_q4, estimate_memory_q5, estimate_memory_q8, MemoryEstimate,
};

// Progress tracking
pub use ruvltra_quant::{QuantProgress, QuantStats};