245 lines
8.0 KiB
Rust
245 lines
8.0 KiB
Rust
//! Model definitions and aliases for the RuvLLM CLI.
//!
//! This module defines the recommended models for different use cases,
//! optimized for a Mac M4 Pro with 36GB of unified memory.
|
|
|
|
use serde::{Deserialize, Serialize};
|
|
use std::collections::HashMap;
|
|
|
|
/// Recommended models for RuvLLM on Mac M4 Pro
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub struct ModelDefinition {
|
|
/// HuggingFace model ID
|
|
pub hf_id: String,
|
|
/// Short alias for CLI
|
|
pub alias: String,
|
|
/// Display name
|
|
pub name: String,
|
|
/// Model architecture (mistral, llama, phi, qwen)
|
|
pub architecture: String,
|
|
/// Parameter count in billions
|
|
pub params_b: f32,
|
|
/// Primary use case
|
|
pub use_case: String,
|
|
/// Recommended quantization
|
|
pub recommended_quant: String,
|
|
/// Estimated memory usage in GB (for recommended quant)
|
|
pub memory_gb: f32,
|
|
/// Context length
|
|
pub context_length: usize,
|
|
/// Notes about the model
|
|
pub notes: String,
|
|
}
|
|
|
|
/// Get all recommended models
|
|
pub fn get_recommended_models() -> Vec<ModelDefinition> {
|
|
vec![
|
|
// Primary reasoning model
|
|
ModelDefinition {
|
|
hf_id: "Qwen/Qwen2.5-14B-Instruct-GGUF".to_string(),
|
|
alias: "qwen".to_string(),
|
|
name: "Qwen2.5-14B-Instruct".to_string(),
|
|
architecture: "qwen".to_string(),
|
|
params_b: 14.0,
|
|
use_case: "Primary reasoning, code generation, complex tasks".to_string(),
|
|
recommended_quant: "Q4_K_M".to_string(),
|
|
memory_gb: 9.5,
|
|
context_length: 32768,
|
|
notes: "Best overall performance for reasoning tasks on M4 Pro".to_string(),
|
|
},
|
|
// Fast instruction following
|
|
ModelDefinition {
|
|
hf_id: "mistralai/Mistral-7B-Instruct-v0.3".to_string(),
|
|
alias: "mistral".to_string(),
|
|
name: "Mistral-7B-Instruct-v0.3".to_string(),
|
|
architecture: "mistral".to_string(),
|
|
params_b: 7.0,
|
|
use_case: "Fast instruction following, general chat".to_string(),
|
|
recommended_quant: "Q4_K_M".to_string(),
|
|
memory_gb: 4.5,
|
|
context_length: 32768,
|
|
notes: "Excellent speed/quality tradeoff with sliding window attention".to_string(),
|
|
},
|
|
// Tiny/testing model
|
|
ModelDefinition {
|
|
hf_id: "microsoft/Phi-4-mini-instruct".to_string(),
|
|
alias: "phi".to_string(),
|
|
name: "Phi-4-mini".to_string(),
|
|
architecture: "phi".to_string(),
|
|
params_b: 3.8,
|
|
use_case: "Testing, quick prototyping, resource-constrained".to_string(),
|
|
recommended_quant: "Q4_K_M".to_string(),
|
|
memory_gb: 2.5,
|
|
context_length: 16384,
|
|
notes: "Surprisingly capable for its size, fast inference".to_string(),
|
|
},
|
|
// Tool use model
|
|
ModelDefinition {
|
|
hf_id: "meta-llama/Llama-3.2-3B-Instruct".to_string(),
|
|
alias: "llama".to_string(),
|
|
name: "Llama-3.2-3B-Instruct".to_string(),
|
|
architecture: "llama".to_string(),
|
|
params_b: 3.2,
|
|
use_case: "Tool use, function calling, structured output".to_string(),
|
|
recommended_quant: "Q4_K_M".to_string(),
|
|
memory_gb: 2.2,
|
|
context_length: 131072,
|
|
notes: "Optimized for tool use and function calling".to_string(),
|
|
},
|
|
// Code-specific model
|
|
ModelDefinition {
|
|
hf_id: "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF".to_string(),
|
|
alias: "qwen-coder".to_string(),
|
|
name: "Qwen2.5-Coder-7B-Instruct".to_string(),
|
|
architecture: "qwen".to_string(),
|
|
params_b: 7.0,
|
|
use_case: "Code generation, code review, debugging".to_string(),
|
|
recommended_quant: "Q4_K_M".to_string(),
|
|
memory_gb: 4.8,
|
|
context_length: 32768,
|
|
notes: "Specialized for coding tasks, excellent at code completion".to_string(),
|
|
},
|
|
// Large reasoning model (for when you have the memory)
|
|
ModelDefinition {
|
|
hf_id: "Qwen/Qwen2.5-32B-Instruct-GGUF".to_string(),
|
|
alias: "qwen-large".to_string(),
|
|
name: "Qwen2.5-32B-Instruct".to_string(),
|
|
architecture: "qwen".to_string(),
|
|
params_b: 32.0,
|
|
use_case: "Complex reasoning, research, highest quality output".to_string(),
|
|
recommended_quant: "Q4_K_M".to_string(),
|
|
memory_gb: 20.0,
|
|
context_length: 32768,
|
|
notes: "Requires significant memory, but provides best quality".to_string(),
|
|
},
|
|
]
|
|
}
|
|
|
|
/// Get model by alias or HF ID
|
|
pub fn get_model(identifier: &str) -> Option<ModelDefinition> {
|
|
let models = get_recommended_models();
|
|
|
|
// First try exact alias match
|
|
if let Some(model) = models.iter().find(|m| m.alias == identifier) {
|
|
return Some(model.clone());
|
|
}
|
|
|
|
// Try HF ID match
|
|
if let Some(model) = models.iter().find(|m| m.hf_id == identifier) {
|
|
return Some(model.clone());
|
|
}
|
|
|
|
// Try partial HF ID match
|
|
if let Some(model) = models.iter().find(|m| m.hf_id.contains(identifier)) {
|
|
return Some(model.clone());
|
|
}
|
|
|
|
None
|
|
}
|
|
|
|
/// Resolve model identifier to HuggingFace ID
|
|
pub fn resolve_model_id(identifier: &str) -> String {
|
|
if let Some(model) = get_model(identifier) {
|
|
model.hf_id
|
|
} else {
|
|
// Assume it's a direct HF model ID
|
|
identifier.to_string()
|
|
}
|
|
}
|
|
|
|
/// Get model aliases map
|
|
pub fn get_aliases() -> HashMap<String, String> {
|
|
get_recommended_models()
|
|
.into_iter()
|
|
.map(|m| (m.alias, m.hf_id))
|
|
.collect()
|
|
}
|
|
|
|
/// Named quantization presets that can be selected for a model.
///
/// The type is `Copy` and supports equality comparison.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum QuantPreset {
    /// 4-bit K-quants: the best quality/size tradeoff.
    Q4K,
    /// 8-bit quantization: higher quality at the cost of more memory.
    Q8,
    /// 16-bit floating point: high quality, most memory of the reduced formats.
    F16,
    /// No quantization at all (full precision).
    None,
}
|
|
|
|
impl QuantPreset {
|
|
/// Parse from string
|
|
pub fn from_str(s: &str) -> Option<Self> {
|
|
match s.to_lowercase().as_str() {
|
|
"q4k" | "q4_k" | "q4_k_m" | "q4" => Some(Self::Q4K),
|
|
"q8" | "q8_0" => Some(Self::Q8),
|
|
"f16" | "fp16" => Some(Self::F16),
|
|
"none" | "f32" | "fp32" => Some(Self::None),
|
|
_ => None,
|
|
}
|
|
}
|
|
|
|
/// Get GGUF file suffix
|
|
pub fn gguf_suffix(&self) -> &'static str {
|
|
match self {
|
|
Self::Q4K => "Q4_K_M.gguf",
|
|
Self::Q8 => "Q8_0.gguf",
|
|
Self::F16 => "F16.gguf",
|
|
Self::None => "F32.gguf",
|
|
}
|
|
}
|
|
|
|
/// Get bytes per weight
|
|
pub fn bytes_per_weight(&self) -> f32 {
|
|
match self {
|
|
Self::Q4K => 0.5,
|
|
Self::Q8 => 1.0,
|
|
Self::F16 => 2.0,
|
|
Self::None => 4.0,
|
|
}
|
|
}
|
|
|
|
/// Estimate memory usage in GB for given parameter count
|
|
pub fn estimate_memory_gb(&self, params_b: f32) -> f32 {
|
|
// Base memory for weights
|
|
let weight_memory = params_b * self.bytes_per_weight();
|
|
// Add overhead for KV cache, activations, etc. (roughly 20%)
|
|
weight_memory * 1.2
|
|
}
|
|
}
|
|
|
|
impl std::fmt::Display for QuantPreset {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
match self {
|
|
Self::Q4K => write!(f, "Q4_K_M"),
|
|
Self::Q8 => write!(f, "Q8_0"),
|
|
Self::F16 => write!(f, "F16"),
|
|
Self::None => write!(f, "F32"),
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// The `qwen` alias selects the 14B Qwen2.5 catalogue entry.
    #[test]
    fn test_get_model_by_alias() {
        let hf_id = get_model("qwen").map(|found| found.hf_id).unwrap();
        assert!(hf_id.contains("Qwen2.5-14B"));
    }

    /// Known aliases resolve to their HF ID; unknown identifiers pass through unchanged.
    #[test]
    fn test_resolve_model_id() {
        let resolved_alias = resolve_model_id("mistral");
        assert!(resolved_alias.contains("Mistral-7B"));

        let passthrough = resolve_model_id("custom/model");
        assert_eq!(passthrough, "custom/model");
    }

    /// Parsing `q4k` yields the 4-bit preset, which stores half a byte per weight.
    #[test]
    fn test_quant_preset() {
        let parsed = QuantPreset::from_str("q4k");
        assert_eq!(parsed, Some(QuantPreset::Q4K));

        let bytes = QuantPreset::Q4K.bytes_per_weight();
        assert_eq!(bytes, 0.5);
    }
}
|