//! SMILES (Simplified Molecular Input Line Entry System) generator //! //! Converts chemical structure representations to SMILES notation. //! This is a simplified implementation - full chemistry support requires //! dedicated chemistry libraries like RDKit or OpenBabel. use super::OcrResult; /// SMILES notation generator for chemical structures pub struct SmilesGenerator { canonical: bool, include_stereochemistry: bool, } impl SmilesGenerator { pub fn new() -> Self { Self { canonical: true, include_stereochemistry: true, } } pub fn canonical(mut self, canonical: bool) -> Self { self.canonical = canonical; self } pub fn stereochemistry(mut self, include: bool) -> Self { self.include_stereochemistry = include; self } /// Generate SMILES from OCR result pub fn generate_from_result(&self, result: &OcrResult) -> Result { // Check if SMILES already available if let Some(smiles) = &result.formats.smiles { return Ok(smiles.clone()); } // Check for chemistry-related content in line data if let Some(line_data) = &result.line_data { for line in line_data { if line.line_type == "chemistry" || line.line_type == "molecule" { return self.parse_chemical_notation(&line.text); } } } Err("No chemical structure data found".to_string()) } /// Parse chemical notation to SMILES /// This is a placeholder - real implementation needs chemistry parsing fn parse_chemical_notation(&self, notation: &str) -> Result { // Check if already SMILES format if self.is_smiles(notation) { return Ok(notation.to_string()); } // Try to parse common chemical formulas if let Some(smiles) = self.simple_formula_to_smiles(notation) { return Ok(smiles); } Err(format!("Cannot convert '{}' to SMILES", notation)) } /// Check if string is already SMILES notation fn is_smiles(&self, s: &str) -> bool { // Basic SMILES characters let smiles_chars = "CNOPSFClBrI[]()=#@+-0123456789cnops"; s.chars().all(|c| smiles_chars.contains(c)) } /// Convert simple chemical formulas to SMILES fn simple_formula_to_smiles(&self, formula: &str) -> Option { // Common chemical formulas match formula.trim() { "H2O" | "water" => Some("O".to_string()), "CO2" | "carbon dioxide" => Some("O=C=O".to_string()), "CH4" | "methane" => Some("C".to_string()), "C2H6" | "ethane" => Some("CC".to_string()), "C2H5OH" | "ethanol" => Some("CCO".to_string()), "CH3COOH" | "acetic acid" => Some("CC(=O)O".to_string()), "C6H6" | "benzene" => Some("c1ccccc1".to_string()), "C6H12O6" | "glucose" => Some("OC[C@H]1OC(O)[C@H](O)[C@@H](O)[C@@H]1O".to_string()), "NH3" | "ammonia" => Some("N".to_string()), "H2SO4" | "sulfuric acid" => Some("OS(=O)(=O)O".to_string()), "NaCl" | "sodium chloride" => Some("[Na+].[Cl-]".to_string()), _ => None, } } /// Validate SMILES notation pub fn validate(&self, smiles: &str) -> Result<(), String> { // Basic validation checks // Check parentheses balance let mut depth = 0; for c in smiles.chars() { match c { '(' => depth += 1, ')' => { depth -= 1; if depth < 0 { return Err("Unbalanced parentheses".to_string()); } } _ => {} } } if depth != 0 { return Err("Unbalanced parentheses".to_string()); } // Check brackets balance let mut depth = 0; for c in smiles.chars() { match c { '[' => depth += 1, ']' => { depth -= 1; if depth < 0 { return Err("Unbalanced brackets".to_string()); } } _ => {} } } if depth != 0 { return Err("Unbalanced brackets".to_string()); } Ok(()) } /// Convert SMILES to molecular formula pub fn to_molecular_formula(&self, smiles: &str) -> Result { self.validate(smiles)?; // Simplified formula extraction // Real implementation would parse the SMILES properly let mut counts: std::collections::HashMap = std::collections::HashMap::new(); for c in smiles.chars() { if c.is_alphabetic() && c.is_uppercase() { *counts.entry(c).or_insert(0) += 1; } } let mut formula = String::new(); // Only use single-character elements for simplicity for element in &['C', 'H', 'N', 'O', 'S', 'P', 'F'] { if let Some(&count) = counts.get(element) { formula.push(*element); if count > 1 { formula.push_str(&count.to_string()); } } } if formula.is_empty() { Err("Could not determine molecular formula".to_string()) } else { Ok(formula) } } /// Calculate molecular weight (approximate) pub fn molecular_weight(&self, smiles: &str) -> Result { self.validate(smiles)?; // Simplified atomic weights let weights: std::collections::HashMap = [ ('C', 12.01), ('H', 1.008), ('N', 14.01), ('O', 16.00), ('S', 32.07), ('P', 30.97), ('F', 19.00), ] .iter() .cloned() .collect(); let mut total_weight = 0.0; for c in smiles.chars() { if let Some(&weight) = weights.get(&c) { total_weight += weight; } } Ok(total_weight) } } impl Default for SmilesGenerator { fn default() -> Self { Self::new() } } /// SMILES parser for extracting structure information pub struct SmilesParser; impl SmilesParser { pub fn new() -> Self { Self } /// Count atoms in SMILES notation pub fn count_atoms(&self, smiles: &str) -> std::collections::HashMap { let mut counts = std::collections::HashMap::new(); let mut i = 0; let chars: Vec = smiles.chars().collect(); while i < chars.len() { if chars[i].is_uppercase() { let mut atom = String::from(chars[i]); // Check for two-letter atoms (Cl, Br, etc.) if i + 1 < chars.len() && chars[i + 1].is_lowercase() { atom.push(chars[i + 1]); i += 1; } *counts.entry(atom).or_insert(0) += 1; } i += 1; } counts } /// Extract ring information pub fn find_rings(&self, smiles: &str) -> Vec { let mut rings = Vec::new(); for (_i, c) in smiles.chars().enumerate() { if c.is_numeric() { if let Some(digit) = c.to_digit(10) { rings.push(digit as usize); } } } rings } } impl Default for SmilesParser { fn default() -> Self { Self::new() } } #[cfg(test)] mod tests { use super::*; #[test] fn test_is_smiles() { let gen = SmilesGenerator::new(); assert!(gen.is_smiles("CCO")); assert!(gen.is_smiles("c1ccccc1")); assert!(gen.is_smiles("CC(=O)O")); assert!(!gen.is_smiles("not smiles!")); } #[test] fn test_simple_formula_conversion() { let gen = SmilesGenerator::new(); assert_eq!(gen.simple_formula_to_smiles("H2O"), Some("O".to_string())); assert_eq!( gen.simple_formula_to_smiles("CO2"), Some("O=C=O".to_string()) ); assert_eq!(gen.simple_formula_to_smiles("CH4"), Some("C".to_string())); assert_eq!( gen.simple_formula_to_smiles("benzene"), Some("c1ccccc1".to_string()) ); } #[test] fn test_validate_smiles() { let gen = SmilesGenerator::new(); assert!(gen.validate("CCO").is_ok()); assert!(gen.validate("CC(O)C").is_ok()); assert!(gen.validate("c1ccccc1").is_ok()); assert!(gen.validate("CC(O").is_err()); // Unbalanced assert!(gen.validate("CC)O").is_err()); // Unbalanced } #[test] fn test_molecular_formula() { let gen = SmilesGenerator::new(); let formula = gen.to_molecular_formula("CCO").unwrap(); assert!(formula.contains('C')); assert!(formula.contains('O')); } #[test] fn test_molecular_weight() { let gen = SmilesGenerator::new(); // Water: H2O (but SMILES is just "O", representing OH2) let weight = gen.molecular_weight("O").unwrap(); assert!(weight > 0.0); // Ethanol: C2H6O let weight = gen.molecular_weight("CCO").unwrap(); assert!(weight > 30.0); // Should be around 46 } #[test] fn test_count_atoms() { let parser = SmilesParser::new(); let counts = parser.count_atoms("CCO"); assert_eq!(counts.get("C"), Some(&2)); assert_eq!(counts.get("O"), Some(&1)); let counts = parser.count_atoms("CC(=O)O"); assert_eq!(counts.get("C"), Some(&2)); assert_eq!(counts.get("O"), Some(&2)); } #[test] fn test_find_rings() { let parser = SmilesParser::new(); let rings = parser.find_rings("c1ccccc1"); assert_eq!(rings, vec![1, 1]); let rings = parser.find_rings("C1CC1"); assert_eq!(rings, vec![1, 1]); } }