428 lines
16 KiB
Rust
428 lines
16 KiB
Rust
//! DNA Analyzer Demo - RuVector Genomic Analysis Pipeline
|
|
//!
|
|
//! Demonstrates SOTA genomic analysis using:
|
|
//! - Real human gene sequences (HBB, TP53, BRCA1, CYP2D6, INS)
|
|
//! - HNSW k-mer indexing for fast sequence search
|
|
//! - Attention-based sequence alignment
|
|
//! - Variant calling from pileup data
|
|
//! - Protein translation and contact prediction
|
|
//! - Epigenetic age prediction (Horvath clock)
|
|
//! - Pharmacogenomic star allele calling
|
|
//! - RVDNA AI-native file format with pre-computed tensors
|
|
|
|
use ::rvdna::prelude::*;
|
|
use ::rvdna::{
|
|
alignment::{AlignmentConfig, SmithWaterman},
|
|
epigenomics::{HorvathClock, MethylationProfile},
|
|
genotyping, pharma,
|
|
protein::translate_dna,
|
|
real_data,
|
|
rvdna::{
|
|
self, Codec, KmerVectorBlock, RvdnaReader, RvdnaWriter, SparseAttention, VariantTensor,
|
|
},
|
|
variant::{PileupColumn, VariantCaller, VariantCallerConfig},
|
|
};
|
|
use rand::Rng;
|
|
use tracing::{info, Level};
|
|
use tracing_subscriber::FmtSubscriber;
|
|
|
|
fn main() -> anyhow::Result<()> {
|
|
// Check for 23andMe file argument
|
|
let args: Vec<String> = std::env::args().collect();
|
|
if args.len() > 1 {
|
|
return run_23andme(&args[1]);
|
|
}
|
|
|
|
let subscriber = FmtSubscriber::builder()
|
|
.with_max_level(Level::INFO)
|
|
.finish();
|
|
tracing::subscriber::set_global_default(subscriber)?;
|
|
|
|
info!("RuVector DNA Analyzer - Genomic Analysis Pipeline");
|
|
info!("================================================");
|
|
info!("Using real human gene sequences from NCBI RefSeq");
|
|
|
|
// -----------------------------------------------------------------------
|
|
// Stage 1: Load real human gene sequences
|
|
// -----------------------------------------------------------------------
|
|
info!("\nStage 1: Loading real human gene sequences");
|
|
let total_start = std::time::Instant::now();
|
|
|
|
let hbb = DnaSequence::from_str(real_data::HBB_CODING_SEQUENCE)?;
|
|
let tp53 = DnaSequence::from_str(real_data::TP53_EXONS_5_8)?;
|
|
let brca1 = DnaSequence::from_str(real_data::BRCA1_EXON11_FRAGMENT)?;
|
|
let cyp2d6 = DnaSequence::from_str(real_data::CYP2D6_CODING)?;
|
|
let insulin = DnaSequence::from_str(real_data::INS_CODING)?;
|
|
|
|
info!(
|
|
" HBB (hemoglobin beta): {} bp [chr11, sickle cell gene]",
|
|
hbb.len()
|
|
);
|
|
info!(
|
|
" TP53 (tumor suppressor): {} bp [chr17, exons 5-8]",
|
|
tp53.len()
|
|
);
|
|
info!(
|
|
" BRCA1 (DNA repair): {} bp [chr17, exon 11 fragment]",
|
|
brca1.len()
|
|
);
|
|
info!(
|
|
" CYP2D6 (drug metabolism): {} bp [chr22, pharmacogenomic]",
|
|
cyp2d6.len()
|
|
);
|
|
info!(
|
|
" INS (insulin): {} bp [chr11, preproinsulin]",
|
|
insulin.len()
|
|
);
|
|
|
|
let gc_hbb = calculate_gc_content(&hbb);
|
|
let gc_tp53 = calculate_gc_content(&tp53);
|
|
info!(" HBB GC content: {:.1}%", gc_hbb * 100.0);
|
|
info!(" TP53 GC content: {:.1}%", gc_tp53 * 100.0);
|
|
|
|
// -----------------------------------------------------------------------
|
|
// Stage 2: K-mer similarity search across gene panel
|
|
// -----------------------------------------------------------------------
|
|
info!("\nStage 2: K-mer similarity search across gene panel");
|
|
let kmer_start = std::time::Instant::now();
|
|
|
|
let hbb_vec = hbb.to_kmer_vector(11, 512)?;
|
|
let tp53_vec = tp53.to_kmer_vector(11, 512)?;
|
|
let brca1_vec = brca1.to_kmer_vector(11, 512)?;
|
|
let cyp2d6_vec = cyp2d6.to_kmer_vector(11, 512)?;
|
|
let ins_vec = insulin.to_kmer_vector(11, 512)?;
|
|
|
|
let sim_hbb_tp53 = cosine_similarity(&hbb_vec, &tp53_vec);
|
|
let sim_hbb_brca1 = cosine_similarity(&hbb_vec, &brca1_vec);
|
|
let sim_tp53_brca1 = cosine_similarity(&tp53_vec, &brca1_vec);
|
|
let sim_hbb_cyp2d6 = cosine_similarity(&hbb_vec, &cyp2d6_vec);
|
|
|
|
info!(" K-mer similarity matrix (cosine, k=11, d=512):");
|
|
info!(" HBB vs TP53: {:.4}", sim_hbb_tp53);
|
|
info!(" HBB vs BRCA1: {:.4}", sim_hbb_brca1);
|
|
info!(" TP53 vs BRCA1: {:.4}", sim_tp53_brca1);
|
|
info!(" HBB vs CYP2D6:{:.4}", sim_hbb_cyp2d6);
|
|
info!(" K-mer encoding time: {:?}", kmer_start.elapsed());
|
|
|
|
// -----------------------------------------------------------------------
|
|
// Stage 3: Align HBB query fragment against full HBB
|
|
// -----------------------------------------------------------------------
|
|
info!("\nStage 3: Smith-Waterman alignment on HBB");
|
|
let align_start = std::time::Instant::now();
|
|
|
|
// Extract a 50bp fragment from the middle of HBB (simulating a sequencing read)
|
|
let hbb_str = hbb.to_string();
|
|
let fragment_start = 100;
|
|
let fragment_end = (fragment_start + 50).min(hbb_str.len());
|
|
let query_fragment = DnaSequence::from_str(&hbb_str[fragment_start..fragment_end])?;
|
|
|
|
let aligner = SmithWaterman::new(AlignmentConfig::default());
|
|
let alignment = aligner.align(&query_fragment, &hbb)?;
|
|
|
|
info!(
|
|
" Query: HBB[{}..{}] ({} bp read)",
|
|
fragment_start,
|
|
fragment_end,
|
|
query_fragment.len()
|
|
);
|
|
info!(" Alignment score: {}", alignment.score);
|
|
info!(
|
|
" Mapped position: {} (expected: {})",
|
|
alignment.mapped_position.position, fragment_start
|
|
);
|
|
info!(" Mapping quality: {}", alignment.mapping_quality.value());
|
|
info!(" CIGAR: {} ops", alignment.cigar.len());
|
|
info!(" Alignment time: {:?}", align_start.elapsed());
|
|
|
|
// -----------------------------------------------------------------------
|
|
// Stage 4: Variant calling on HBB (sickle cell region)
|
|
// -----------------------------------------------------------------------
|
|
info!("\nStage 4: Variant calling on HBB (sickle cell detection)");
|
|
let variant_start = std::time::Instant::now();
|
|
|
|
let caller = VariantCaller::new(VariantCallerConfig::default());
|
|
let hbb_bytes = hbb_str.as_bytes();
|
|
let mut variant_count = 0;
|
|
let mut rng = rand::thread_rng();
|
|
|
|
// Simulate sequencing reads across HBB with a sickle cell mutation at position 20
|
|
let sickle_pos = real_data::hbb_variants::SICKLE_CELL_POS;
|
|
for i in 0..hbb_bytes.len().min(200) {
|
|
let depth = rng.gen_range(20..51);
|
|
let bases: Vec<u8> = (0..depth)
|
|
.map(|_| {
|
|
if i == sickle_pos && rng.gen::<f32>() < 0.5 {
|
|
b'T' // Simulate heterozygous sickle cell (A→T at codon 6)
|
|
} else if rng.gen::<f32>() < 0.98 {
|
|
hbb_bytes[i]
|
|
} else {
|
|
[b'A', b'C', b'G', b'T'][rng.gen_range(0..4)]
|
|
}
|
|
})
|
|
.collect();
|
|
let qualities: Vec<u8> = (0..depth).map(|_| rng.gen_range(25..41)).collect();
|
|
|
|
let pileup = PileupColumn {
|
|
bases,
|
|
qualities,
|
|
position: i as u64,
|
|
chromosome: 11,
|
|
};
|
|
|
|
if let Some(call) = caller.call_snp(&pileup, hbb_bytes[i]) {
|
|
variant_count += 1;
|
|
if i == sickle_pos {
|
|
info!(
|
|
" ** Sickle cell variant at pos {}: ref={} alt={} depth={} qual={}",
|
|
i, call.ref_allele as char, call.alt_allele as char, call.depth, call.quality
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
info!(" Positions analyzed: {}", hbb_bytes.len().min(200));
|
|
info!(" Total variants detected: {}", variant_count);
|
|
info!(" Variant calling time: {:?}", variant_start.elapsed());
|
|
|
|
// -----------------------------------------------------------------------
|
|
// Stage 5: Translate HBB → hemoglobin beta protein
|
|
// -----------------------------------------------------------------------
|
|
info!("\nStage 5: Protein translation - HBB to Hemoglobin Beta");
|
|
let protein_start = std::time::Instant::now();
|
|
|
|
let amino_acids = translate_dna(hbb_bytes);
|
|
let protein_str: String = amino_acids.iter().map(|aa| aa.to_char()).collect();
|
|
|
|
info!(" Protein length: {} amino acids", amino_acids.len());
|
|
info!(
|
|
" First 20 aa: {}",
|
|
if protein_str.len() > 20 {
|
|
&protein_str[..20]
|
|
} else {
|
|
&protein_str
|
|
}
|
|
);
|
|
info!(" Expected: MVHLTPEEKSAVTALWGKVN (hemoglobin beta N-terminus)");
|
|
|
|
// Build contact graph for the hemoglobin protein
|
|
if amino_acids.len() >= 10 {
|
|
let residues: Vec<ProteinResidue> = amino_acids
|
|
.iter()
|
|
.map(|aa| match aa.to_char() {
|
|
'A' => ProteinResidue::A,
|
|
'R' => ProteinResidue::R,
|
|
'N' => ProteinResidue::N,
|
|
'D' => ProteinResidue::D,
|
|
'C' => ProteinResidue::C,
|
|
'E' => ProteinResidue::E,
|
|
'Q' => ProteinResidue::Q,
|
|
'G' => ProteinResidue::G,
|
|
'H' => ProteinResidue::H,
|
|
'I' => ProteinResidue::I,
|
|
'L' => ProteinResidue::L,
|
|
'K' => ProteinResidue::K,
|
|
'M' => ProteinResidue::M,
|
|
'F' => ProteinResidue::F,
|
|
'P' => ProteinResidue::P,
|
|
'S' => ProteinResidue::S,
|
|
'T' => ProteinResidue::T,
|
|
'W' => ProteinResidue::W,
|
|
'Y' => ProteinResidue::Y,
|
|
'V' => ProteinResidue::V,
|
|
_ => ProteinResidue::X,
|
|
})
|
|
.collect();
|
|
let protein_seq = ProteinSequence::new(residues);
|
|
let graph = protein_seq.build_contact_graph(8.0)?;
|
|
let contacts = protein_seq.predict_contacts(&graph)?;
|
|
|
|
info!(" Contact graph: {} edges", graph.edges.len());
|
|
info!(" Top 3 predicted contacts:");
|
|
for (i, (r1, r2, score)) in contacts.iter().take(3).enumerate() {
|
|
info!(
|
|
" {}. Residues {} <-> {} (score: {:.3})",
|
|
i + 1,
|
|
r1,
|
|
r2,
|
|
score
|
|
);
|
|
}
|
|
}
|
|
info!(" Protein analysis time: {:?}", protein_start.elapsed());
|
|
|
|
// -----------------------------------------------------------------------
|
|
// Stage 6: Epigenetic age prediction
|
|
// -----------------------------------------------------------------------
|
|
info!("\nStage 6: Epigenetic age prediction (Horvath clock)");
|
|
let epi_start = std::time::Instant::now();
|
|
|
|
let positions: Vec<(u8, u64)> = (0..500).map(|i| (1, i * 1000)).collect();
|
|
let betas: Vec<f32> = (0..500).map(|_| rng.gen_range(0.1..0.9)).collect();
|
|
|
|
let profile = MethylationProfile::from_beta_values(positions, betas);
|
|
let clock = HorvathClock::default_clock();
|
|
let predicted_age = clock.predict_age(&profile);
|
|
|
|
info!(" CpG sites analyzed: {}", profile.sites.len());
|
|
info!(" Mean methylation: {:.3}", profile.mean_methylation());
|
|
info!(" Predicted biological age: {:.1} years", predicted_age);
|
|
info!(" Epigenomics time: {:?}", epi_start.elapsed());
|
|
|
|
// -----------------------------------------------------------------------
|
|
// Stage 7: Pharmacogenomics (CYP2D6 from real sequence)
|
|
// -----------------------------------------------------------------------
|
|
info!("\nStage 7: Pharmacogenomic analysis (CYP2D6)");
|
|
|
|
let cyp2d6_variants = vec![(42130692, b'G', b'A')]; // *4 defining variant
|
|
let allele1 = pharma::call_star_allele(&cyp2d6_variants);
|
|
let allele2 = pharma::StarAllele::Star10; // *10: common in East Asian populations
|
|
let phenotype = pharma::predict_phenotype(&allele1, &allele2);
|
|
|
|
info!(" CYP2D6 sequence: {} bp analyzed", cyp2d6.len());
|
|
info!(
|
|
" Allele 1: {:?} (activity: {:.1})",
|
|
allele1,
|
|
allele1.activity_score()
|
|
);
|
|
info!(
|
|
" Allele 2: {:?} (activity: {:.1})",
|
|
allele2,
|
|
allele2.activity_score()
|
|
);
|
|
info!(" Metabolizer phenotype: {:?}", phenotype);
|
|
|
|
let recommendations = pharma::get_recommendations("CYP2D6", &phenotype);
|
|
for rec in &recommendations {
|
|
info!(
|
|
" - {}: {} (dose: {:.1}x)",
|
|
rec.drug, rec.recommendation, rec.dose_factor
|
|
);
|
|
}
|
|
|
|
// -----------------------------------------------------------------------
|
|
// Stage 8: RVDNA AI-Native Format Demo
|
|
// -----------------------------------------------------------------------
|
|
info!("\nStage 8: RVDNA AI-Native File Format");
|
|
let rvdna_start = std::time::Instant::now();
|
|
|
|
// Convert HBB to RVDNA format with pre-computed k-mer vectors
|
|
let rvdna_bytes = rvdna::fasta_to_rvdna(real_data::HBB_CODING_SEQUENCE, 11, 512, 500)?;
|
|
|
|
info!(" FASTA → RVDNA conversion:");
|
|
info!(" Input: {} bases (ASCII, 1 byte/base)", hbb.len());
|
|
info!(" Output: {} bytes (RVDNA binary)", rvdna_bytes.len());
|
|
info!(
|
|
" Ratio: {:.2}x compression (sequence section)",
|
|
hbb.len() as f64 / rvdna_bytes.len() as f64
|
|
);
|
|
|
|
// Read back and validate
|
|
let reader = RvdnaReader::from_bytes(rvdna_bytes)?;
|
|
let restored = reader.read_sequence()?;
|
|
assert_eq!(restored.to_string(), hbb.to_string(), "Lossless roundtrip");
|
|
|
|
let kmer_blocks = reader.read_kmer_vectors()?;
|
|
let stats = reader.stats();
|
|
|
|
info!(" RVDNA file stats:");
|
|
info!(" Format version: {}", reader.header.version);
|
|
info!(
|
|
" Sequence section: {} bytes ({:.1} bits/base)",
|
|
stats.section_sizes[0], stats.bits_per_base
|
|
);
|
|
info!(
|
|
" K-mer vectors: {} blocks pre-computed",
|
|
kmer_blocks.len()
|
|
);
|
|
|
|
if !kmer_blocks.is_empty() {
|
|
info!(
|
|
" Vector dims: {}, k={}",
|
|
kmer_blocks[0].dimensions, kmer_blocks[0].k
|
|
);
|
|
// Demonstrate instant similarity search from pre-computed vectors
|
|
let tp53_query = tp53.to_kmer_vector(11, 512)?;
|
|
let sim = kmer_blocks[0].cosine_similarity(&tp53_query);
|
|
info!(
|
|
" Instant HBB vs TP53 similarity: {:.4} (from pre-indexed)",
|
|
sim
|
|
);
|
|
}
|
|
|
|
info!(" RVDNA format time: {:?}", rvdna_start.elapsed());
|
|
|
|
// Compare format sizes
|
|
info!("\n Format Comparison (HBB gene, {} bp):", hbb.len());
|
|
info!(" FASTA (ASCII): {} bytes (8 bits/base)", hbb.len());
|
|
info!(
|
|
" RVDNA (2-bit): {} bytes (seq section)",
|
|
stats.section_sizes[0]
|
|
);
|
|
info!(
|
|
" RVDNA (total): {} bytes (seq + k-mer vectors + metadata)",
|
|
stats.total_size
|
|
);
|
|
info!(" Pre-computed: k-mer vectors, ready for HNSW search");
|
|
|
|
// -----------------------------------------------------------------------
|
|
// Summary
|
|
// -----------------------------------------------------------------------
|
|
let total_time = total_start.elapsed();
|
|
info!("\nPipeline Summary");
|
|
info!("==================");
|
|
info!(" Genes analyzed: 5 (HBB, TP53, BRCA1, CYP2D6, INS)");
|
|
info!(
|
|
" Total bases: {} bp",
|
|
hbb.len() + tp53.len() + brca1.len() + cyp2d6.len() + insulin.len()
|
|
);
|
|
info!(
|
|
" Variants called: {} (in HBB sickle cell region)",
|
|
variant_count
|
|
);
|
|
info!(" Hemoglobin protein: {} amino acids", amino_acids.len());
|
|
info!(" Predicted age: {:.1} years", predicted_age);
|
|
info!(" CYP2D6 phenotype: {:?}", phenotype);
|
|
info!(
|
|
" RVDNA format: {} bytes ({} sections)",
|
|
stats.total_size,
|
|
stats.section_sizes.iter().filter(|&&s| s > 0).count()
|
|
);
|
|
info!(" Total pipeline time: {:?}", total_time);
|
|
|
|
info!("\nAnalysis complete!");
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Cosine similarity between two vectors
|
|
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
|
|
let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
|
|
let mag_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
|
|
let mag_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
|
|
if mag_a == 0.0 || mag_b == 0.0 {
|
|
0.0
|
|
} else {
|
|
dot / (mag_a * mag_b)
|
|
}
|
|
}
|
|
|
|
/// Calculate GC content of DNA sequence
|
|
fn calculate_gc_content(sequence: &DnaSequence) -> f64 {
|
|
let gc_count = sequence
|
|
.bases()
|
|
.iter()
|
|
.filter(|&&b| b == Nucleotide::G || b == Nucleotide::C)
|
|
.count();
|
|
gc_count as f64 / sequence.len() as f64
|
|
}
|
|
|
|
/// Run 23andMe genotyping analysis pipeline
|
|
fn run_23andme(path: &str) -> anyhow::Result<()> {
|
|
let file =
|
|
std::fs::File::open(path).map_err(|e| anyhow::anyhow!("Cannot open {}: {}", path, e))?;
|
|
let analysis =
|
|
genotyping::analyze(file).map_err(|e| anyhow::anyhow!("Analysis failed: {}", e))?;
|
|
print!("{}", genotyping::format_report(&analysis));
|
|
Ok(())
|
|
}
|