// Source: wifi-densepose/vendor/ruvector/examples/data/edgar/src/filings.rs
//
// 509 lines
// 14 KiB
// Rust

//! SEC filing types and analysis
use chrono::NaiveDate;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// SEC filing types
///
/// Form names are mapped onto these variants by `FilingType::from_form`, and
/// `FilingType::form_name` returns the canonical form name of each variant.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub enum FilingType {
/// Annual report (form names "10-K" and "10-K/A")
TenK,
/// Quarterly report (form names "10-Q" and "10-Q/A")
TenQ,
/// Current report (material events; form names "8-K" and "8-K/A")
EightK,
/// Proxy statement (form names "DEF 14A" and "DEFA14A")
DefFourteen,
/// Insider trading (form names "4" and "4/A")
FormFour,
/// Institutional holdings (form names "13F-HR" and "13F-HR/A")
ThirteenF,
/// Registration statement (form names "S-1" and "S-1/A")
S1,
/// Other filing type (any form name that matches none of the above)
Other,
}
impl FilingType {
    /// Parse from SEC form name
    ///
    /// Matching is case-insensitive and ignores leading/trailing whitespace,
    /// so `" 10-k "` and `"10-K"` both yield [`FilingType::TenK`]. The
    /// amended variants listed below (e.g. `"10-K/A"`) map to the same
    /// variant as the base form. Any unrecognised name yields
    /// [`FilingType::Other`].
    pub fn from_form(form: &str) -> Self {
        // Trim first: form names copied out of index files or user input
        // frequently carry padding that would otherwise fall through to `Other`.
        match form.trim().to_uppercase().as_str() {
            "10-K" | "10-K/A" => FilingType::TenK,
            "10-Q" | "10-Q/A" => FilingType::TenQ,
            "8-K" | "8-K/A" => FilingType::EightK,
            "DEF 14A" | "DEFA14A" => FilingType::DefFourteen,
            "4" | "4/A" => FilingType::FormFour,
            "13F-HR" | "13F-HR/A" => FilingType::ThirteenF,
            "S-1" | "S-1/A" => FilingType::S1,
            _ => FilingType::Other,
        }
    }

    /// Get SEC form name
    ///
    /// Returns the canonical (non-amended) form name for the variant, or
    /// `"Other"` for [`FilingType::Other`]. The returned string is a literal,
    /// so it is `'static` and may outlive the value it was obtained from.
    pub fn form_name(&self) -> &'static str {
        match self {
            FilingType::TenK => "10-K",
            FilingType::TenQ => "10-Q",
            FilingType::EightK => "8-K",
            FilingType::DefFourteen => "DEF 14A",
            FilingType::FormFour => "4",
            FilingType::ThirteenF => "13F-HR",
            FilingType::S1 => "S-1",
            FilingType::Other => "Other",
        }
    }
}
/// A SEC filing
///
/// Metadata describing one filing. Of these fields, `FilingAnalyzer::analyze`
/// reads only `accession_number` and `filing_type`; the document text itself
/// is passed to the analyzer separately.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Filing {
/// Accession number (unique identifier)
pub accession_number: String,
/// Company CIK (the SEC's Central Index Key for the filer)
pub cik: String,
/// Filing type
pub filing_type: FilingType,
/// Date filed
pub filed_date: NaiveDate,
/// Primary document URL
pub document_url: String,
/// Description (optional free text)
pub description: Option<String>,
}
/// Filing analyzer for extracting insights
///
/// Holds only its [`AnalyzerConfig`], so it is cheap to clone and can be
/// printed with `{:?}` for diagnostics.
#[derive(Debug, Clone)]
pub struct FilingAnalyzer {
    /// Configuration
    config: AnalyzerConfig,
}
/// Analyzer configuration
///
/// Each flag switches one step of `FilingAnalyzer::analyze` on or off. A
/// disabled step is skipped and its result is left empty (`None` for the
/// sentiment, an empty vector for the others). The `Default` implementation
/// enables every step.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnalyzerConfig {
/// Extract key phrases
pub extract_phrases: bool,
/// Sentiment analysis
pub analyze_sentiment: bool,
/// Risk factor extraction
pub extract_risks: bool,
/// Forward-looking statement extraction
pub extract_fls: bool,
}
impl Default for AnalyzerConfig {
fn default() -> Self {
Self {
extract_phrases: true,
analyze_sentiment: true,
extract_risks: true,
extract_fls: true,
}
}
}
impl FilingAnalyzer {
    /// Upper bound, in bytes, on the text captured for each extracted section.
    const MAX_SECTION_BYTES: usize = 5000;
    /// Maximum number of forward-looking statements returned per document.
    const MAX_FORWARD_LOOKING: usize = 20;
    /// Minimum number of occurrences for a bigram to count as a key phrase.
    const MIN_PHRASE_FREQUENCY: usize = 3;
    /// Maximum number of key phrases returned per document.
    const MAX_KEY_PHRASES: usize = 50;

    /// Create a new analyzer
    pub fn new(config: AnalyzerConfig) -> Self {
        Self { config }
    }

    /// Analyze a filing document
    ///
    /// `content` is the document text and `filing` supplies the accession
    /// number and filing type. Sections are always extracted; sentiment, risk
    /// factors, forward-looking statements and key phrases are computed only
    /// when the matching [`AnalyzerConfig`] flag is set, and are otherwise
    /// `None` / empty. `word_count` is the number of whitespace-separated
    /// tokens in `content`.
    pub fn analyze(&self, content: &str, filing: &Filing) -> FilingAnalysis {
        let config = &self.config;

        let sections = self.extract_sections(content, &filing.filing_type);
        let sentiment = config
            .analyze_sentiment
            .then(|| self.compute_sentiment(content));
        let risk_factors = if config.extract_risks {
            self.extract_risk_factors(content)
        } else {
            Vec::new()
        };
        let forward_looking = if config.extract_fls {
            self.extract_forward_looking(content)
        } else {
            Vec::new()
        };
        let key_phrases = if config.extract_phrases {
            self.extract_key_phrases(content)
        } else {
            Vec::new()
        };

        FilingAnalysis {
            accession_number: filing.accession_number.clone(),
            sections,
            sentiment,
            risk_factors,
            forward_looking,
            key_phrases,
            word_count: content.split_whitespace().count(),
        }
    }

    /// Extract standard sections from filing
    ///
    /// For each `(name, marker)` pair defined for the filing type, the first
    /// occurrence of `marker` in `content` starts the section, and at most
    /// `MAX_SECTION_BYTES` bytes of text from that point are stored under
    /// `name`. Markers that do not occur produce no entry. Filing types
    /// without a pattern table yield an empty map.
    ///
    /// This is a simplified extraction: markers are matched as plain,
    /// case-sensitive substrings, so "Item 1" also matches the start of
    /// "Item 1A" or "Item 10".
    fn extract_sections(&self, content: &str, filing_type: &FilingType) -> HashMap<String, String> {
        // Section patterns vary by filing type
        let section_patterns: &[(&str, &str)] = match filing_type {
            FilingType::TenK => &[
                ("Business", "Item 1"),
                ("RiskFactors", "Item 1A"),
                ("Properties", "Item 2"),
                ("Legal", "Item 3"),
                ("MDA", "Item 7"),
                ("Financials", "Item 8"),
            ],
            FilingType::TenQ => &[
                ("Financials", "Part I"),
                ("MDA", "Item 2"),
                ("Controls", "Item 4"),
            ],
            FilingType::EightK => &[("Item", "Item")],
            _ => &[],
        };

        section_patterns
            .iter()
            .filter_map(|&(name, marker)| {
                let start = content.find(marker)?;
                let text = &content[start..];
                // Back off to the nearest character boundary: slicing in the
                // middle of a multi-byte UTF-8 character would panic. Offset 0
                // is always a boundary, so the loop terminates.
                let mut end = text.len().min(Self::MAX_SECTION_BYTES);
                while !text.is_char_boundary(end) {
                    end -= 1;
                }
                Some((name.to_string(), text[..end].to_string()))
            })
            .collect()
    }

    /// Compute sentiment score (-1 to 1)
    ///
    /// The text is lower-cased and split on whitespace. Every token is
    /// checked against a positive and a negative lexicon using substring
    /// matching (so "profits" counts for "profit"). The raw score is
    /// `(positive - negative) / sqrt(total_tokens)`, clamped to `[-1, 1]` so
    /// the documented range holds even for short, strongly worded text.
    /// Empty input scores `0.0`.
    fn compute_sentiment(&self, content: &str) -> f64 {
        const POSITIVE_WORDS: &[&str] = &[
            "growth", "profit", "increased", "strong", "improved", "successful",
            "innovative", "opportunity", "favorable", "exceeded", "achieved",
        ];
        const NEGATIVE_WORDS: &[&str] = &[
            "loss", "decline", "decreased", "weak", "challenging", "risk",
            "uncertain", "adverse", "impairment", "litigation", "default",
        ];

        let content_lower = content.to_lowercase();
        let mut total_words = 0usize;
        let mut positive_hits = 0usize;
        let mut negative_hits = 0usize;

        // Single pass over the tokens; a token containing several lexicon
        // entries contributes one hit per entry.
        for word in content_lower.split_whitespace() {
            total_words += 1;
            positive_hits += POSITIVE_WORDS
                .iter()
                .filter(|term| word.contains(**term))
                .count();
            negative_hits += NEGATIVE_WORDS
                .iter()
                .filter(|term| word.contains(**term))
                .count();
        }

        if total_words == 0 {
            return 0.0;
        }

        let raw = (positive_hits as f64 - negative_hits as f64) / (total_words as f64).sqrt();
        raw.clamp(-1.0, 1.0)
    }

    /// Extract risk factors
    ///
    /// Counts case-insensitive substring occurrences of three patterns per
    /// risk category. A category with at least one occurrence is reported
    /// with `mentions` set to the summed count and `severity` set to
    /// `mentions / 10`, capped at `1.0`. Patterns are counted independently,
    /// so text matching two patterns of one category (e.g. "cybersecurity"
    /// also contains "security") is counted once per pattern.
    ///
    /// The result is sorted by descending severity; categories with equal
    /// severity keep the order of the pattern table.
    fn extract_risk_factors(&self, content: &str) -> Vec<RiskFactor> {
        const RISK_PATTERNS: &[(&str, [&str; 3])] = &[
            ("Regulatory", ["regulatory", "regulation", "compliance"]),
            ("Competition", ["competitive", "competition", "competitors"]),
            ("Cybersecurity", ["cybersecurity", "data breach", "security"]),
            ("Litigation", ["litigation", "lawsuit", "legal proceedings"]),
            ("Economic", ["economic conditions", "recession", "downturn"]),
            ("Supply Chain", ["supply chain", "suppliers", "logistics"]),
        ];

        let content_lower = content.to_lowercase();

        let mut risks: Vec<RiskFactor> = RISK_PATTERNS
            .iter()
            .filter_map(|(category, patterns)| {
                let mentions: usize = patterns
                    .iter()
                    .map(|pattern| content_lower.matches(*pattern).count())
                    .sum();
                (mentions > 0).then(|| RiskFactor {
                    category: category.to_string(),
                    severity: (mentions as f64 / 10.0).min(1.0),
                    mentions,
                    sample_text: None,
                })
            })
            .collect();

        // Stable sort, highest severity first.
        risks.sort_by(|a, b| b.severity.total_cmp(&a.severity));
        risks
    }

    /// Extract forward-looking statements
    ///
    /// Splits `content` into sentences on `.`, `!` and `?`. A sentence is
    /// reported when its lower-cased text contains at least one
    /// forward-looking keyword *and* at least one of the stronger cues
    /// "future", "expect" or "anticipate". Both checks are substring matches.
    /// Each statement carries its trimmed text, its own sentiment score and a
    /// fixed confidence of `0.7`. Only the first `MAX_FORWARD_LOOKING`
    /// statements, in document order, are returned.
    fn extract_forward_looking(&self, content: &str) -> Vec<ForwardLookingStatement> {
        const FLS_KEYWORDS: &[&str] = &[
            "expect", "anticipate", "believe", "estimate", "project",
            "forecast", "intend", "plan", "may", "will", "should",
        ];
        // Cues used to check that a candidate sentence is truly forward-looking.
        const FORWARD_CUES: &[&str] = &["future", "expect", "anticipate"];

        content
            .split(&['.', '!', '?'][..])
            .filter_map(|sentence| {
                let lower = sentence.to_lowercase();
                let has_keyword = FLS_KEYWORDS.iter().any(|keyword| lower.contains(*keyword));
                let has_cue = FORWARD_CUES.iter().any(|cue| lower.contains(*cue));
                (has_keyword && has_cue).then(|| ForwardLookingStatement {
                    text: sentence.trim().to_string(),
                    sentiment: self.compute_sentiment(sentence),
                    confidence: 0.7,
                })
            })
            // Stop scanning once the limit is reached.
            .take(Self::MAX_FORWARD_LOOKING)
            .collect()
    }

    /// Extract key phrases
    ///
    /// Simple bigram extraction: whitespace-separated tokens of more than
    /// three bytes are lower-cased, adjacent pairs are joined with a space,
    /// and pairs accepted by `is_meaningful_phrase` are counted. Phrases seen
    /// at least `MIN_PHRASE_FREQUENCY` times are returned with `importance`
    /// set to `frequency / number_of_retained_tokens`.
    ///
    /// The result is ordered by descending frequency and, for equal
    /// frequencies, alphabetically by phrase, so both the order and the
    /// `MAX_KEY_PHRASES` cut-off are deterministic.
    fn extract_key_phrases(&self, content: &str) -> Vec<KeyPhrase> {
        // Lower-case each retained token once rather than once per bigram.
        let words: Vec<String> = content
            .split_whitespace()
            .filter(|word| word.len() > 3)
            .map(str::to_lowercase)
            .collect();

        let mut counts: HashMap<String, usize> = HashMap::new();
        for pair in words.windows(2) {
            let phrase = format!("{} {}", pair[0], pair[1]);
            if self.is_meaningful_phrase(&phrase) {
                *counts.entry(phrase).or_insert(0) += 1;
            }
        }

        let total_words = words.len() as f64;
        let mut result: Vec<KeyPhrase> = counts
            .into_iter()
            .filter(|&(_, count)| count >= Self::MIN_PHRASE_FREQUENCY)
            .map(|(phrase, count)| KeyPhrase {
                phrase,
                frequency: count,
                importance: count as f64 / total_words,
            })
            .collect();

        // Hash-map iteration order is arbitrary; the phrase tie-break makes
        // the ordering reproducible.
        result.sort_by(|a, b| {
            b.frequency
                .cmp(&a.frequency)
                .then_with(|| a.phrase.cmp(&b.phrase))
        });
        result.truncate(Self::MAX_KEY_PHRASES);
        result
    }

    /// Check if phrase is meaningful
    ///
    /// A phrase is rejected when its first word, ignoring surrounding
    /// punctuation, is a stop word. The comparison is on the whole word, not
    /// a prefix, so "foreign currency" or "withholding taxes" are kept even
    /// though they begin with the letters of "for" / "with". Common function
    /// words that the earlier prefix test also filtered ("these", "their",
    /// "within", ...) are listed explicitly so they remain excluded. An empty
    /// phrase is treated as meaningful.
    fn is_meaningful_phrase(&self, phrase: &str) -> bool {
        const STOP_WORDS: &[&str] = &[
            "the", "and", "for", "this", "that", "with",
            "these", "their", "there", "they", "them", "then",
            "within", "without",
        ];

        match phrase.split_whitespace().next() {
            Some(first) => {
                let first = first.trim_matches(|c: char| !c.is_alphanumeric());
                !STOP_WORDS.contains(&first)
            }
            None => true,
        }
    }
}
/// Analysis result
///
/// Produced by `FilingAnalyzer::analyze` for one filing document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FilingAnalysis {
/// Filing accession number (copied from the analyzed `Filing`)
pub accession_number: String,
/// Extracted sections, keyed by section name (e.g. "Business", "MDA")
pub sections: HashMap<String, String>,
/// Overall sentiment score; `None` when sentiment analysis is disabled in
/// the analyzer configuration
pub sentiment: Option<f64>,
/// Risk factors (empty when risk extraction is disabled)
pub risk_factors: Vec<RiskFactor>,
/// Forward-looking statements (empty when their extraction is disabled)
pub forward_looking: Vec<ForwardLookingStatement>,
/// Key phrases (empty when phrase extraction is disabled)
pub key_phrases: Vec<KeyPhrase>,
/// Total word count (whitespace-separated tokens of the analyzed text)
pub word_count: usize,
}
/// A risk factor
///
/// One entry per risk category that the analyzer found mentioned in a filing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RiskFactor {
/// Risk category (e.g. "Regulatory", "Supply Chain")
pub category: String,
/// Severity score (0-1); the analyzer in this file derives it from the
/// mention count and caps it at 1.0
pub severity: f64,
/// Number of mentions
pub mentions: usize,
/// Sample text; the analyzer in this file currently always leaves this `None`
pub sample_text: Option<String>,
}
/// A forward-looking statement
///
/// A single sentence that the analyzer classified as forward-looking.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ForwardLookingStatement {
/// Statement text (the sentence with surrounding whitespace trimmed)
pub text: String,
/// Sentiment score of this sentence alone
pub sentiment: f64,
/// Confidence that this is FLS; the analyzer in this file assigns a fixed 0.7
pub confidence: f64,
}
/// A key phrase
///
/// A recurring two-word phrase found by the analyzer's bigram extraction.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KeyPhrase {
/// Phrase text (two lower-cased words joined by a single space)
pub phrase: String,
/// Frequency count (number of times the phrase occurred)
pub frequency: usize,
/// Importance score: `frequency` divided by the number of words the
/// extractor retained from the document
pub importance: f64,
}
/// Narrative extractor for text-to-vector
///
/// Turns a [`FilingAnalysis`] into a fixed-length numeric embedding. It holds
/// only its [`ExtractorConfig`], so it is cheap to clone and can be printed
/// with `{:?}` for diagnostics.
#[derive(Debug, Clone)]
pub struct NarrativeExtractor {
    /// Configuration
    config: ExtractorConfig,
}
/// Extractor configuration
///
/// Settings for `NarrativeExtractor`. The `Default` implementation uses a
/// 128-dimensional, normalized embedding with `use_tfidf` set.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractorConfig {
/// Target embedding dimension; the embedding is padded with zeros up to
/// this length
pub embedding_dim: usize,
/// Use TF-IDF weighting
// NOTE(review): no code visible in this file reads this flag - confirm
// whether it is consumed elsewhere or is still unimplemented.
pub use_tfidf: bool,
/// Normalize embeddings: when set, the embedding is scaled to unit
/// Euclidean length (unless its length is zero)
pub normalize: bool,
}
impl Default for ExtractorConfig {
fn default() -> Self {
Self {
embedding_dim: 128,
use_tfidf: true,
normalize: true,
}
}
}
impl NarrativeExtractor {
    /// Create a new extractor
    pub fn new(config: ExtractorConfig) -> Self {
        Self { config }
    }

    /// Extract embedding from filing analysis
    ///
    /// The returned vector always has exactly `config.embedding_dim`
    /// elements. It starts with five hand-crafted features, in this order:
    ///
    /// 1. overall sentiment (`0.0` when absent),
    /// 2. word count divided by 100 000, capped at `1.0`,
    /// 3. summed risk severity divided by 5, capped at `1.0`,
    /// 4. mean sentiment of the forward-looking statements (`0.0` when there
    ///    are none),
    /// 5. number of key phrases divided by 100, capped at `1.0`.
    ///
    /// The remaining positions are zero. If `embedding_dim` is smaller than
    /// five, only the leading features that fit are kept. When
    /// `config.normalize` is set and the vector is non-zero, it is scaled to
    /// unit Euclidean length.
    pub fn extract_embedding(&self, analysis: &FilingAnalysis) -> Vec<f32> {
        let dim = self.config.embedding_dim;

        let total_risk_severity: f64 = analysis.risk_factors.iter().map(|r| r.severity).sum();

        // `max(1)` avoids dividing by zero when no statements were found.
        let fls_count = analysis.forward_looking.len().max(1) as f64;
        let fls_sentiment = analysis
            .forward_looking
            .iter()
            .map(|f| f.sentiment)
            .sum::<f64>()
            / fls_count;

        let phrase_diversity = analysis.key_phrases.len() as f64 / 100.0;

        let features = [
            analysis.sentiment.unwrap_or(0.0) as f32,
            (analysis.word_count as f64 / 100000.0).min(1.0) as f32,
            (total_risk_severity / 5.0).min(1.0) as f32,
            fls_sentiment as f32,
            phrase_diversity.min(1.0) as f32,
        ];

        let mut embedding = Vec::with_capacity(dim.max(features.len()));
        embedding.extend_from_slice(&features);
        // Pad with zeros up to the target dimension, or drop trailing
        // features when the target is smaller than the feature count, so the
        // caller always receives a vector of the configured length.
        embedding.resize(dim, 0.0);

        if self.config.normalize {
            let norm: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
            if norm > 0.0 {
                for x in &mut embedding {
                    *x /= norm;
                }
            }
        }

        embedding
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The common periodic and current report form names map to their variants.
    #[test]
    fn test_filing_type_from_form() {
        let expectations = [
            ("10-K", FilingType::TenK),
            ("10-Q", FilingType::TenQ),
            ("8-K", FilingType::EightK),
        ];
        for (form, expected) in expectations.iter() {
            assert_eq!(FilingType::from_form(form), *expected);
        }
    }

    /// Clearly positive text scores above zero and clearly negative text below.
    #[test]
    fn test_sentiment_analysis() {
        let analyzer = FilingAnalyzer::new(AnalyzerConfig::default());
        let score = |text: &str| analyzer.compute_sentiment(text);

        let upbeat = score(
            "Growth and profit increased significantly. Strong performance exceeded expectations.",
        );
        assert!(upbeat > 0.0);

        let downbeat = score(
            "Loss and decline due to challenging conditions. Risk of default increased.",
        );
        assert!(downbeat < 0.0);
    }
}