wifi-densepose/vendor/ruvector/examples/apify/agentic-synth/src/main.js.backup

3897 lines
167 KiB
Plaintext

import { Actor, log } from 'apify';
import { GoogleGenerativeAI } from '@google/generative-ai';
import { createRequire } from 'module';
import { integrateActorData, SUPPORTED_ACTORS, USE_CASE_TEMPLATES, getTemplate, listSupportedActors, listTemplates } from './integrations.js';
import { addEmbeddingsToRecords, generateRandomEmbedding, EMBEDDING_MODELS } from './embeddings.js';
import { MemorySession, saveToMemorySession, loadFromMemorySession } from '../../../shared/memory-persistence.js';
// CJS import workaround for RuvLLM native extension
// (createRequire builds a CommonJS-style `require` bound to this module's URL)
const require = createRequire(import.meta.url);
// Module-level handles; each stays null until it is successfully set up below.
// `ruvllm` is assigned by the guarded require() further down; the coordinator
// and trajectory builder are constructed in the main run only when RuvLLM
// loaded and the `sonaEnabled` input is truthy.
let ruvllm = null;
let sonaCoordinator = null;
let trajectoryBuilder = null;
/**
 * Best-effort wrapper around `Actor.charge`.
 *
 * Any error raised while charging is reported at debug level (when the logger
 * exposes `debug`) and then dropped, so a run without monetization configured
 * keeps going.
 *
 * @param {string} eventName - Name of the charge event to record.
 * @param {number} [count=1] - Number of events to charge for.
 * @returns {Promise<void>} Resolves once the charge attempt has finished.
 */
async function safeCharge(eventName, count = 1) {
  const chargeRequest = { eventName, count };
  try {
    await Actor.charge(chargeRequest);
  } catch (chargeError) {
    // Deliberately non-fatal - monetization may not be configured
    log.debug?.(`Charge skipped for ${eventName}: ${chargeError.message}`);
  }
}
// Optional dependency: try to load the RuvLLM package through the CJS `require`.
// When loading fails, `ruvllm` keeps its initial null value and only a warning
// is logged, so the rest of the script runs without it.
try {
  ruvllm = require('@ruvector/ruvllm');
  log.info('RuvLLM loaded successfully - TRM/SONA self-learning enabled');
} catch (loadError) {
  log.warning(`RuvLLM not available: ${loadError.message}. Using standard generation.`);
}
// Initialize Actor
await Actor.init();
// Main run: everything up to the matching catch/finally executes inside this try.
try {
// Get input
// (falls back to an empty object, so every option below takes its default)
const input = await Actor.getInput() || {};
// Unpack all supported input options with their defaults.
const {
// Mode selection
// ('integrate' and 'template' take the integration path below; any other value
// goes through the dataType switch)
mode = 'generate',
// Integration parameters
integrateActorId,
integrateRunId = 'latest',
integrateDatasetId,
memorizeFields = [],
useTemplate,
// Output options
webhookUrl,
generateEmbeddings = false,
// Core parameters
dataType = 'ecommerce',
count = 100,
schema = {},
timeSeriesConfig = {},
eventTypes = ['page_view', 'click', 'scroll', 'form_submit'],
embeddingDimensions = 384,
provider = 'openrouter',
// `apiKey` is the legacy single-key field; the three provider-specific keys
// below take precedence over it when the keys are resolved.
apiKey,
openrouterApiKey,
geminiApiKey,
anthropicApiKey,
model = 'deepseek/deepseek-chat',
outputFormat = 'json',
seed,
quality = 0.8,
// Web scraping specific options
websiteType = 'ecommerce',
apiEndpoint = '/api/products',
simulationMode = false,
batchSize = 100,
delayBetweenBatches = 0,
// SONA/TRM parameters
sonaEnabled = true,
ewcLambda = 2000,
patternThreshold = 0.7,
sonaLearningTiers = ['instant', 'background'],
// ONNX Embedding parameters
useOnnxEmbeddings = true,
embeddingModel = 'all-MiniLM-L6-v2',
// Crunchbase/Grounding parameters
crunchbaseCompanies = [],
crunchbaseIndustry = null,
// Memory Session parameters
memorySessionId = null,
memorySessionEnabled = false,
appendToSession = true
} = input;
// Startup banner listing the options that shape this run
log.info('AI Synthetic Data Generator v2.5 with ONNX Embeddings & TRM/SONA', { mode, dataType, count, provider, model, sonaEnabled, useOnnxEmbeddings, embeddingModel });
// Initialize SONA if available and enabled
// (`ruvllm` is non-null only when the guarded require() at startup succeeded)
if (ruvllm && sonaEnabled) {
try {
// Each class is feature-detected, so a RuvLLM build that lacks one is tolerated
if (ruvllm.SonaCoordinator) {
sonaCoordinator = new ruvllm.SonaCoordinator({
tiers: sonaLearningTiers,
ewcLambda,
patternThreshold
});
log.info('SONA Coordinator initialized', { tiers: sonaLearningTiers, ewcLambda });
}
if (ruvllm.TrajectoryBuilder) {
trajectoryBuilder = new ruvllm.TrajectoryBuilder({
maxSteps: 100
});
log.info('Trajectory Builder initialized');
}
// Charge for SONA learning session
// (reached whenever the two constructors above did not throw, whether or not
// either class was present)
await safeCharge('sona-learning-session', 1);
} catch (e) {
// Non-fatal: the run continues with whatever was initialised before the error
log.warning(`SONA initialization failed: ${e.message}`);
}
}
// Check for API key based on provider - support both new separate fields and legacy apiKey
// Gemini key also needed for Crunchbase grounding regardless of provider
// Resolution order for each key: provider-specific input field, then legacy
// `apiKey`, then the environment variable. A key is null when its provider is
// not the selected one (except the Gemini key for the 'crunchbase' data type).
const geminiKey = (provider === 'gemini' || dataType === 'crunchbase') ? (geminiApiKey || apiKey || process.env.GEMINI_API_KEY) : null;
const openRouterKey = provider === 'openrouter' ? (openrouterApiKey || apiKey || process.env.OPENROUTER_API_KEY) : null;
const anthropicKey = provider === 'anthropic' ? (anthropicApiKey || apiKey || process.env.ANTHROPIC_API_KEY) : null;
// A missing key is only warned about here; it does not stop the run.
if (provider === 'gemini' && !geminiKey) {
log.warning('No Gemini API key provided. Using algorithmic generation (still produces great data!)');
}
if (provider === 'openrouter' && !openRouterKey) {
log.warning('No OpenRouter API key provided. Using algorithmic generation.');
}
if (provider === 'anthropic' && !anthropicKey) {
log.warning('No Anthropic API key provided. Using algorithmic generation.');
}
// Records produced by whichever mode runs below
let generatedData = [];
// Reference point for the `generationTime` measurement taken after generation
const startTime = Date.now();
// ============================================
// MODE HANDLING: generate, integrate, template
// ============================================
if (mode === 'integrate' || mode === 'template') {
// Integration mode - transform data from other Apify actors
log.info(`Running in ${mode} mode`, { integrateActorId, useTemplate });
// Get template config if using template mode
let templateConfig = null;
// "Effective" values start from the raw inputs and may be filled in from the template
let effectiveActorId = integrateActorId;
let effectiveMemorizeFields = memorizeFields;
if (mode === 'template' && useTemplate) {
// NOTE(review): the result is dereferenced immediately, so this assumes
// getTemplate() throws or always returns an object for unknown names - confirm
templateConfig = getTemplate(useTemplate);
log.info(`Using template: ${templateConfig.name}`, { suggestedActors: templateConfig.suggestedActors });
// Use template defaults if not overridden
if (!effectiveActorId && templateConfig.suggestedActors.length > 0) {
effectiveActorId = templateConfig.suggestedActors[0];
log.info(`Using template's suggested actor: ${effectiveActorId}`);
}
if (effectiveMemorizeFields.length === 0) {
effectiveMemorizeFields = templateConfig.memorizeFields || [];
}
// Charge for template execution
await safeCharge('template-execution', 1);
}
// Fetch data from the actor's dataset
// Source priority: an explicit dataset id wins; otherwise the default dataset
// of a run of `effectiveActorId` is used. A failure while fetching from a run
// is logged and leaves `sourceData` empty so the caller can fall back.
let sourceData = [];
if (integrateDatasetId) {
  // Direct dataset access
  log.info(`Fetching from dataset: ${integrateDatasetId}`);
  const dataset = await Actor.openDataset(integrateDatasetId, { forceCloud: true });
  const { items } = await dataset.getData({ limit: count });
  sourceData = items;
} else if (effectiveActorId) {
  // Fetch from actor run
  log.info(`Fetching from actor: ${effectiveActorId}, run: ${integrateRunId}`);
  try {
    // Use Apify client to fetch last run's dataset
    const client = Actor.newClient();
    let runInfo;
    if (integrateRunId === 'latest') {
      // The run list is sorted by start time in ascending order unless `desc`
      // is set, so without it `limit: 1` would return the OLDEST run.
      const runs = await client.actor(effectiveActorId).runs().list({ limit: 1, desc: true });
      if (runs.items.length === 0) {
        throw new Error(`No runs found for actor ${effectiveActorId}`);
      }
      runInfo = runs.items[0];
    } else {
      runInfo = await client.run(integrateRunId).get();
    }
    if (runInfo?.defaultDatasetId) {
      const dataset = await client.dataset(runInfo.defaultDatasetId).listItems({ limit: count });
      sourceData = dataset.items;
      log.info(`Fetched ${sourceData.length} items from ${effectiveActorId}`);
    } else {
      // Make the empty result diagnosable instead of silently continuing
      log.warning(`Run ${integrateRunId} of ${effectiveActorId} was not found or has no default dataset`);
    }
  } catch (e) {
    log.error(`Failed to fetch data from ${effectiveActorId}: ${e.message}`);
    log.info('Generating synthetic data as fallback...');
    // Fall back to synthetic data generation
    sourceData = [];
  }
}
// Three outcomes: transform fetched data, synthesise from the template schema,
// or fail because no usable source exists.
if (sourceData.length > 0) {
// Transform the data
const result = await integrateActorData({
actorId: effectiveActorId,
data: sourceData,
memorizeFields: effectiveMemorizeFields,
template: useTemplate,
maxItems: count
});
generatedData = result.data;
// Charge for integration
// (one event for the integration itself plus one per transformed record)
await safeCharge('actor-integration', 1);
await safeCharge('integrated-record', generatedData.length);
log.info(`Transformed ${generatedData.length} records from ${effectiveActorId}`);
} else if (mode === 'template' && templateConfig) {
// Generate synthetic data based on template output format
log.info('No source data available, generating synthetic data based on template schema...');
const random = createSeededRandom(seed);
generatedData = [];
for (let i = 0; i < count; i++) {
const record = generateFromTemplateSchema(templateConfig.outputFormat, random, i);
generatedData.push(record);
}
} else {
// Also reached when a fetch was attempted but yielded no items
throw new Error('No data source specified. Provide integrateActorId or integrateDatasetId.');
}
} else {
// Generate mode - create synthetic data
// Dispatch table keyed by dataType - optimized for web scraping use cases.
// Every entry is a thunk, so a generator is only looked up and invoked for
// the data type that was actually requested.
const generatorsByType = new Map([
  ['demo', () => generateDemoData(count, geminiKey, model)],
  ['ecommerce', () => generateEcommerceData(count, seed)],
  ['social', () => generateSocialMediaData(count, seed)],
  ['api_response', () => generateApiResponseData(count, apiEndpoint, seed)],
  ['search_results', () => generateSearchResultsData(count, seed)],
  ['real_estate', () => generateRealEstateData(count, seed)],
  ['jobs', () => generateJobListingsData(count, seed)],
  ['news', () => generateNewsData(count, seed)],
  ['structured', () => generateStructuredData(count, schema, geminiKey || openRouterKey || anthropicKey, model, seed, provider)],
  ['timeseries', () => generateTimeSeriesData(count, timeSeriesConfig, seed)],
  ['events', () => generateEventData(count, eventTypes, seed)],
  ['embeddings', () => generateEmbeddingData(count, embeddingDimensions, seed)],
  // Enterprise/Company Simulators
  ['stock_trading', () => generateStockTradingData(count, seed)],
  ['medical', () => generateMedicalData(count, seed)],
  ['company', () => generateCompanyData(count, seed)],
  ['supply_chain', () => generateSupplyChainData(count, seed)],
  ['financial', () => generateFinancialData(count, seed)],
  ['bloomberg', () => generateBloombergData(count, seed)],
  ['zoominfo', () => generateZoomInfoData(count, seed)],
  ['factset', () => generateFactSetData(count, seed)],
  ['lseg', () => generateLSEGData(count, seed)],
  ['crunchbase', () => generateCrunchbaseData(count, geminiKey, crunchbaseCompanies, crunchbaseIndustry)],
  // PRIORITY 1: High-Value Exotic Data Types
  ['eeg', () => generateEEGData(count, seed)],
  ['cgm', () => generateCGMData(count, seed)],
  ['siem', () => generateSIEMData(count, seed)],
  ['threat_intel', () => generateThreatIntelData(count, seed)],
  ['netflow', () => generateNetFlowData(count, seed)],
  // PRIORITY 2: Industrial & Scientific Data Types
  ['scada', () => generateSCADAData(count, seed)],
  ['lidar', () => generateLiDARData(count, seed)],
  ['canbus', () => generateCANBusData(count, seed)],
  ['genomic_vcf', () => generateGenomicVCFData(count, seed)],
  ['satellite', () => generateSatelliteData(count, seed)],
  // PRIORITY 3: Exotic/Niche Data Types
  ['fmri', () => generateFMRIData(count, seed)],
  ['protein_pdb', () => generateProteinPDBData(count, seed)],
  ['power_grid', () => generatePowerGridData(count, seed)],
  ['ais', () => generateAISData(count, seed)],
  ['radar', () => generateRadarData(count, seed)]
]);
const runGenerator = generatorsByType.get(dataType);
if (!runGenerator) {
  throw new Error(`Unknown data type: ${dataType}. Available: ecommerce, social, api_response, search_results, real_estate, jobs, news, structured, timeseries, events, embeddings, stock_trading, medical, company, supply_chain, financial, bloomberg, zoominfo, factset, lseg, crunchbase, eeg, cgm, siem, threat_intel, netflow, scada, lidar, canbus, genomic_vcf, satellite, fmri, protein_pdb, power_grid, ais, radar, demo`);
}
generatedData = await runGenerator();
} // End of generate mode else block
// Elapsed time is captured before the optional embedding step, so it does not
// include the time spent generating embeddings.
const generationTime = Date.now() - startTime;
// ============================================
// EMBEDDING GENERATION (optional)
// ============================================
if (generateEmbeddings && generatedData.length > 0) {
  const modelConfig = EMBEDDING_MODELS[embeddingModel] || EMBEDDING_MODELS['all-MiniLM-L6-v2'];
  const effectiveDimensions = useOnnxEmbeddings ? modelConfig.dimensions : embeddingDimensions;
  log.info(`Generating embeddings with ${effectiveDimensions} dimensions...`, {
    useOnnx: useOnnxEmbeddings,
    model: useOnnxEmbeddings ? embeddingModel : 'random'
  });
  // Shared by the non-ONNX path and the ONNX failure fallback: attaches a
  // seeded random vector plus its bookkeeping fields to every record.
  const withRandomEmbeddings = (records) => {
    const random = createSeededRandom(seed);
    return records.map((record) => ({
      ...record,
      embedding: generateRandomEmbedding(effectiveDimensions, random),
      embeddingModel: 'random',
      embeddingDimensions: effectiveDimensions
    }));
  };
  if (!useOnnxEmbeddings) {
    // Use random embeddings (faster, for testing)
    generatedData = withRandomEmbeddings(generatedData);
  } else {
    // Use ONNX-powered semantic embeddings
    try {
      generatedData = await addEmbeddingsToRecords(generatedData, { modelName: embeddingModel });
      log.info(`Added ONNX embeddings using ${embeddingModel} model`);
      await safeCharge('onnx-embedding-generation', generatedData.length);
    } catch (onnxError) {
      log.warning(`ONNX embedding failed: ${onnxError.message}. Falling back to random embeddings.`);
      // Fall back to random embeddings
      generatedData = withRandomEmbeddings(generatedData);
    }
  }
  // Charge for embedding generation
  await safeCharge('embedding-generation', generatedData.length);
  log.info(`Added embeddings to ${generatedData.length} records`);
}
// Track generation trajectory for SONA learning
// (only when a TrajectoryBuilder was constructed during SONA initialisation)
if (trajectoryBuilder && sonaEnabled) {
try {
// Use correct TrajectoryBuilder API: startStep -> endStep -> complete
// NOTE(review): only startStep and endStep are invoked here; no complete()
// call is visible in this section - confirm whether one is expected
const stepId = trajectoryBuilder.startStep('generate', {
dataType,
count: generatedData.length,
quality,
seed: seed || 'random'
});
trajectoryBuilder.endStep(stepId, {
duration: generationTime,
success: true,
recordsGenerated: generatedData.length
});
log.info('Generation trajectory tracked for SONA learning');
} catch (e) {
// Tracking is best-effort; a failure here never aborts the run
log.warning(`Trajectory tracking failed: ${e.message}`);
}
}
// SONA pattern learning from generated data with data-type specific training
if (sonaCoordinator && sonaEnabled && generatedData.length > 0) {
try {
// Only the first (up to) 10 records are used as the learning sample
const sampleSize = Math.min(10, generatedData.length);
const sample = generatedData.slice(0, sampleSize);
// Record data-type specific patterns for neural training
const dataTypePatterns = extractDataTypePatterns(dataType, sample);
// Use correct SonaCoordinator API: recordSignal for instant learning
sonaCoordinator.recordSignal({
type: 'generation_complete',
dataType,
samples: sample,
quality,
generationTime,
count: generatedData.length,
patterns: dataTypePatterns
});
// Process instant learning tier with data-type optimization
if (sonaLearningTiers.includes('instant')) {
await sonaCoordinator.processInstantLearning();
}
// Train neural patterns for this data type (use safe method detection)
if (trajectoryBuilder && sonaLearningTiers.includes('background')) {
const trainingData = {
action: `generate_${dataType}`,
observation: { quality, count: generatedData.length, time: generationTime },
// Reward is the requested quality, scaled to 80% when generation took 100ms or longer
reward: quality * (generationTime < 100 ? 1.0 : 0.8),
patterns: dataTypePatterns
};
// Try available trajectory methods
// (first of track / recordTrajectory / add that exists; skipped when none is a function)
const method = trajectoryBuilder.track || trajectoryBuilder.recordTrajectory || trajectoryBuilder.add;
if (typeof method === 'function') {
method.call(trajectoryBuilder, trainingData);
}
}
log.info(`SONA recorded signal from ${sampleSize} samples`, {
stats: sonaCoordinator.stats(),
patterns: Object.keys(dataTypePatterns).length
});
} catch (e) {
// Learning is best-effort; a failure here never aborts the run
log.warning(`SONA pattern learning failed: ${e.message}`);
}
}
/**
 * Summarises a sample of generated records into data-type specific patterns.
 *
 * - 'ecommerce': price range, rating histogram (floored rating) and category counts
 * - 'bloomberg': sector counts, recommendation counts and average volume
 * - 'medical': severity counts and average patient age
 * - 'supply_chain': order status counts and average supplier lead time
 * - anything else: just the number of samples
 *
 * Missing numeric fields count as 0; a missing grouping field is tallied under
 * the property name produced by its value (e.g. "undefined").
 *
 * @param {string} type - Data type the samples were generated for.
 * @param {Array<Object>} samples - Records to summarise.
 * @returns {Object} Pattern summary; empty when `samples` is missing or empty.
 */
function extractDataTypePatterns(type, samples) {
  const patterns = {};
  if (!samples || samples.length === 0) return patterns;

  // Occurrence count per key returned by `keyOf`
  const tally = (keyOf) => samples.reduce((counts, sample) => {
    const key = keyOf(sample);
    counts[key] = (counts[key] || 0) + 1;
    return counts;
  }, {});
  // Arithmetic mean of `valueOf(sample)`, treating falsy values as 0
  const average = (valueOf) =>
    samples.reduce((total, sample) => total + (valueOf(sample) || 0), 0) / samples.length;

  if (type === 'ecommerce') {
    const prices = samples.map((sample) => sample.price || 0);
    patterns.priceRange = { min: Math.min(...prices), max: Math.max(...prices) };
    patterns.ratingDistribution = tally((sample) => Math.floor(sample.rating || 0));
    patterns.categoryFreq = tally((sample) => sample.category);
  } else if (type === 'bloomberg') {
    patterns.sectorDistribution = tally((sample) => sample.security?.sector);
    patterns.recommendationFreq = tally((sample) => sample.consensus?.recommendation);
    patterns.avgVolume = average((sample) => sample.pricing?.volume);
  } else if (type === 'medical') {
    patterns.severityDistribution = tally((sample) => sample.diagnosis?.severity);
    patterns.avgAge = average((sample) => sample.patient?.age);
  } else if (type === 'supply_chain') {
    patterns.statusDistribution = tally((sample) => sample.order?.status);
    patterns.avgLeadTime = average((sample) => sample.supplier?.leadTime);
  } else {
    patterns.recordCount = samples.length;
  }
  return patterns;
}
log.info(`Generated ${generatedData.length} records in ${generationTime}ms`);
// Charge custom events based on data type
// (data types without an entry here are not charged a type-specific event)
const eventMap = {
  'ecommerce': 'ecommerce-product',
  'social': 'social-media-post',
  'jobs': 'job-listing',
  'real_estate': 'real-estate-listing',
  'search_results': 'search-result',
  'api_response': 'api-mock-response',
  'news': 'news-article',
  // Enterprise data types
  'stock_trading': 'stock-trading-record',
  'medical': 'medical-record',
  'company': 'company-record',
  'supply_chain': 'supply-chain-record',
  'financial': 'financial-record',
  'bloomberg': 'bloomberg-terminal-record'
};
// Simulation mode - push in batches with delays
if (simulationMode && delayBetweenBatches > 0) {
  // `batchSize` comes straight from user input. A value of 0 (or NaN) would
  // never advance the loop below, and a fractional value would produce
  // fractional record ids, so normalise it to a whole number >= 1.
  const requestedBatchSize = Math.floor(Number(batchSize));
  const effectiveBatchSize = Number.isFinite(requestedBatchSize) && requestedBatchSize >= 1
    ? requestedBatchSize
    : 1;
  log.info(`Simulation mode: pushing ${effectiveBatchSize} records every ${delayBetweenBatches}ms`);
  // Charge for simulation session
  await safeCharge('simulation-session', 1);
  const totalBatches = Math.ceil(generatedData.length / effectiveBatchSize);
  for (let i = 0; i < generatedData.length; i += effectiveBatchSize) {
    const batch = generatedData.slice(i, i + effectiveBatchSize);
    const batchNum = Math.floor(i / effectiveBatchSize) + 1;
    await Actor.pushData(batch.map((item, idx) => ({
      id: i + idx + 1,
      type: dataType,
      data: item,
      metadata: {
        generatedAt: new Date().toISOString(),
        provider,
        model,
        quality,
        seed: seed || 'random',
        batch: batchNum,
        totalBatches,
        simulationMode: true
      }
    })));
    // Charge for simulation batch
    await safeCharge('simulation-batch', 1);
    log.info(`Pushed batch ${batchNum}/${totalBatches}`);
    // No pause after the final batch
    if (i + effectiveBatchSize < generatedData.length) {
      await new Promise(resolve => setTimeout(resolve, delayBetweenBatches));
    }
  }
} else {
  // Push all results at once
  await Actor.pushData(generatedData.map((item, index) => ({
    id: index + 1,
    type: mode === 'generate' ? dataType : mode,
    data: item,
    metadata: {
      generatedAt: new Date().toISOString(),
      mode,
      dataType: mode === 'generate' ? dataType : null,
      actorId: integrateActorId || null,
      template: useTemplate || null,
      provider,
      model,
      quality,
      seed: seed || 'random',
      hasEmbedding: generateEmbeddings
    }
  })));
  // Charge for data type specific events
  const eventName = eventMap[dataType];
  if (eventName && mode === 'generate') {
    await safeCharge(eventName, generatedData.length);
    log.info(`Charged ${generatedData.length} ${eventName} events`);
  }
  // Charge for AI-enhanced records if using AI
  if ((geminiKey || openRouterKey || anthropicKey) && dataType === 'structured') {
    await safeCharge('ai-enhanced-record', generatedData.length);
    log.info(`Charged ${generatedData.length} AI-enhanced events`);
  }
}
log.info(`Pushed ${generatedData.length} records to dataset`);
// ============================================
// MEMORY SESSION PERSISTENCE (optional)
// ============================================
// Stays null when persistence is skipped; otherwise holds either a summary of
// the saved session or `{ error }` when saving failed.
let memorySessionResult = null;
// Both the flag and a session id are required
if (memorySessionEnabled && memorySessionId) {
try {
log.info(`Saving to memory session: ${memorySessionId}`);
const session = new MemorySession(memorySessionId, { actorName: 'agentic-synth' });
await session.init();
// Load existing memories if appending
if (appendToSession) {
await session.load();
log.info(`Loaded ${session.getMemories().length} existing memories`);
}
// Add generated data to session
const memoryRecords = generatedData.map((item, index) => ({
id: `synth_${Date.now()}_${index}`,
// Text form of the record: strings as-is, anything else as JSON cut to 500 characters
text: typeof item === 'string' ? item : JSON.stringify(item).substring(0, 500),
data: item,
type: mode === 'generate' ? dataType : mode,
// Reuses the embedding attached during embedding generation, if any
embedding: item.embedding || null,
metadata: {
generatedAt: new Date().toISOString(),
mode,
dataType: mode === 'generate' ? dataType : null,
actorId: integrateActorId || null,
template: useTemplate || null,
provider,
model
}
}));
await session.addBatch(memoryRecords);
await session.save();
memorySessionResult = {
sessionId: memorySessionId,
totalMemories: session.getMemories().length,
addedMemories: memoryRecords.length,
metadata: session.getMetadata()
};
log.info(`Saved ${memoryRecords.length} records to memory session ${memorySessionId}`);
log.info(`Total memories in session: ${session.getMemories().length}`);
} catch (e) {
// Persistence is best-effort: the failure is recorded in the result and the run continues
log.warning(`Memory session save failed: ${e.message}`);
memorySessionResult = { error: e.message };
}
}
// ============================================
// WEBHOOK NOTIFICATION (optional)
// ============================================
// Posts a JSON run summary to the user-supplied URL. Every failure mode
// (network error, timeout, non-2xx response) is logged as a warning only.
if (webhookUrl) {
  // Upper bound on how long an unresponsive endpoint may hold up the run;
  // without it the request could wait indefinitely.
  const WEBHOOK_TIMEOUT_MS = 30000;
  log.info(`Sending webhook to: ${webhookUrl}`);
  try {
    const webhookPayload = {
      actorId: 'ruv/ai-synthetic-data-generator',
      runId: process.env.APIFY_ACTOR_RUN_ID,
      status: 'success',
      mode,
      dataType: mode === 'generate' ? dataType : null,
      template: useTemplate || null,
      integrateActorId: integrateActorId || null,
      totalRecords: generatedData.length,
      generationTime,
      hasEmbeddings: generateEmbeddings,
      datasetId: process.env.APIFY_DEFAULT_DATASET_ID,
      memorySession: memorySessionResult,
      timestamp: new Date().toISOString()
    };
    const response = await fetch(webhookUrl, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'User-Agent': 'Apify-AI-Synthetic-Data-Generator/2.5'
      },
      body: JSON.stringify(webhookPayload),
      // Aborts the request (rejecting the fetch) once the timeout elapses
      signal: AbortSignal.timeout(WEBHOOK_TIMEOUT_MS)
    });
    if (response.ok) {
      log.info('Webhook notification sent successfully');
      // Only successful deliveries are charged
      await safeCharge('webhook-notification', 1);
    } else {
      log.warning(`Webhook failed with status: ${response.status}`);
    }
  } catch (e) {
    log.warning(`Webhook notification failed: ${e.message}`);
  }
}
} catch (error) {
  // Non-Error throwables have no `.message`; fall back to their string form
  const reason = error?.message ?? String(error);
  log.error('Actor failed', { error: reason });
  // Finish the run as FAILED. Previously the error was rethrown while a
  // `finally` block called Actor.exit(), which ends the process with the
  // default success exit code before the rethrown error could surface.
  await Actor.fail(`Actor failed: ${reason}`);
  // Only reached if the SDK did not terminate the process; keep the failure visible
  throw error;
}
// Reached only when the main run completed without throwing
await Actor.exit();
// ============================================
// WEB SCRAPING FOCUSED GENERATORS
// ============================================
/**
 * Builds synthetic e-commerce product records shaped like scraped listings.
 *
 * Brands are drawn from a per-category list, out-of-stock products always
 * report a stock count of 0, and free shipping always carries a price of 0.
 * All randomness comes from the generator returned by `createSeededRandom(seed)`.
 *
 * @param {number} count - Number of product records to produce.
 * @param {*} seed - Seed forwarded to `createSeededRandom`.
 * @returns {Promise<Array<Object>>} The generated product records.
 */
async function generateEcommerceData(count, seed) {
  log.info('Generating e-commerce product data...');
  const rng = createSeededRandom(seed);

  // Small local helpers; each `pickFrom` call consumes exactly one random draw
  const pickFrom = (options) => options[Math.floor(rng() * options.length)];
  const toCents = (amount) => Math.round(amount * 100) / 100;
  const toTenths = (value) => Math.round(value * 10) / 10;

  // Category-matched brands for realistic data
  const brandsByCategory = {
    'Electronics': ['Samsung', 'Sony', 'Apple', 'LG', 'Bose', 'JBL', 'Anker', 'Logitech'],
    'Clothing': ['Nike', 'Adidas', 'Zara', 'H&M', 'Levi\'s', 'Gap', 'Uniqlo', 'Calvin Klein'],
    'Home & Garden': ['IKEA', 'Pottery Barn', 'West Elm', 'Crate & Barrel', 'HomeGoods', 'Wayfair'],
    'Sports': ['Nike', 'Under Armour', 'Adidas', 'Puma', 'Wilson', 'Spalding', 'Callaway'],
    'Books': ['Penguin', 'HarperCollins', 'Simon & Schuster', 'Random House', 'Scholastic'],
    'Toys': ['LEGO', 'Hasbro', 'Mattel', 'Fisher-Price', 'Melissa & Doug', 'Nerf'],
    'Beauty': ['L\'Oreal', 'Maybelline', 'Neutrogena', 'Olay', 'Revlon', 'CeraVe', 'The Ordinary'],
    'Automotive': ['Bosch', 'Michelin', 'Goodyear', 'Mobil', 'Castrol', 'WeatherTech', 'AutoZone']
  };
  const categoryNames = Object.keys(brandsByCategory);
  const conditionOptions = ['New', 'Used - Like New', 'Used - Good', 'Refurbished'];

  const products = [];
  for (let index = 0; index < count; index += 1) {
    const category = pickFrom(categoryNames);
    const brand = pickFrom(brandsByCategory[category]);
    const listPrice = 10 + rng() * 990;
    const discounted = rng() > 0.6;
    // Stock consistency: an item with zero units can never be in stock
    const unitsAvailable = Math.floor(rng() * 500);
    const inStock = unitsAvailable > 0 && rng() > 0.1;
    // Shipping consistency: free shipping always costs 0
    const freeShipping = rng() > 0.4;
    const shippingCost = freeShipping ? 0 : toCents(5 + rng() * 10);

    // Property order below fixes the order of the remaining random draws
    products.push({
      url: `https://example-store.com/products/${generateSlug(rng)}-${index}`,
      title: `${brand} ${generateProductName(category, rng)}`,
      price: toCents(listPrice),
      originalPrice: discounted ? toCents(listPrice * (1.1 + rng() * 0.4)) : null,
      currency: 'USD',
      category,
      brand,
      rating: toTenths(3 + rng() * 2),
      reviewCount: Math.floor(rng() * 5000),
      inStock,
      stockCount: inStock ? unitsAvailable : 0,
      condition: pickFrom(conditionOptions),
      seller: {
        name: `Seller${Math.floor(rng() * 1000)}`,
        rating: toTenths(3.5 + rng() * 1.5),
        totalSales: Math.floor(rng() * 50000)
      },
      shipping: {
        free: freeShipping,
        estimatedDays: Math.floor(2 + rng() * 8),
        price: shippingCost
      },
      images: Array.from(
        { length: Math.floor(1 + rng() * 5) },
        (_, imageIndex) => `https://example-store.com/images/product-${index}-${imageIndex}.jpg`
      ),
      specifications: generateSpecs(category, rng),
      scrapedAt: new Date().toISOString()
    });
  }
  return products;
}
/**
 * Builds synthetic social-media post records shaped like scraped posts.
 *
 * Each record has a platform, post type, author, content, engagement counters
 * and a post timestamp within the last 30 days. The author's `profileUrl` is
 * derived from the same `username` reported in the record (previously the two
 * were built from different random ids and never matched).
 *
 * Note: `profileUrl` now reuses the username instead of drawing another id,
 * so for a given seed the fields from `content` onward differ from earlier
 * versions.
 *
 * @param {number} count - Number of post records to produce.
 * @param {*} seed - Seed forwarded to `createSeededRandom`.
 * @returns {Promise<Array<Object>>} The generated post records.
 */
async function generateSocialMediaData(count, seed) {
  log.info('Generating social media data...');
  const random = createSeededRandom(seed);
  const results = [];
  const platforms = ['twitter', 'instagram', 'facebook', 'linkedin', 'tiktok'];
  const postTypes = ['text', 'image', 'video', 'link', 'poll'];
  for (let i = 0; i < count; i++) {
    const platform = platforms[Math.floor(random() * platforms.length)];
    const postType = postTypes[Math.floor(random() * postTypes.length)];
    // Posted some time within the last 30 days
    const timestamp = new Date(Date.now() - random() * 30 * 24 * 60 * 60 * 1000);
    // Drawn in the original order (post id first, then author id) so the
    // fields before `profileUrl` keep their seeded values
    const postUrl = `https://${platform}.com/post/${generateId(random)}`;
    const username = `user_${generateId(random)}`;
    results.push({
      url: postUrl,
      platform,
      postType,
      author: {
        username,
        displayName: generateName(random),
        verified: random() > 0.85,
        followers: Math.floor(random() * 1000000),
        following: Math.floor(random() * 5000),
        // Must point at the same account as `username`
        profileUrl: `https://${platform}.com/${username}`
      },
      content: {
        text: generateSocialText(random),
        hashtags: Array.from({ length: Math.floor(random() * 6) }, () => `#${generateHashtag(random)}`),
        mentions: Array.from({ length: Math.floor(random() * 3) }, () => `@user_${generateId(random)}`),
        // Text-only posts carry no media
        mediaUrls: postType !== 'text' ? [`https://${platform}.com/media/${generateId(random)}.jpg`] : []
      },
      engagement: {
        likes: Math.floor(random() * 100000),
        comments: Math.floor(random() * 5000),
        shares: Math.floor(random() * 10000),
        views: Math.floor(random() * 1000000)
      },
      timestamp: timestamp.toISOString(),
      scrapedAt: new Date().toISOString()
    });
  }
  return results;
}
/**
 * Builds synthetic HTTP API response records for a given endpoint.
 *
 * Status codes are drawn from a pool weighted towards success. Responses with
 * a status below 400 get a data body with pagination; all others get an
 * `error` body whose message comes from `getErrorMessage`.
 *
 * @param {number} count - Number of response records to produce.
 * @param {string} endpoint - Base endpoint path; the record index is appended.
 * @param {*} seed - Seed forwarded to `createSeededRandom`.
 * @returns {Promise<Array<Object>>} The generated response records.
 */
async function generateApiResponseData(count, endpoint, seed) {
  log.info('Generating API response data...', { endpoint });
  const rng = createSeededRandom(seed);
  // Repeated 200s make success the most likely outcome
  const statusPool = [200, 200, 200, 200, 201, 400, 401, 404, 500];
  const responses = [];
  let index = 0;
  while (index < count) {
    const statusCode = statusPool[Math.floor(rng() * statusPool.length)];
    const headers = {
      'content-type': 'application/json',
      'x-request-id': generateId(rng),
      'x-rate-limit-remaining': Math.floor(rng() * 1000),
      'cache-control': rng() > 0.5 ? 'max-age=3600' : 'no-cache'
    };
    const responseTime = Math.floor(50 + rng() * 500);

    let body;
    if (statusCode < 400) {
      body = {
        id: generateId(rng),
        data: generateRandomObject(rng),
        pagination: {
          page: 1,
          perPage: 20,
          total: Math.floor(rng() * 10000),
          hasMore: rng() > 0.3
        }
      };
    } else {
      body = {
        error: {
          code: `ERR_${statusCode}`,
          message: getErrorMessage(statusCode)
        }
      };
    }

    responses.push({
      endpoint: `${endpoint}/${index}`,
      method: 'GET',
      statusCode,
      headers,
      responseTime,
      body,
      timestamp: new Date().toISOString()
    });
    index += 1;
  }
  return responses;
}
/**
 * Builds synthetic search-engine result records.
 *
 * Results are numbered from 1 in generation order. Sitelinks and the rich
 * snippet are optional extras and are `null` when not generated.
 *
 * @param {number} count - Number of result records to produce.
 * @param {*} seed - Seed forwarded to `createSeededRandom`.
 * @returns {Promise<Array<Object>>} The generated search result records.
 */
async function generateSearchResultsData(count, seed) {
  log.info('Generating search results data...');
  const rng = createSeededRandom(seed);
  const domainPool = ['example.com', 'blog.example.org', 'news.example.net', 'shop.example.io', 'docs.example.dev'];
  const entries = [];
  for (let i = 0; i < count; i += 1) {
    // Values are computed in output-field order so the random draw sequence is fixed
    const domain = domainPool[Math.floor(rng() * domainPool.length)];
    const url = `https://${domain}/${generateSlug(rng)}`;
    const title = generateSearchTitle(rng);
    const snippet = generateSnippet(rng);
    const displayUrl = `${domain} > ${generateBreadcrumb(rng)}`;
    const type = rng() > 0.8 ? 'featured' : 'organic';

    let sitelinks = null;
    if (rng() > 0.7) {
      sitelinks = Array.from({ length: Math.floor(2 + rng() * 4) }, () => ({
        title: generateSearchTitle(rng),
        url: `https://${domain}/${generateSlug(rng)}`
      }));
    }

    let richSnippet = null;
    if (rng() > 0.6) {
      richSnippet = {
        rating: Math.round((3 + rng() * 2) * 10) / 10,
        reviewCount: Math.floor(rng() * 10000),
        price: rng() > 0.5 ? `$${Math.floor(10 + rng() * 500)}` : null
      };
    }

    entries.push({
      position: i + 1,
      url,
      title,
      snippet,
      domain,
      displayUrl,
      type,
      sitelinks,
      rich_snippet: richSnippet,
      scrapedAt: new Date().toISOString()
    });
  }
  return entries;
}
/**
 * Builds synthetic real-estate listing records shaped like scraped listings.
 *
 * The listing `url` embeds the same identifier reported as `listingId`
 * (previously the two were built from different random ids, so a record's URL
 * never matched its own id).
 *
 * Note: one fewer id is drawn per record than before, so for a given seed the
 * fields after `listingId` differ from earlier versions.
 *
 * @param {number} count - Number of listing records to produce.
 * @param {*} seed - Seed forwarded to `createSeededRandom`.
 * @returns {Promise<Array<Object>>} The generated listing records.
 */
async function generateRealEstateData(count, seed) {
  log.info('Generating real estate listing data...');
  const random = createSeededRandom(seed);
  const results = [];
  const propertyTypes = ['House', 'Apartment', 'Condo', 'Townhouse', 'Land', 'Commercial'];
  const cities = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'San Diego', 'Dallas', 'Austin'];
  const listingTypes = ['For Sale', 'For Rent', 'Auction'];
  for (let i = 0; i < count; i++) {
    const propertyType = propertyTypes[Math.floor(random() * propertyTypes.length)];
    const city = cities[Math.floor(random() * cities.length)];
    const listingType = listingTypes[Math.floor(random() * listingTypes.length)];
    const bedrooms = Math.floor(1 + random() * 6);
    const sqft = Math.floor(500 + random() * 4500);
    // One identifier per listing, shared by `url` and `listingId`
    const listingId = generateId(random);
    results.push({
      url: `https://realestate-example.com/listing/${listingId}`,
      listingId,
      title: `${bedrooms} Bed ${propertyType} in ${city}`,
      price: Math.floor(100000 + random() * 2000000),
      listingType,
      propertyType,
      address: {
        street: `${Math.floor(100 + random() * 9900)} ${generateStreetName(random)}`,
        city,
        state: getState(city),
        zipCode: String(Math.floor(10000 + random() * 90000)),
        country: 'USA'
      },
      details: {
        bedrooms,
        bathrooms: Math.floor(1 + random() * 4),
        sqft,
        // Lot is always larger than the building: 1.5x to 4.5x the floor area
        lotSize: Math.floor(sqft * (1.5 + random() * 3)),
        yearBuilt: Math.floor(1950 + random() * 74),
        parking: Math.floor(random() * 4),
        stories: Math.floor(1 + random() * 3)
      },
      features: generateRealEstateFeatures(random),
      agent: {
        name: generateName(random),
        phone: generatePhone(random),
        email: `agent${Math.floor(random() * 1000)}@realestate.com`,
        company: `${generateName(random)} Realty`
      },
      images: Array.from({ length: Math.floor(5 + random() * 20) }, (_, j) =>
        `https://realestate-example.com/images/listing-${i}-${j}.jpg`
      ),
      daysOnMarket: Math.floor(random() * 180),
      scrapedAt: new Date().toISOString()
    });
  }
  return results;
}
/**
 * Builds synthetic job-listing records shaped like scraped job posts.
 *
 * The job `url` embeds the same identifier reported as `jobId` (previously
 * the two were built from different random ids, so a record's URL never
 * matched its own id). The salary maximum is always at least the minimum.
 *
 * Note: one fewer id is drawn per record than before, so for a given seed the
 * fields after `jobId` differ from earlier versions.
 *
 * @param {number} count - Number of job records to produce.
 * @param {*} seed - Seed forwarded to `createSeededRandom`.
 * @returns {Promise<Array<Object>>} The generated job records.
 */
async function generateJobListingsData(count, seed) {
  log.info('Generating job listings data...');
  const random = createSeededRandom(seed);
  const results = [];
  const titles = ['Software Engineer', 'Product Manager', 'Data Scientist', 'UX Designer', 'DevOps Engineer', 'Marketing Manager', 'Sales Representative', 'Customer Success Manager'];
  const companies = ['TechCorp', 'InnovateLabs', 'DataDriven Inc', 'CloudScale', 'StartupXYZ', 'Enterprise Solutions', 'Digital Agency', 'Growth Partners'];
  const locations = ['Remote', 'New York, NY', 'San Francisco, CA', 'Austin, TX', 'Seattle, WA', 'Boston, MA', 'Chicago, IL', 'Los Angeles, CA'];
  const types = ['Full-time', 'Part-time', 'Contract', 'Internship'];
  for (let i = 0; i < count; i++) {
    const title = titles[Math.floor(random() * titles.length)];
    const company = companies[Math.floor(random() * companies.length)];
    const location = locations[Math.floor(random() * locations.length)];
    const salaryMin = Math.floor(50000 + random() * 100000);
    // One identifier per job, shared by `url` and `jobId`
    const jobId = generateId(random);
    results.push({
      url: `https://jobs-example.com/job/${jobId}`,
      jobId,
      title,
      company: {
        name: company,
        // Logo file name is the lower-cased company name with whitespace replaced by dashes
        logo: `https://jobs-example.com/logos/${company.toLowerCase().replace(/\s/g, '-')}.png`,
        rating: Math.round((3 + random() * 2) * 10) / 10,
        reviewCount: Math.floor(random() * 5000),
        size: ['1-50', '51-200', '201-500', '501-1000', '1000+'][Math.floor(random() * 5)]
      },
      location,
      // 'Remote' listings are always remote; others are remote-friendly about 30% of the time
      remote: location === 'Remote' || random() > 0.7,
      type: types[Math.floor(random() * types.length)],
      salary: {
        min: salaryMin,
        max: salaryMin + Math.floor(random() * 50000),
        currency: 'USD',
        period: 'yearly'
      },
      description: generateJobDescription(random),
      requirements: Array.from({ length: Math.floor(3 + random() * 5) }, () => generateRequirement(random)),
      benefits: generateBenefits(random),
      // Posted some time within the last 30 days
      postedDate: new Date(Date.now() - random() * 30 * 24 * 60 * 60 * 1000).toISOString(),
      applicants: Math.floor(random() * 500),
      scrapedAt: new Date().toISOString()
    });
  }
  return results;
}
/**
 * Build synthetic news-article records shaped like scraped news pages.
 *
 * @param {number} count - Number of articles to produce.
 * @param {*} seed - Seed forwarded to createSeededRandom.
 * @returns {Promise<Object[]>} One article object per requested record.
 */
async function generateNewsData(count, seed) {
  log.info('Generating news article data...');
  const random = createSeededRandom(seed);
  // Each call consumes exactly one random draw.
  const pick = (list) => list[Math.floor(random() * list.length)];

  const outlets = ['TechNews', 'BusinessDaily', 'WorldReport', 'ScienceToday', 'HealthWatch', 'SportsCentral'];
  const sections = ['Technology', 'Business', 'Politics', 'Science', 'Health', 'Sports', 'Entertainment'];
  const bylines = ['John Smith', 'Sarah Johnson', 'Mike Williams', 'Emily Brown', 'David Lee', 'Lisa Chen'];

  const articles = [];
  for (let idx = 0; idx < count; idx += 1) {
    const source = pick(outlets);
    const category = pick(sections);
    // Published at some point within the last 7 days.
    const publishDate = new Date(Date.now() - random() * 7 * 24 * 60 * 60 * 1000);
    const siteRoot = `https://${source.toLowerCase()}.com`;

    articles.push({
      url: `${siteRoot}/article/${generateSlug(random)}`,
      title: generateNewsTitle(category, random),
      subtitle: generateSubtitle(random),
      source,
      category,
      author: {
        name: pick(bylines),
        url: `${siteRoot}/author/${generateSlug(random)}`
      },
      publishedAt: publishDate.toISOString(),
      // The second draw (offset within 24h) is only consumed when the first exceeds 0.7.
      updatedAt: random() > 0.7
        ? new Date(publishDate.getTime() + random() * 24 * 60 * 60 * 1000).toISOString()
        : null,
      content: {
        text: generateArticleContent(random),
        wordCount: Math.floor(300 + random() * 1500),
        readingTime: Math.floor(2 + random() * 10)
      },
      images: [{
        url: `${siteRoot}/images/article-${idx}.jpg`,
        caption: generateCaption(random)
      }],
      tags: Array.from({ length: Math.floor(2 + random() * 5) }, () => generateTag(random)),
      engagement: {
        views: Math.floor(random() * 100000),
        comments: Math.floor(random() * 500),
        shares: Math.floor(random() * 2000)
      },
      scrapedAt: new Date().toISOString()
    });
  }
  return articles;
}
// ============================================
// ORIGINAL GENERATORS (kept for compatibility)
// ============================================
/**
 * Produce a mixed demo dataset drawn from five generators (e-commerce,
 * social, search results, jobs, news), tagging each record with `_type`.
 *
 * Each generator is asked for ceil(count / 5) records (no seed is passed),
 * and the concatenated result is truncated to `count`.
 *
 * @param {number} count - Total number of records to return.
 * @param {string} apiKey - Accepted for signature compatibility; not used here.
 * @param {string} model - Accepted for signature compatibility; not used here.
 * @returns {Promise<Object[]>} Up to `count` tagged records.
 */
async function generateDemoData(count, apiKey, model) {
  log.info('Generating demo data with web scraping examples...');
  const sliceSize = Math.ceil(count / 5);

  // [tag written to `_type`, generator] in the order the records appear in the output.
  const sections = [
    ['ecommerce', generateEcommerceData],
    ['social', generateSocialMediaData],
    ['search_results', generateSearchResultsData],
    ['jobs', generateJobListingsData],
    ['news', generateNewsData]
  ];

  const combined = [];
  for (const [tag, generate] of sections) {
    const batch = await generate(sliceSize);
    for (const item of batch) {
      combined.push({ ...item, _type: tag });
    }
  }
  return combined.slice(0, count);
}
/**
 * Generate `count` records matching a user-supplied schema.
 *
 * When an API key and a non-empty schema are given, one LLM request is made
 * (asking for at most 20 records) through OpenRouter, Anthropic, or Gemini
 * (any other provider value). The first JSON array found in the reply is
 * used, and the remainder is topped up with generateFallbackStructured.
 * Any failure (including a non-2xx HTTP status) is logged as a warning and
 * `count` fallback records are generated instead.
 *
 * @param {number} count - Number of records to return.
 * @param {Object} schema - Field name -> type descriptor map.
 * @param {string} [apiKey] - Provider API key; falsy means fallback-only generation.
 * @param {string} [model] - Model id; each provider has its own default.
 * @param {*} [seed] - Seed forwarded to createSeededRandom for fallback records.
 * @param {string} [provider='gemini'] - 'openrouter', 'anthropic', or anything else for Gemini.
 * @returns {Promise<Object[]>} At most `count` records.
 */
async function generateStructuredData(count, schema, apiKey, model, seed, provider = 'gemini') {
  log.info('Generating structured data...', { count, schema, provider, model });
  const results = [];
  const random = createSeededRandom(seed);
  if (apiKey && Object.keys(schema).length > 0) {
    try {
      const prompt = `Generate ${Math.min(count, 20)} unique records matching this schema:
${JSON.stringify(schema, null, 2)}
Return ONLY a valid JSON array with no additional text. Each record should be realistic and diverse.`;
      let text;
      if (provider === 'openrouter') {
        // Use OpenRouter API (supports DeepSeek, GPT, Claude, Llama, etc.)
        const response = await fetch('https://openrouter.ai/api/v1/chat/completions', {
          method: 'POST',
          headers: {
            'Authorization': `Bearer ${apiKey}`,
            'Content-Type': 'application/json',
            'HTTP-Referer': 'https://apify.com',
            'X-Title': 'AI Synthetic Data Generator'
          },
          body: JSON.stringify({
            model: model || 'deepseek/deepseek-chat',
            messages: [{ role: 'user', content: prompt }],
            temperature: 0.7
          })
        });
        // Surface HTTP failures (bad key, rate limit, ...) instead of treating
        // the error body as an empty completion.
        if (!response.ok) {
          throw new Error(`OpenRouter request failed: HTTP ${response.status} ${response.statusText ?? ''}`.trim());
        }
        const data = await response.json();
        text = data.choices?.[0]?.message?.content || '';
        log.info('OpenRouter response received', { model });
      } else if (provider === 'anthropic') {
        // Use Anthropic Claude API directly
        const response = await fetch('https://api.anthropic.com/v1/messages', {
          method: 'POST',
          headers: {
            'x-api-key': apiKey,
            'Content-Type': 'application/json',
            'anthropic-version': '2023-06-01'
          },
          body: JSON.stringify({
            model: model || 'claude-3-5-haiku-20241022',
            max_tokens: 4096,
            messages: [{ role: 'user', content: prompt }]
          })
        });
        if (!response.ok) {
          throw new Error(`Anthropic request failed: HTTP ${response.status} ${response.statusText ?? ''}`.trim());
        }
        const data = await response.json();
        text = data.content?.[0]?.text || '';
        log.info('Anthropic response received', { model });
      } else {
        // Use Gemini
        const genAI = new GoogleGenerativeAI(apiKey);
        const gemini = genAI.getGenerativeModel({ model: model || 'gemini-2.0-flash-exp' });
        const result = await gemini.generateContent(prompt);
        text = result.response.text();
        log.info('Gemini response received', { model });
      }
      // Greedy match: from the first '[' to the last ']' in the reply.
      const jsonMatch = text.match(/\[[\s\S]*\]/);
      if (jsonMatch) {
        const parsed = JSON.parse(jsonMatch[0]);
        results.push(...parsed);
        log.info(`AI generated ${parsed.length} records`);
      }
      // Top up with locally generated records when the model returned too few.
      while (results.length < count) {
        results.push(generateFallbackStructured(schema, random));
      }
    } catch (e) {
      log.warning(`AI generation failed: ${e.message}. Using fallback.`);
      for (let i = 0; i < count; i++) {
        results.push(generateFallbackStructured(schema, random));
      }
    }
  } else {
    for (let i = 0; i < count; i++) {
      results.push(generateFallbackStructured(schema, random));
    }
  }
  return results.slice(0, count);
}
/**
 * Build one record from a schema without calling any AI provider.
 *
 * Each string type descriptor is matched by substring, in this order:
 * url, email, fullName/name, number (optionally "number(min-max)"),
 * boolean, a parenthesised comma list of options, then a generic
 * "value_N" placeholder. Non-string descriptors are skipped, so their
 * keys are absent from the result.
 *
 * @param {Object} schema - Field name -> type descriptor map.
 * @param {Function} random - Generator returning numbers in [0, 1).
 * @returns {Object} The generated record.
 */
function generateFallbackStructured(schema, random) {
  const below = (bound) => Math.floor(random() * bound);

  const synthesize = (spec) => {
    if (spec.includes('url')) {
      return `https://example.com/${generateSlug(random)}`;
    }
    if (spec.includes('email')) {
      return `user${below(10000)}@example.com`;
    }
    if (spec.includes('fullName') || spec.includes('name')) {
      return generateName(random);
    }
    if (spec.includes('number')) {
      const range = spec.match(/\((\d+)-(\d+)\)/);
      if (!range) {
        return below(100);
      }
      const low = parseInt(range[1]);
      const high = parseInt(range[2]);
      // Inclusive on both ends.
      return low + below(high - low + 1);
    }
    if (spec.includes('boolean')) {
      return random() > 0.5;
    }
    if (spec.includes('(') && spec.includes(',')) {
      const choices = spec.match(/\(([^)]+)\)/)?.[1].split(',').map(s => s.trim()) || ['Option1', 'Option2'];
      return choices[below(choices.length)];
    }
    return `value_${below(1000)}`;
  };

  const record = {};
  for (const [field, spec] of Object.entries(schema)) {
    if (typeof spec !== 'string') {
      continue;
    }
    record[field] = synthesize(spec);
  }
  return record;
}
/**
 * Generate an evenly spaced synthetic time series with OHLCV-style fields.
 *
 * The base level starts at 100 and is multiplied by (1 + trendFactor) at
 * every step (+1% for 'upward', -1% for 'downward', unchanged otherwise).
 * With `seasonality` on, a daily sine wave of +/-20% keyed on the UTC hour
 * of the timestamp is applied, then uniform noise of relative amplitude
 * `noise` is added.
 *
 * The seasonal hour is read in UTC, the same zone the emitted ISO
 * timestamps use, so a seeded run gives the same values on any host.
 *
 * @param {number} count - Number of points.
 * @param {Object} config - Options: interval ('1h'), trend ('flat'),
 *   seasonality (false), noise (0.1), startDate ('2024-01-01').
 * @param {*} seed - Seed forwarded to createSeededRandom.
 * @returns {Promise<Object[]>} Points with timestamp, value, open, high, low, close, volume.
 */
async function generateTimeSeriesData(count, config, seed) {
  log.info('Generating time-series data...', { count, config });
  const {
    interval = '1h',
    trend = 'flat',
    seasonality = false,
    noise = 0.1,
    startDate = '2024-01-01'
  } = config;
  const random = createSeededRandom(seed);
  const results = [];
  const start = new Date(startDate);
  const intervalMs = parseInterval(interval);
  let value = 100;
  const trendFactor = trend === 'upward' ? 0.01 : trend === 'downward' ? -0.01 : 0;
  for (let i = 0; i < count; i++) {
    const timestamp = new Date(start.getTime() + i * intervalMs);
    // Compounding trend: applied before the first point as well.
    value *= (1 + trendFactor);
    let seasonalValue = value;
    if (seasonality) {
      // UTC hour, to agree with timestamp.toISOString() below.
      const hour = timestamp.getUTCHours();
      const seasonalFactor = 1 + 0.2 * Math.sin((hour / 24) * 2 * Math.PI);
      seasonalValue = value * seasonalFactor;
    }
    const noiseValue = seasonalValue * (1 + (random() - 0.5) * 2 * noise);
    results.push({
      timestamp: timestamp.toISOString(),
      value: Math.round(noiseValue * 100) / 100,
      open: Math.round(noiseValue * (1 - random() * 0.02) * 100) / 100,
      high: Math.round(noiseValue * (1 + random() * 0.03) * 100) / 100,
      low: Math.round(noiseValue * (1 - random() * 0.03) * 100) / 100,
      close: Math.round(noiseValue * (1 + (random() - 0.5) * 0.02) * 100) / 100,
      volume: Math.floor(random() * 1000000)
    });
  }
  return results;
}
/**
 * Generate synthetic web analytics events spread over the last 30 days,
 * returned in ascending timestamp order.
 *
 * @param {number} count - Number of events.
 * @param {string[]} eventTypes - Event type names to sample from.
 * @param {*} seed - Seed forwarded to createSeededRandom.
 * @returns {Promise<Object[]>} Events sorted by timestamp.
 */
async function generateEventData(count, eventTypes, seed) {
  log.info('Generating web event data...', { count, eventTypes });
  const random = createSeededRandom(seed);
  // Each call consumes exactly one random draw.
  const pick = (list) => list[Math.floor(random() * list.length)];
  const referenceTime = Date.now();
  const MS_PER_DAY = 24 * 60 * 60 * 1000;

  const events = [];
  for (let n = 0; n < count; n += 1) {
    const type = pick(eventTypes);
    const occurredAt = new Date(referenceTime - random() * 30 * MS_PER_DAY);
    events.push({
      // The id embeds the wall clock at creation time plus the creation index.
      eventId: `evt_${Date.now()}_${n}`,
      type,
      timestamp: occurredAt.toISOString(),
      userId: `user_${Math.floor(random() * 1000)}`,
      sessionId: `sess_${Math.floor(random() * 10000)}`,
      page: {
        url: `https://example.com/${generateSlug(random)}`,
        title: generateSearchTitle(random),
        referrer: random() > 0.3 ? 'https://google.com' : 'direct'
      },
      device: {
        type: random() > 0.6 ? 'mobile' : 'desktop',
        browser: pick(['Chrome', 'Firefox', 'Safari', 'Edge']),
        os: pick(['Windows', 'macOS', 'iOS', 'Android', 'Linux'])
      },
      properties: generateEventProperties(type, random)
    });
  }

  // Chronological order, oldest first.
  events.sort((left, right) => Date.parse(left.timestamp) - Date.parse(right.timestamp));
  return events;
}
/**
 * Build the type-specific `properties` payload for a synthetic web event.
 *
 * Known types: page_view, click, scroll, form_submit, api_call. Any other
 * type yields `{ value }` with a random integer below 100.
 *
 * @param {string} eventType - Event type name.
 * @param {Function} random - Generator returning numbers in [0, 1).
 * @returns {Object} Properties object for the event.
 */
function generateEventProperties(eventType, random) {
  const below = (limit) => Math.floor(random() * limit);
  const oneOf = (choices) => choices[below(choices.length)];

  // Builders are only invoked for the matching type, so no random draws
  // are consumed for the other types.
  const builders = new Map([
    ['page_view', () => ({
      loadTime: Math.floor(100 + random() * 3000),
      scrollDepth: below(100)
    })],
    ['click', () => ({
      element: oneOf(['button', 'link', 'image', 'card']),
      elementId: `el_${below(1000)}`,
      x: below(1920),
      y: below(1080)
    })],
    ['scroll', () => ({
      direction: random() > 0.8 ? 'up' : 'down',
      depth: below(100),
      velocity: below(500)
    })],
    ['form_submit', () => ({
      formId: `form_${below(100)}`,
      formName: oneOf(['contact', 'signup', 'checkout', 'search']),
      success: random() > 0.1,
      fieldCount: Math.floor(2 + random() * 10)
    })],
    ['api_call', () => {
      const resource = oneOf(['users', 'products', 'orders', 'search']);
      const method = oneOf(['GET', 'POST', 'PUT', 'DELETE']);
      let statusCode;
      if (random() > 0.9) {
        statusCode = 500;
      } else if (random() > 0.1) {
        // Second draw happens only when the first did not select 500.
        statusCode = 200;
      } else {
        statusCode = 400;
      }
      return {
        endpoint: `/api/${resource}`,
        method,
        statusCode,
        responseTime: Math.floor(50 + random() * 500)
      };
    }]
  ]);

  const build = builders.get(eventType);
  if (build) {
    return build();
  }
  return { value: below(100) };
}
/**
 * Generate synthetic unit-length embedding vectors with placeholder text.
 *
 * Each component is drawn uniformly from [-1, 1), the vector is divided
 * by its Euclidean norm, and every component is rounded to 6 decimals.
 * Text labels cycle through a fixed topic list with a variant counter.
 *
 * @param {number} count - Number of records.
 * @param {number} dimensions - Vector length.
 * @param {*} seed - Seed forwarded to createSeededRandom.
 * @returns {Promise<Object[]>} Records with id, text, embedding, dimensions, model.
 */
async function generateEmbeddingData(count, dimensions, seed) {
  log.info('Generating embedding data...', { count, dimensions });
  const random = createSeededRandom(seed);
  const topics = [
    'Product search optimization',
    'Customer sentiment analysis',
    'Price comparison algorithms',
    'Inventory management',
    'User behavior tracking',
    'Market trend analysis',
    'Competitor monitoring',
    'Review aggregation',
    'Category classification',
    'Recommendation engines'
  ];

  const records = [];
  for (let row = 0; row < count; row += 1) {
    const raw = [];
    let sumOfSquares = 0;
    for (let d = 0; d < dimensions; d += 1) {
      const component = random() * 2 - 1;
      raw.push(component);
      sumOfSquares += component * component;
    }
    const magnitude = Math.sqrt(sumOfSquares);
    const embedding = raw.map((component) => Math.round((component / magnitude) * 1000000) / 1000000);

    const topic = topics[row % topics.length];
    const variant = Math.floor(row / topics.length);
    records.push({
      id: `emb_${row}`,
      text: `${topic} - variant ${variant}`,
      embedding,
      dimensions,
      model: 'synthetic'
    });
  }
  return records;
}
// ============================================
// UTILITY FUNCTIONS
// ============================================
/**
 * Create a random-number function, deterministic when a seed is given.
 *
 * The numeric seed 0 counts as a real seed. The other falsy values
 * (undefined, null, '', false, NaN) mean "no seed" and return Math.random.
 *
 * The seeded stream hashes String(seed) with hashCode and then iterates
 * state = sin(state) * 10000, returning the fractional part. It is not
 * cryptographically secure.
 *
 * @param {*} seed - Seed value; stringified before hashing.
 * @returns {Function} Function returning numbers in [0, 1).
 */
function createSeededRandom(seed) {
  // `!seed` alone would discard the legitimate seed 0.
  const unseeded = !seed && seed !== 0;
  if (unseeded) return Math.random;
  let s = hashCode(String(seed));
  return function() {
    s = Math.sin(s) * 10000;
    return s - Math.floor(s);
  };
}
/**
 * Hash a string to a non-negative number using the classic
 * `h = 31 * h + charCode` recurrence with 32-bit wrap-around.
 *
 * @param {string} str - Text to hash (iterated by UTF-16 code unit).
 * @returns {number} Absolute value of the signed 32-bit hash.
 */
function hashCode(str) {
  let acc = 0;
  let pos = 0;
  while (pos < str.length) {
    // Math.imul and `| 0` keep the accumulator a signed 32-bit integer.
    acc = (Math.imul(acc, 31) + str.charCodeAt(pos)) | 0;
    pos += 1;
  }
  return Math.abs(acc);
}
/**
 * Generate synthetic data based on a template output schema.
 *
 * Supported type descriptors: 'string', 'number' / 'number(min-max)',
 * 'object', and 'array' / 'array<string|object|...>'. Anything else,
 * including non-string descriptors such as nested objects, gets a
 * "value_<index>_<n>" placeholder rather than throwing.
 *
 * @param {Object} outputFormat - Field name -> type descriptor map.
 * @param {Function} random - Generator returning numbers in [0, 1).
 * @param {number} index - Record index, used in placeholders and `_templateId`.
 * @returns {Object} Generated record with `_templateId` and `_generatedAt` added.
 */
function generateFromTemplateSchema(outputFormat, random, index) {
  const record = {};
  for (const [key, rawType] of Object.entries(outputFormat)) {
    // Non-string descriptors cannot be pattern-matched; '' routes them to
    // the placeholder branch instead of crashing on startsWith.
    const type = typeof rawType === 'string' ? rawType : '';
    if (type === 'string') {
      record[key] = generateTemplateString(key, random);
    } else if (type.startsWith('number')) {
      const match = type.match(/\((\d+)-(\d+)\)/);
      if (match) {
        const min = Number.parseInt(match[1], 10);
        const max = Number.parseInt(match[2], 10);
        // Inclusive on both ends.
        record[key] = min + Math.floor(random() * (max - min + 1));
      } else {
        record[key] = Math.floor(random() * 100);
      }
    } else if (type === 'object') {
      record[key] = { id: generateId(random), value: Math.floor(random() * 1000) };
    } else if (type.startsWith('array')) {
      const itemType = type.match(/<(\w+)>/)?.[1] || 'string';
      // Arrays get between 2 and 5 items.
      const count = Math.floor(2 + random() * 4);
      record[key] = Array.from({ length: count }, () =>
        itemType === 'string' ? generateTemplateString(key, random) :
        itemType === 'object' ? { id: generateId(random), value: Math.floor(random() * 100) } :
        Math.floor(random() * 1000)
      );
    } else {
      record[key] = `value_${index}_${Math.floor(random() * 1000)}`;
    }
  }
  record._templateId = `tpl_${index}`;
  record._generatedAt = new Date().toISOString();
  return record;
}
/**
 * Generate context-aware string values based on field name.
 *
 * The lower-cased field name is tested against keyword groups in a fixed
 * order; the first group with a matching substring decides the value.
 * Names matching nothing get a generic "value_N" placeholder.
 *
 * @param {string} fieldName - Schema field name.
 * @param {Function} random - Generator returning numbers in [0, 1).
 * @returns {string} Generated value.
 */
function generateTemplateString(fieldName, random) {
  const lowerField = fieldName.toLowerCase();
  const below = (bound) => Math.floor(random() * bound);
  const pick = (list) => list[below(list.length)];

  // [keywords, producer] pairs, checked top to bottom. Producers are lazy,
  // so random draws happen only for the rule that wins.
  const rules = [
    [['id'], () => `id_${below(100000)}`],
    [['name', 'title'], () => generateName(random)],
    [['email'], () => `user${below(10000)}@example.com`],
    [['phone'], () => generatePhone(random)],
    [['url', 'website'], () => `https://example.com/${generateSlug(random)}`],
    [['description', 'content', 'summary'], () => generateSnippet(random)],
    [['approach', 'strategy'], () => pick(['Direct outreach', 'Email campaign', 'Social engagement', 'Referral network', 'Content marketing'])],
    [['insight', 'finding'], () => pick(['High growth potential', 'Active buyer signals', 'Recent funding round', 'Expanding market', 'Technology adoption'])],
    [['style', 'type'], () => pick(['Professional', 'Casual', 'Educational', 'Entertaining', 'Promotional'])]
  ];

  for (const [keywords, produce] of rules) {
    if (keywords.some((word) => lowerField.includes(word))) {
      return produce();
    }
  }
  return `value_${below(1000)}`;
}
/**
 * Convert an interval string such as '15m', '1h' or '2d' to milliseconds.
 *
 * The first "<digits><m|h|d>" occurrence anywhere in the string is used.
 * Strings with no such occurrence fall back to one hour.
 *
 * @param {string} interval - Interval text.
 * @returns {number} Interval length in milliseconds.
 */
function parseInterval(interval) {
  const ONE_HOUR_MS = 3600000;
  const toMs = {
    m: (amount) => amount * 60 * 1000,
    h: (amount) => amount * 60 * 60 * 1000,
    d: (amount) => amount * 24 * 60 * 60 * 1000
  };

  const parts = interval.match(/(\d+)([mhd])/);
  if (!parts) {
    return ONE_HOUR_MS;
  }
  const [, digits, unit] = parts;
  const convert = toMs[unit];
  return convert ? convert(parseInt(digits)) : ONE_HOUR_MS;
}
/**
 * Generate a short base-36 identifier (up to 13 characters).
 *
 * The id is derived from the supplied `random` function, so ids produced
 * during a seeded run are reproducible. Math.random is used only when no
 * function is supplied.
 *
 * @param {Function} [random] - Generator returning numbers in [0, 1).
 * @returns {string} Fractional digits of one random draw, in base 36.
 */
function generateId(random) {
  const next = typeof random === 'function' ? random : Math.random;
  // "0.xxxxx" in base 36: drop the leading "0." and cap the length.
  return next().toString(36).substring(2, 15);
}
/**
 * Build a URL slug of the form "<adjective>-<noun>-<number below 10000>".
 *
 * @param {Function} random - Generator returning numbers in [0, 1).
 * @returns {string} The slug.
 */
function generateSlug(random) {
  const adjectives = ['best', 'top', 'new', 'amazing', 'premium', 'ultra', 'pro', 'max', 'elite', 'smart'];
  const subjects = ['product', 'item', 'deal', 'offer', 'guide', 'review', 'article', 'post'];
  // Three draws, in this order: adjective, noun, numeric suffix.
  const adjective = adjectives[Math.floor(random() * adjectives.length)];
  const subject = subjects[Math.floor(random() * subjects.length)];
  const suffix = Math.floor(random() * 10000);
  return [adjective, subject, suffix].join('-');
}
/**
 * Pick a random "First Last" name from small fixed lists.
 *
 * @param {Function} random - Generator returning numbers in [0, 1).
 * @returns {string} Full name.
 */
function generateName(random) {
  const given = ['John', 'Jane', 'Alex', 'Sarah', 'Mike', 'Emma', 'Chris', 'Lisa', 'David', 'Amy'];
  const family = ['Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Garcia', 'Miller', 'Davis', 'Wilson'];
  // First name is drawn before last name.
  const first = given[Math.floor(random() * given.length)];
  const last = family[Math.floor(random() * family.length)];
  return `${first} ${last}`;
}
/**
 * Build a product name "<Adjective> <Item>" suited to a category.
 * Categories without an entry use the Electronics item list.
 *
 * @param {string} category - Product category name.
 * @param {Function} random - Generator returning numbers in [0, 1).
 * @returns {string} Product name.
 */
function generateProductName(category, random) {
  const qualifiers = ['Premium', 'Ultra', 'Pro', 'Classic', 'Smart', 'Portable', 'Wireless', 'Advanced'];
  const itemsByCategory = {
    'Electronics': ['Headphones', 'Speaker', 'Charger', 'Cable', 'Adapter', 'Mouse', 'Keyboard'],
    'Clothing': ['T-Shirt', 'Jacket', 'Jeans', 'Sneakers', 'Hat', 'Sweater', 'Dress'],
    'Home & Garden': ['Lamp', 'Planter', 'Organizer', 'Tool Set', 'Decoration', 'Rug'],
    'Sports': ['Ball', 'Gloves', 'Bag', 'Mat', 'Weights', 'Bottle', 'Band'],
    'Books': ['Guide', 'Novel', 'Textbook', 'Cookbook', 'Biography', 'Manual'],
    'Toys': ['Figure', 'Game', 'Puzzle', 'Set', 'Doll', 'Car'],
    'Beauty': ['Cream', 'Serum', 'Mask', 'Oil', 'Brush', 'Palette'],
    'Automotive': ['Cover', 'Mat', 'Charger', 'Holder', 'Cleaner', 'Light']
  };
  const candidates = itemsByCategory[category] || itemsByCategory['Electronics'];
  // Adjective is drawn before the item.
  const qualifier = qualifiers[Math.floor(random() * qualifiers.length)];
  const item = candidates[Math.floor(random() * candidates.length)];
  return `${qualifier} ${item}`;
}
/**
 * Return a specification object for a product category.
 *
 * All six random draws are made on every call, whichever category is
 * requested: battery, material, size, two dimensions, weight. Categories
 * without an entry get a generic object.
 *
 * @param {string} category - Product category name.
 * @param {Function} random - Generator returning numbers in [0, 1).
 * @returns {Object} Specification fields for the category.
 */
function generateSpecs(category, random) {
  // Draw order is fixed so the caller's random stream advances identically
  // for every category.
  const batteryMah = Math.floor(1000 + random() * 4000);
  const fabric = random() > 0.5 ? 'Cotton' : 'Polyester';
  const garmentSize = ['S', 'M', 'L', 'XL'][Math.floor(random() * 4)];
  const widthCm = Math.floor(10 + random() * 50);
  const depthCm = Math.floor(10 + random() * 50);
  const weightKg = Math.floor(random() * 10);

  const catalog = {
    'Electronics': { battery: `${batteryMah}mAh`, connectivity: 'Bluetooth 5.0', warranty: '1 year' },
    'Clothing': { material: fabric, size: garmentSize },
    'Home & Garden': { dimensions: `${widthCm}x${depthCm}cm`, weight: `${weightKg}kg` }
  };
  const match = catalog[category];
  if (match) {
    return match;
  }
  return { general: 'Standard specifications' };
}
/**
 * Pick one canned social-media post text.
 *
 * @param {Function} random - Generator returning numbers in [0, 1).
 * @returns {string} Post text.
 */
function generateSocialText(random) {
  const posts = [
    'Just discovered this amazing product! Highly recommend',
    'Working on something exciting today',
    'Can\'t believe how good this turned out',
    'Who else is enjoying this beautiful day?',
    'Sharing my latest project with you all',
    'This is a game changer for productivity',
    'Thoughts on the latest industry trends?'
  ];
  const chosen = Math.floor(random() * posts.length);
  return posts[chosen];
}
/**
 * Pick one hashtag word (without the leading '#').
 *
 * @param {Function} random - Generator returning numbers in [0, 1).
 * @returns {string} Hashtag word.
 */
function generateHashtag(random) {
  const vocabulary = ['tech', 'innovation', 'business', 'startup', 'coding', 'design', 'marketing', 'growth', 'success', 'tips'];
  const chosen = Math.floor(random() * vocabulary.length);
  return vocabulary[chosen];
}
/**
 * Build a small generic object with a name, numeric value, active flag
 * and between one and three tags.
 *
 * @param {Function} random - Generator returning numbers in [0, 1).
 * @returns {Object} Object with name, value, active, tags.
 */
function generateRandomObject(random) {
  // Draw order: name, value, active flag, tag count.
  const name = generateName(random);
  const value = Math.floor(random() * 1000);
  const active = random() > 0.3;
  const tagCount = Math.floor(1 + random() * 3);
  const tags = ['tag1', 'tag2', 'tag3'].slice(0, tagCount);
  return { name, value, active, tags };
}
/**
 * Map an HTTP status code to a short human-readable message.
 *
 * @param {number|string} code - HTTP status code.
 * @returns {string} Message for 400/401/403/404/500, otherwise 'Unknown error'.
 */
function getErrorMessage(code) {
  const byStatus = {
    400: 'Bad Request - Invalid parameters',
    401: 'Unauthorized - Invalid API key',
    403: 'Forbidden - Access denied',
    404: 'Not Found - Resource does not exist',
    500: 'Internal Server Error'
  };
  const known = byStatus[code];
  if (known) {
    return known;
  }
  return 'Unknown error';
}
/**
 * Build a search-result style page title by filling a random template
 * with a random topic.
 *
 * @param {Function} random - Generator returning numbers in [0, 1).
 * @returns {string} Title text.
 */
function generateSearchTitle(random) {
  const patterns = [
    'How to Get Started with {topic}',
    'The Complete Guide to {topic}',
    'Top 10 {topic} Tips for Beginners',
    'Best {topic} Practices in 2024',
    '{topic}: Everything You Need to Know'
  ];
  const subjects = ['Web Scraping', 'Data Analysis', 'API Integration', 'Automation', 'Machine Learning'];
  // The template is drawn before the topic.
  const pattern = patterns[Math.floor(random() * patterns.length)];
  const subject = subjects[Math.floor(random() * subjects.length)];
  return pattern.replace('{topic}', subject);
}
/**
 * Pick one canned descriptive snippet (search-result style blurb).
 *
 * @param {Function} random - Generator returning numbers in [0, 1).
 * @returns {string} Snippet text.
 */
function generateSnippet(random) {
  const blurbs = [
    'Learn how to effectively implement solutions with our comprehensive guide. Discover best practices and expert tips.',
    'This detailed tutorial walks you through step-by-step instructions for achieving optimal results.',
    'Get started quickly with our beginner-friendly approach. No prior experience required.',
    'Explore advanced techniques used by industry professionals to maximize efficiency.',
    'Find out why thousands of users trust our methods for reliable, consistent outcomes.'
  ];
  const chosen = Math.floor(random() * blurbs.length);
  return blurbs[chosen];
}
/**
 * Pick one site-section path segment for a breadcrumb.
 *
 * @param {Function} random - Generator returning numbers in [0, 1).
 * @returns {string} Path segment.
 */
function generateBreadcrumb(random) {
  const sections = ['guides', 'tutorials', 'blog', 'docs', 'resources'];
  const chosen = Math.floor(random() * sections.length);
  return sections[chosen];
}
/**
 * Build a street name such as "Oak St" from a random base name and suffix.
 *
 * @param {Function} random - Generator returning numbers in [0, 1).
 * @returns {string} "<Name> <Suffix>".
 */
function generateStreetName(random) {
  const suffixes = ['St', 'Ave', 'Blvd', 'Dr', 'Ln', 'Way', 'Ct'];
  const baseNames = ['Oak', 'Main', 'Park', 'Cedar', 'Elm', 'Washington', 'Lake', 'Hill'];
  // The base name is drawn before the suffix.
  const base = baseNames[Math.floor(random() * baseNames.length)];
  const suffix = suffixes[Math.floor(random() * suffixes.length)];
  return `${base} ${suffix}`;
}
/**
 * Look up the two-letter US state code for one of the known cities.
 *
 * @param {string} city - City name.
 * @returns {string} State code; 'CA' for cities not in the table.
 */
function getState(city) {
  const citiesByState = {
    NY: ['New York'],
    CA: ['Los Angeles', 'San Diego'],
    IL: ['Chicago'],
    TX: ['Houston', 'Dallas', 'Austin'],
    AZ: ['Phoenix']
  };
  // Invert the grouping into a city -> state table.
  const stateOf = {};
  for (const [abbreviation, cities] of Object.entries(citiesByState)) {
    for (const name of cities) {
      stateOf[name] = abbreviation;
    }
  }
  return stateOf[city] || 'CA';
}
/**
 * Pick between 2 and 6 distinct property features in random order.
 *
 * Selection is a partial Fisher-Yates shuffle. A sort with a random
 * comparator is avoided because such a comparator is inconsistent: the
 * resulting order is biased and depends on the engine's sort algorithm,
 * so seeded runs would not be portable.
 *
 * @param {Function} random - Generator returning numbers in [0, 1).
 * @returns {string[]} Distinct feature names.
 */
function generateRealEstateFeatures(random) {
  const pool = ['Pool', 'Garage', 'Garden', 'Fireplace', 'Central AC', 'Hardwood Floors', 'Updated Kitchen', 'Smart Home', 'Solar Panels', 'Home Office'];
  const count = Math.floor(2 + random() * 5);
  // Only the first `count` slots need shuffling.
  for (let i = 0; i < count; i++) {
    const j = i + Math.floor(random() * (pool.length - i));
    [pool[i], pool[j]] = [pool[j], pool[i]];
  }
  return pool.slice(0, count);
}
/**
 * Build a US-style phone number "(AAA) EEE-NNNN".
 *
 * @param {Function} random - Generator returning numbers in [0, 1).
 * @returns {string} Area code 200-999, exchange 100-999, line number 1000-9999.
 */
function generatePhone(random) {
  // Draw order: area code, exchange, line number.
  const areaCode = Math.floor(200 + random() * 800);
  const exchange = Math.floor(100 + random() * 900);
  const lineNumber = Math.floor(1000 + random() * 9000);
  return `(${areaCode}) ${exchange}-${lineNumber}`;
}
/**
 * Return the fixed placeholder job description.
 *
 * @param {Function} random - Accepted for signature parity; not used.
 * @returns {string} Description text.
 */
function generateJobDescription(random) {
  const description = 'We are looking for a talented professional to join our growing team. You will work on challenging projects and collaborate with cross-functional teams to deliver exceptional results.';
  return description;
}
/**
 * Pick one canned job requirement line.
 *
 * @param {Function} random - Generator returning numbers in [0, 1).
 * @returns {string} Requirement text.
 */
function generateRequirement(random) {
  const requirements = [
    '3+ years of relevant experience',
    'Strong communication skills',
    'Bachelor\'s degree or equivalent',
    'Experience with modern tools',
    'Ability to work independently',
    'Team collaboration experience',
    'Problem-solving mindset'
  ];
  const chosen = Math.floor(random() * requirements.length);
  return requirements[chosen];
}
/**
 * Pick between 3 and 6 distinct job benefits in random order.
 *
 * Selection is a partial Fisher-Yates shuffle. A sort with a random
 * comparator is avoided because such a comparator is inconsistent: the
 * resulting order is biased and depends on the engine's sort algorithm.
 * The count is drawn first, followed by one draw per selected benefit.
 *
 * @param {Function} random - Generator returning numbers in [0, 1).
 * @returns {string[]} Distinct benefit names.
 */
function generateBenefits(random) {
  const pool = ['Health Insurance', '401k Match', 'Remote Work', 'Unlimited PTO', 'Stock Options', 'Learning Budget', 'Gym Membership', 'Free Lunch'];
  const count = Math.floor(3 + random() * 4);
  // Only the first `count` slots need shuffling.
  for (let i = 0; i < count; i++) {
    const j = i + Math.floor(random() * (pool.length - i));
    [pool[i], pool[j]] = [pool[j], pool[i]];
  }
  return pool.slice(0, count);
}
/**
 * Build a headline for a news category by filling a random category
 * template with a random adjective. Categories without an entry use the
 * Technology templates.
 *
 * @param {string} category - News category name.
 * @param {Function} random - Generator returning numbers in [0, 1).
 * @returns {string} Headline text.
 */
function generateNewsTitle(category, random) {
  const headlinePatterns = {
    'Technology': ['New AI Breakthrough Transforms {x}', 'Tech Giants Announce {x} Initiative', 'The Future of {x} is Here'],
    'Business': ['Market Sees Record {x}', 'Company Reports {x} Growth', 'Industry Leaders Discuss {x}'],
    'Politics': ['Government Announces {x} Policy', 'Leaders Meet to Discuss {x}', 'New {x} Legislation Proposed'],
    'Science': ['Scientists Discover {x}', 'New Research Reveals {x}', 'Breakthrough in {x} Studies'],
    'Health': ['Health Experts Recommend {x}', 'New Study Links {x} to Wellness', 'Medical Advances in {x}'],
    'Sports': ['Team Wins {x} Championship', 'Athletes Break {x} Record', 'Sports World Reacts to {x}'],
    'Entertainment': ['Celebrity Announces {x}', 'New {x} Series Premieres', 'Entertainment Industry Embraces {x}']
  };
  const fillers = ['Major', 'Surprising', 'Historic', 'Unprecedented', 'Exciting'];
  const candidates = headlinePatterns[category] || headlinePatterns['Technology'];
  // The template is drawn before the filler word.
  const pattern = candidates[Math.floor(random() * candidates.length)];
  const filler = fillers[Math.floor(random() * fillers.length)];
  return pattern.replace('{x}', filler);
}
/**
 * Return the fixed placeholder article subtitle.
 *
 * @param {Function} random - Accepted for signature parity; not used.
 * @returns {string} Subtitle text.
 */
function generateSubtitle(random) {
  const subtitle = 'Industry experts weigh in on the implications and what it means for the future.';
  return subtitle;
}
/**
 * Return the fixed lorem-ipsum article body.
 *
 * @param {Function} random - Accepted for signature parity; not used.
 * @returns {string} Article text.
 */
function generateArticleContent(random) {
  const body = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris.';
  return body;
}
/**
 * Return the fixed placeholder image caption.
 *
 * @param {Function} random - Accepted for signature parity; not used.
 * @returns {string} Caption text.
 */
function generateCaption(random) {
  const caption = 'Image: Illustration of the main topic covered in this article.';
  return caption;
}
/**
 * Pick one article tag.
 *
 * @param {Function} random - Generator returning numbers in [0, 1).
 * @returns {string} Tag text.
 */
function generateTag(random) {
  const labels = ['trending', 'breaking', 'exclusive', 'analysis', 'opinion', 'featured', 'popular'];
  const chosen = Math.floor(random() * labels.length);
  return labels[chosen];
}
// ============================================
// ENTERPRISE/COMPANY SIMULATORS
// ============================================
/**
 * Generate synthetic stock trade records (OHLCV, quote, order, market data
 * and technical indicators) around a random per-record base price.
 *
 * Every numeric field is an independent random draw; the order of draws
 * follows the order of the object literal below, so reordering fields
 * changes seeded output.
 *
 * @param {number} count - Number of trade records.
 * @param {*} seed - Seed forwarded to createSeededRandom.
 * @returns {Promise<Object[]>} Trade records.
 */
async function generateStockTradingData(count, seed) {
log.info('Generating stock trading data (Bloomberg-style)...');
const random = createSeededRandom(seed);
const results = [];
const symbols = ['AAPL', 'GOOGL', 'MSFT', 'AMZN', 'META', 'NVDA', 'TSLA', 'JPM', 'V', 'WMT', 'UNH', 'JNJ', 'PG', 'HD', 'BAC'];
const exchanges = ['NYSE', 'NASDAQ', 'LSE', 'TSE', 'HKEX'];
const orderTypes = ['market', 'limit', 'stop', 'stop_limit', 'trailing_stop'];
const sides = ['buy', 'sell'];
for (let i = 0; i < count; i++) {
const symbol = symbols[Math.floor(random() * symbols.length)];
// Base price in [50, 550); not tied to the chosen symbol.
const basePrice = 50 + random() * 500;
// Trade time within the last 24 hours.
const timestamp = new Date(Date.now() - random() * 24 * 60 * 60 * 1000);
const volume = Math.floor(100 + random() * 100000);
results.push({
// Id embeds the wall clock, so it differs between runs even with a seed.
tradeId: `TRD${Date.now()}${i}`,
symbol,
exchange: exchanges[Math.floor(random() * exchanges.length)],
timestamp: timestamp.toISOString(),
ohlcv: {
// open and low are drawn independently, so open can fall below low.
open: Math.round(basePrice * (1 - random() * 0.02) * 100) / 100,
high: Math.round(basePrice * (1 + random() * 0.03) * 100) / 100,
low: Math.round(basePrice * (1 - random() * 0.03) * 100) / 100,
// close is the base price rounded to cents.
close: Math.round(basePrice * 100) / 100,
volume,
vwap: Math.round(basePrice * (1 + (random() - 0.5) * 0.01) * 100) / 100
},
quote: {
// Fixed 0.1% either side of the base price; spread is 0.2% of it.
bid: Math.round(basePrice * 0.999 * 100) / 100,
ask: Math.round(basePrice * 1.001 * 100) / 100,
bidSize: Math.floor(100 + random() * 10000),
askSize: Math.floor(100 + random() * 10000),
spread: Math.round(basePrice * 0.002 * 100) / 100
},
order: {
type: orderTypes[Math.floor(random() * orderTypes.length)],
side: sides[Math.floor(random() * sides.length)],
quantity: Math.floor(10 + random() * 1000),
// Drawn independently of quantity and status.
filledQuantity: Math.floor(10 + random() * 1000),
// The second draw is consumed only when the first is <= 0.1.
status: random() > 0.1 ? 'filled' : random() > 0.5 ? 'partial' : 'pending'
},
marketData: {
// String such as "1234B".
marketCap: Math.floor(random() * 3000) + 'B',
peRatio: Math.round((10 + random() * 40) * 10) / 10,
dividendYield: Math.round(random() * 5 * 100) / 100,
beta: Math.round((0.5 + random() * 1.5) * 100) / 100,
// Fixed +/-30% band around the base price.
fiftyTwoWeekHigh: Math.round(basePrice * 1.3 * 100) / 100,
fiftyTwoWeekLow: Math.round(basePrice * 0.7 * 100) / 100
},
analytics: {
rsi: Math.round((20 + random() * 60) * 10) / 10,
macd: Math.round((random() - 0.5) * 10 * 100) / 100,
movingAvg50: Math.round(basePrice * (1 + (random() - 0.5) * 0.1) * 100) / 100,
movingAvg200: Math.round(basePrice * (1 + (random() - 0.5) * 0.15) * 100) / 100
},
scrapedAt: new Date().toISOString()
});
}
return results;
}
/**
 * Generate synthetic healthcare encounter records (patient, encounter,
 * diagnosis, procedures, vitals, billing, provider).
 *
 * Fields are independent random draws made in the order of the object
 * literal below; reordering fields changes seeded output.
 *
 * @param {number} count - Number of records.
 * @param {*} seed - Seed forwarded to createSeededRandom.
 * @returns {Promise<Object[]>} Encounter records.
 */
async function generateMedicalData(count, seed) {
log.info('Generating medical/healthcare data...');
const random = createSeededRandom(seed);
const results = [];
const departments = ['Cardiology', 'Neurology', 'Orthopedics', 'Oncology', 'Pediatrics', 'Emergency', 'Radiology', 'Surgery'];
const diagnoses = ['Hypertension', 'Type 2 Diabetes', 'Chronic Pain', 'Respiratory Infection', 'Anxiety Disorder', 'Cardiac Arrhythmia', 'Migraine', 'Osteoarthritis'];
const procedures = ['Blood Test', 'MRI Scan', 'X-Ray', 'CT Scan', 'Ultrasound', 'ECG', 'Endoscopy', 'Biopsy'];
const insurers = ['Blue Cross', 'Aetna', 'UnitedHealth', 'Cigna', 'Humana', 'Kaiser', 'Medicare', 'Medicaid'];
const statuses = ['admitted', 'discharged', 'outpatient', 'emergency', 'scheduled'];
for (let i = 0; i < count; i++) {
// Admission within the last 365 days.
const admitDate = new Date(Date.now() - random() * 365 * 24 * 60 * 60 * 1000);
// Age in [18, 87].
const age = Math.floor(18 + random() * 70);
results.push({
// Id embeds the wall clock, so it differs between runs even with a seed.
recordId: `MED${Date.now()}${i}`,
patient: {
id: `PAT${Math.floor(random() * 1000000)}`,
age,
gender: random() > 0.5 ? 'M' : 'F',
bloodType: ['A+', 'A-', 'B+', 'B-', 'O+', 'O-', 'AB+', 'AB-'][Math.floor(random() * 8)],
// A single allergy name (a string, not an array) or null.
allergies: random() > 0.7 ? ['Penicillin', 'Sulfa', 'Latex'][Math.floor(random() * 3)] : null
},
encounter: {
type: statuses[Math.floor(random() * statuses.length)],
department: departments[Math.floor(random() * departments.length)],
admitDate: admitDate.toISOString(),
// Up to 7 days after admission, or null.
dischargeDate: random() > 0.3 ? new Date(admitDate.getTime() + random() * 7 * 24 * 60 * 60 * 1000).toISOString() : null,
// Drawn independently of admitDate/dischargeDate.
lengthOfStay: Math.floor(1 + random() * 14)
},
diagnosis: {
primary: diagnoses[Math.floor(random() * diagnoses.length)],
// May repeat the primary diagnosis; null when the first draw is <= 0.5.
secondary: random() > 0.5 ? diagnoses[Math.floor(random() * diagnoses.length)] : null,
// Always "I<10-99>.<0-9>", regardless of the diagnosis chosen.
icdCode: `I${Math.floor(10 + random() * 90)}.${Math.floor(random() * 10)}`,
severity: ['mild', 'moderate', 'severe', 'critical'][Math.floor(random() * 4)]
},
// 1 to 3 procedures, each dated within 3 days after admission.
procedures: Array.from({ length: Math.floor(1 + random() * 3) }, () => ({
name: procedures[Math.floor(random() * procedures.length)],
cptCode: `${Math.floor(10000 + random() * 90000)}`,
date: new Date(admitDate.getTime() + random() * 3 * 24 * 60 * 60 * 1000).toISOString(),
result: random() > 0.1 ? 'normal' : 'abnormal'
})),
vitals: {
bloodPressure: `${Math.floor(100 + random() * 60)}/${Math.floor(60 + random() * 40)}`,
heartRate: Math.floor(60 + random() * 40),
temperature: Math.round((97 + random() * 4) * 10) / 10,
oxygenSaturation: Math.floor(94 + random() * 6),
// NOTE(review): ranges look like pounds and inches - units are not stated in code; confirm.
weight: Math.floor(120 + random() * 150),
height: Math.floor(60 + random() * 20)
},
billing: {
insurer: insurers[Math.floor(random() * insurers.length)],
policyNumber: `POL${Math.floor(random() * 10000000)}`,
// The three amounts are independent draws; they need not add up.
totalCharges: Math.floor(1000 + random() * 50000),
covered: Math.floor(800 + random() * 40000),
patientResponsibility: Math.floor(100 + random() * 5000),
// The second draw is consumed only when the first is <= 0.2.
claimStatus: random() > 0.2 ? 'approved' : random() > 0.5 ? 'pending' : 'denied'
},
provider: {
physician: generateName(random),
// 10-digit number rendered as a string.
npi: `${Math.floor(1000000000 + random() * 9000000000)}`,
facility: `${['Metro', 'Central', 'Regional', 'University'][Math.floor(random() * 4)]} Medical Center`
},
scrapedAt: new Date().toISOString()
});
}
return results;
}
/**
 * Generate Crunchbase-style company data using Gemini Grounding API
 * Uses Google Search grounding for real, up-to-date company information
 *
 * Without an API key this delegates to generateCompanyData. Otherwise each
 * target company is researched with one Gemini request. Requests are spaced
 * 4.1 s apart for the free-tier rate limit, with no wait after the last
 * company actually processed. A company whose request fails, or whose reply
 * contains no JSON object, yields a fallback record with
 * `groundingUsed: false` and the error message.
 *
 * Relies on the file-level `GoogleGenerativeAI` import.
 *
 * @param {number} count - Maximum number of companies to research.
 * @param {string} apiKey - Gemini API key; falsy triggers the synthetic fallback.
 * @param {string[]} [companyNames=[]] - Companies to research; when empty a list is requested from the model.
 * @param {string|null} [industry=null] - Optional industry filter for the generated list.
 * @returns {Promise<Object[]>} One record per researched company.
 */
async function generateCrunchbaseData(count, apiKey, companyNames = [], industry = null) {
  log.info('Generating Crunchbase-style data with Gemini Grounding...', { count, industry });
  const results = [];
  if (!apiKey) {
    log.warning('No Gemini API key - falling back to synthetic company data');
    return generateCompanyData(count, 'crunchbase-fallback');
  }
  const genAI = new GoogleGenerativeAI(apiKey);
  // Use Gemini 2.0 Flash with Google Search grounding
  const model = genAI.getGenerativeModel({
    model: 'gemini-2.0-flash-exp',
    tools: [{ google_search: {} }]
  });
  // Generate company names if not provided
  const targetCompanies = companyNames.length > 0 ? companyNames : await generateCompanyList(model, count, industry);
  // Actual number of iterations; the rate-limit pause must be keyed on this,
  // not on `count`, or we would sleep after the final request.
  const total = Math.min(count, targetCompanies.length);
  for (let i = 0; i < total; i++) {
    const companyName = targetCompanies[i];
    try {
      const prompt = `Research "${companyName}" company and provide current information in this exact JSON format:
{
"name": "Official company name",
"description": "Brief company description (1-2 sentences)",
"founded": 2010,
"founders": ["Founder Name 1", "Founder Name 2"],
"headquarters": {"city": "City", "state": "State", "country": "Country"},
"industry": "Primary industry",
"subIndustry": "Sub-industry or sector",
"employeeCount": "Range like 1001-5000 or exact number",
"fundingTotal": "$X million/billion or 'Private/Not disclosed'",
"lastFundingRound": {"type": "Series X or IPO", "amount": "$X", "date": "YYYY-MM"},
"valuation": "$X billion or 'Private'",
"revenue": "$X million/billion or 'Not disclosed'",
"website": "https://company.com",
"linkedIn": "linkedin.com/company/name",
"ceo": "CEO Name",
"publicStatus": "Public (NASDAQ:TICK)" or "Private",
"competitors": ["Competitor 1", "Competitor 2"],
"keyProducts": ["Product 1", "Product 2"],
"recentNews": "Brief recent news (1 sentence)"
}
Only return valid JSON, no markdown or explanation.`;
      const result = await model.generateContent(prompt);
      const text = result.response.text();
      // Extract JSON from response (first '{' through last '}')
      const jsonMatch = text.match(/\{[\s\S]*\}/);
      if (!jsonMatch) {
        // Route through the catch below so the company still gets a fallback record.
        throw new Error('No JSON object found in model response');
      }
      const companyData = JSON.parse(jsonMatch[0]);
      results.push({
        id: `crunchbase_${Date.now()}_${i}`,
        type: 'crunchbase',
        data: {
          ...companyData,
          dataSource: 'gemini-grounding',
          groundingUsed: true,
          lastUpdated: new Date().toISOString()
        },
        metadata: {
          query: companyName,
          generatedAt: new Date().toISOString(),
          provider: 'gemini',
          model: 'gemini-2.0-flash-exp',
          grounded: true
        }
      });
      log.info(`Grounded data for: ${companyName}`);
    } catch (e) {
      log.warning(`Failed to get grounded data for ${companyName}: ${e.message}`);
      // Add fallback synthetic data
      results.push({
        id: `crunchbase_${Date.now()}_${i}`,
        type: 'crunchbase',
        data: {
          name: companyName,
          description: 'Company information not available',
          dataSource: 'fallback',
          groundingUsed: false,
          error: e.message
        },
        metadata: {
          query: companyName,
          generatedAt: new Date().toISOString(),
          grounded: false
        }
      });
    }
    // Rate limiting - 15 RPM for Gemini free tier
    if (i < total - 1) {
      await new Promise(r => setTimeout(r, 4100));
    }
  }
  return results;
}
/**
 * Generate a list of companies to research using Gemini Grounding.
 *
 * The model is asked for at most 20 names, one per line. Each returned line is
 * trimmed and a leading list marker ("- ", "* ", "1. ", "2) ") is removed,
 * because models often add them despite the prompt. Blank lines are dropped.
 * If the request fails, a fixed list of well-known companies is used instead.
 *
 * @param {object} model - Object exposing `generateContent(prompt)` whose result
 *   provides `response.text()`.
 * @param {number} count - Maximum number of names to return.
 * @param {string|null} [industry=null] - Optional industry to restrict the list to.
 * @returns {Promise<string[]>} Up to `count` company names.
 */
async function generateCompanyList(model, count, industry = null) {
  const industryFilter = industry ? ` in the ${industry} industry` : '';
  const prompt = `List ${Math.min(count, 20)} notable startup and tech companies${industryFilter} that are frequently covered on Crunchbase.
Include a mix of:
- Unicorns (valued over $1B)
- Recently funded startups
- Established tech companies
Return only company names, one per line, no numbering or bullets.`;
  try {
    const result = await model.generateContent(prompt);
    const text = result.response.text();
    return text
      .split('\n')
      // The marker must be followed by whitespace, so names such as "3M" or
      // "1Password" are left untouched.
      .map(line => line.replace(/^\s*(?:[-*\u2022]|\d+[.)])\s+/, '').trim())
      .filter(name => name.length > 0)
      .slice(0, count);
  } catch (e) {
    log.warning(`Failed to generate company list: ${e.message}`);
    // Fallback to well-known companies
    return ['OpenAI', 'Anthropic', 'Stripe', 'SpaceX', 'Databricks', 'Figma', 'Notion', 'Discord', 'Canva', 'Airtable'].slice(0, count);
  }
}
/**
 * Generate synthetic company/corporate records.
 *
 * All random fields come from one stream created by createSeededRandom(seed),
 * so the order of draws below determines the values produced for a seed.
 * `companyId` and `scrapedAt` are derived from the current time.
 *
 * Headquarters city, state and timezone are taken from one table entry so
 * they always agree, and an executive's `since` year is never earlier than
 * the company's `founded` year.
 *
 * @param {number} count - Number of records to generate.
 * @param {*} seed - Seed forwarded to createSeededRandom.
 * @returns {Promise<object[]>} Generated company records.
 */
async function generateCompanyData(count, seed) {
  log.info('Generating company/corporate data...');
  const random = createSeededRandom(seed);

  // Small helpers over the seeded stream; each consumes exactly one draw.
  const pick = (options) => options[Math.floor(random() * options.length)];
  const letter = () => String.fromCharCode(65 + Math.floor(random() * 26));
  const round1 = (value) => Math.round(value * 10) / 10;
  const round2 = (value) => Math.round(value * 100) / 100;

  const industries = ['Technology', 'Healthcare', 'Finance', 'Manufacturing', 'Retail', 'Energy', 'Telecommunications', 'Transportation'];
  const companyTypes = ['Corporation', 'LLC', 'Partnership', 'Sole Proprietorship', 'S-Corp', 'Non-Profit'];
  const departments = ['Engineering', 'Sales', 'Marketing', 'Finance', 'HR', 'Operations', 'Legal', 'R&D'];
  const nameSuffixes = ['Industries', 'Corp', 'Inc', 'Holdings', 'Group', 'Technologies', 'Solutions'];
  const fiscalYearEnds = ['December', 'March', 'June', 'September'];
  const executiveTitles = ['CEO', 'CFO', 'CTO', 'COO', 'CMO', 'CHRO', 'CLO', 'CIO'];
  // City, state and IANA timezone kept together so they cannot be mismatched.
  const headquartersOptions = [
    { city: 'New York', state: 'NY', timezone: 'America/New_York' },
    { city: 'San Francisco', state: 'CA', timezone: 'America/Los_Angeles' },
    { city: 'Chicago', state: 'IL', timezone: 'America/Chicago' },
    { city: 'Boston', state: 'MA', timezone: 'America/New_York' },
    { city: 'Austin', state: 'TX', timezone: 'America/Chicago' },
    { city: 'Seattle', state: 'WA', timezone: 'America/Los_Angeles' }
  ];

  const results = [];
  for (let i = 0; i < count; i++) {
    const founded = Math.floor(1950 + random() * 74);
    const employees = Math.floor(10 + random() * 100000);
    const revenue = Math.floor(100000 + random() * 50000000000);
    const companyId = `COM${Date.now()}${i}`;

    const profile = {
      // Second space-separated token of generateName's result (presumably a surname).
      name: `${generateName(random).split(' ')[1]} ${pick(nameSuffixes)}`,
      // Half of the companies get a four-letter ticker, the rest none.
      ticker: random() > 0.5 ? `${letter()}${letter()}${letter()}${letter()}` : null,
      type: pick(companyTypes),
      industry: pick(industries),
      founded,
      website: `https://example-company-${i}.com`,
      description: 'Leading provider of innovative solutions for modern enterprises.'
    };

    const address = `${Math.floor(100 + random() * 9900)} Corporate Blvd`;
    const location = pick(headquartersOptions);
    // The state used to be drawn independently of the city. Discard that draw
    // so every later field keeps the value it previously had for a given seed.
    random();
    const headquarters = {
      address,
      city: location.city,
      state: location.state,
      country: 'USA',
      timezone: location.timezone
    };

    const financials = {
      revenue,
      revenueGrowth: round1(random() * 40 - 10),
      netIncome: Math.floor(revenue * (0.05 + random() * 0.15)),
      grossMargin: round1(30 + random() * 40),
      operatingMargin: round1(10 + random() * 25),
      debtToEquity: round2(random() * 2),
      currentRatio: round2(1 + random() * 2),
      fiscalYearEnd: pick(fiscalYearEnds)
    };

    const workforce = {
      totalEmployees: employees,
      fullTime: Math.floor(employees * 0.85),
      partTime: Math.floor(employees * 0.1),
      contractors: Math.floor(employees * 0.05),
      // Always a prefix of the department list, three to seven entries long.
      departments: departments.slice(0, Math.floor(3 + random() * 5)).map((name) => ({
        name,
        headcount: Math.floor(employees * (0.05 + random() * 0.2)),
        budget: Math.floor(revenue * (0.01 + random() * 0.1))
      })),
      avgTenure: round1(2 + random() * 8),
      turnoverRate: round1(5 + random() * 20)
    };

    const leadership = Array.from({ length: Math.floor(3 + random() * 5) }, () => ({
      name: generateName(random),
      title: pick(executiveTitles),
      // Clamp so nobody holds a title before the company existed.
      since: Math.max(founded, Math.floor(2010 + random() * 14)),
      compensation: Math.floor(500000 + random() * 10000000)
    }));

    const metrics = {
      customerCount: Math.floor(100 + random() * 1000000),
      nps: Math.floor(-20 + random() * 100),
      marketShare: round1(random() * 30),
      brandValue: `${Math.floor(random() * 50)}B`
    };

    results.push({
      companyId,
      profile,
      headquarters,
      financials,
      workforce,
      leadership,
      metrics,
      scrapedAt: new Date().toISOString()
    });
  }
  return results;
}
/**
 * Produce synthetic supply-chain shipment records.
 *
 * Every random field is drawn from the stream returned by
 * createSeededRandom(seed); the draws happen in the order the fields appear
 * below. `shipmentId`, `scrapedAt` and all dates depend on the current time.
 *
 * @param {number} count - Number of shipment records to create.
 * @param {*} seed - Seed forwarded to createSeededRandom.
 * @returns {Promise<object[]>} Generated shipment records.
 */
async function generateSupplyChainData(count, seed) {
  log.info('Generating supply chain data...');
  const random = createSeededRandom(seed);

  // One-draw helpers over the seeded stream.
  const pick = (options) => options[Math.floor(random() * options.length)];
  const intFrom = (base, span) => Math.floor(base + random() * span);
  const below = (limit) => Math.floor(random() * limit);
  const round1 = (value) => Math.round(value * 10) / 10;
  const round2 = (value) => Math.round(value * 100) / 100;

  const productCategories = ['Electronics', 'Raw Materials', 'Components', 'Finished Goods', 'Packaging', 'Chemicals', 'Textiles', 'Machinery'];
  const statuses = ['in_transit', 'delivered', 'pending', 'delayed', 'customs_hold', 'processing', 'shipped', 'cancelled'];
  const transportModes = ['air', 'sea', 'rail', 'truck', 'multimodal'];
  const warehouses = ['WH-NYC-01', 'WH-LAX-02', 'WH-CHI-03', 'WH-HOU-04', 'WH-SEA-05', 'WH-MIA-06'];
  const countries = ['USA', 'China', 'Germany', 'Japan', 'Mexico', 'Vietnam', 'India', 'South Korea'];
  const priorities = ['standard', 'express', 'critical'];
  const carriers = ['FedEx', 'UPS', 'DHL', 'Maersk', 'Expeditors', 'DB Schenker'];
  const certificateOptions = ['ISO 9001', 'CE', 'RoHS'];

  const shipments = [];
  for (let index = 0; index < count; index += 1) {
    // Order placed at some point within the last 90 days.
    const orderDate = new Date(Date.now() - random() * 90 * 24 * 60 * 60 * 1000);
    const orderedAt = orderDate.getTime();
    const orderedAtIso = orderDate.toISOString();
    const quantity = intFrom(10, 10000);
    const unitPrice = round2(1 + random() * 500);
    // Unrounded goods value; every cost line is scaled from it.
    const goodsValue = quantity * unitPrice;

    shipments.push({
      shipmentId: `SHP${Date.now()}${index}`,
      order: {
        orderId: `ORD${below(10000000)}`,
        orderDate: orderedAtIso,
        priority: pick(priorities),
        status: pick(statuses)
      },
      product: {
        sku: `SKU-${intFrom(100000, 900000)}`,
        // The category in the name is drawn separately from `category` below.
        name: `${pick(productCategories)} Item ${below(1000)}`,
        category: pick(productCategories),
        quantity,
        unitPrice,
        totalValue: round2(goodsValue),
        weight: round1(0.1 + random() * 100),
        dimensions: {
          length: intFrom(10, 100),
          width: intFrom(10, 100),
          height: intFrom(10, 50)
        }
      },
      supplier: {
        id: `SUP${below(10000)}`,
        name: `${generateName(random).split(' ')[1]} Supply Co`,
        country: pick(countries),
        leadTime: intFrom(7, 60),
        rating: round1(3 + random() * 2),
        onTimeDelivery: round1(70 + random() * 30)
      },
      logistics: {
        carrier: pick(carriers),
        mode: pick(transportModes),
        trackingNumber: `TRK${below(1000000000000)}`,
        origin: {
          facility: pick(warehouses),
          country: pick(countries),
          departureDate: orderedAtIso
        },
        destination: {
          facility: pick(warehouses),
          country: pick(countries),
          // Arrival 7 to 37 days after the order date.
          eta: new Date(orderedAt + (7 + random() * 30) * 24 * 60 * 60 * 1000).toISOString()
        },
        currentLocation: {
          lat: 25 + random() * 25,
          lng: -120 + random() * 60,
          lastUpdate: new Date(orderedAt + random() * 7 * 24 * 60 * 60 * 1000).toISOString()
        }
      },
      inventory: {
        warehouse: pick(warehouses),
        stockLevel: below(5000),
        reorderPoint: intFrom(100, 500),
        safetyStock: intFrom(50, 200),
        daysOfSupply: intFrom(10, 90)
      },
      costs: {
        productCost: round2(goodsValue),
        shippingCost: round2(goodsValue * (0.05 + random() * 0.15)),
        tariffs: round2(goodsValue * random() * 0.1),
        insurance: round2(goodsValue * 0.02),
        totalLandedCost: round2(goodsValue * (1.1 + random() * 0.2))
      },
      compliance: {
        hsCode: `${intFrom(1000, 9000)}.${intFrom(10, 90)}`,
        countryOfOrigin: pick(countries),
        // A single certificate name or null; the second draw only happens
        // when the first one passes.
        certificates: random() > 0.5 ? pick(certificateOptions) : null,
        customsCleared: random() > 0.3
      },
      scrapedAt: new Date().toISOString()
    });
  }
  return shipments;
}
/**
 * Generate synthetic financial-services transaction records.
 *
 * Random fields are drawn from the stream returned by
 * createSeededRandom(seed) in the order they appear below. `transactionId`,
 * `scrapedAt` and the transaction date depend on the current time.
 *
 * `analytics.dayOfWeek` and `analytics.hourOfDay` are computed in UTC so they
 * always agree with `transaction.date`, which is a UTC ISO-8601 string,
 * whatever timezone the host runs in.
 *
 * @param {number} count - Number of transaction records to generate.
 * @param {*} seed - Seed forwarded to createSeededRandom.
 * @returns {Promise<object[]>} Generated transaction records.
 */
async function generateFinancialData(count, seed) {
  log.info('Generating financial services data...');
  const random = createSeededRandom(seed);

  // One-draw helpers over the seeded stream.
  const pick = (options) => options[Math.floor(random() * options.length)];
  const intFrom = (base, span) => Math.floor(base + random() * span);
  const round2 = (value) => Math.round(value * 100) / 100;
  // Second space-separated token of generateName's result (presumably a surname).
  const surname = () => generateName(random).split(' ')[1];
  // 95% completed; the remainder is split evenly between pending and failed.
  // The second draw only happens for the non-completed 5%.
  const drawStatus = () => {
    if (random() > 0.05) return 'completed';
    return random() > 0.5 ? 'pending' : 'failed';
  };

  const accountTypes = ['checking', 'savings', 'investment', 'retirement', 'credit', 'loan', 'mortgage'];
  const transactionTypes = ['debit', 'credit', 'transfer', 'payment', 'withdrawal', 'deposit', 'fee', 'interest'];
  const categories = ['groceries', 'utilities', 'entertainment', 'dining', 'travel', 'shopping', 'healthcare', 'insurance', 'investment'];
  const institutions = ['Chase', 'Bank of America', 'Wells Fargo', 'Citi', 'Capital One', 'Goldman Sachs', 'Morgan Stanley', 'Fidelity'];
  const merchantSuffixes = ['Store', 'Shop', 'Market', 'Services'];
  const cardBrands = ['Visa', 'Mastercard', 'Amex', 'Discover'];
  const fraudRules = ['unusual_location', 'high_amount', 'velocity_check'];

  const results = [];
  for (let i = 0; i < count; i++) {
    // Transaction happened at some point within the last 365 days.
    const transactionDate = new Date(Date.now() - random() * 365 * 24 * 60 * 60 * 1000);
    const amount = round2(1 + random() * 10000);

    results.push({
      transactionId: `TXN${Date.now()}${i}`,
      account: {
        accountId: `ACC${Math.floor(random() * 100000000)}`,
        type: pick(accountTypes),
        institution: pick(institutions),
        balance: round2(1000 + random() * 500000),
        availableCredit: random() > 0.5 ? round2(5000 + random() * 50000) : null,
        interestRate: round2(random() * 25)
      },
      transaction: {
        type: pick(transactionTypes),
        amount,
        currency: 'USD',
        date: transactionDate.toISOString(),
        description: `${pick(categories).toUpperCase()} - ${surname()} Store`,
        category: pick(categories),
        status: drawStatus(),
        merchant: {
          name: `${surname()} ${pick(merchantSuffixes)}`,
          category: pick(categories),
          mcc: `${intFrom(1000, 9000)}`
        }
      },
      // Roughly 70% of transactions carry card details.
      card: random() > 0.3 ? {
        last4: `${intFrom(1000, 9000)}`,
        brand: pick(cardBrands),
        expiryMonth: intFrom(1, 12),
        expiryYear: intFrom(2025, 5)
      } : null,
      fraud: {
        score: Math.round(random() * 100),
        flagged: random() > 0.95,
        // A single rule name or null.
        rules: random() > 0.9 ? pick(fraudRules) : null
      },
      analytics: {
        // UTC getters: local-time getters would disagree with the ISO `date`
        // above whenever the process timezone is not UTC.
        dayOfWeek: transactionDate.getUTCDay(),
        hourOfDay: transactionDate.getUTCHours(),
        isRecurring: random() > 0.7,
        monthlyAverage: round2(100 + random() * 2000)
      },
      scrapedAt: new Date().toISOString()
    });
  }
  return results;
}
/**
 * Generate synthetic Bloomberg terminal-style security records.
 *
 * Random fields are drawn from the stream returned by
 * createSeededRandom(seed) in the order the sections are built below.
 * `terminalId`, `scrapedAt` and all dates depend on the current time.
 *
 * The day's `high` and `low` are widened where necessary so they always
 * bracket `open`, `close` and `last`. Analyst buy/hold/sell counts always sum
 * to `numAnalysts`, and the recommendation is derived from the buy share.
 *
 * @param {number} count - Number of records to generate.
 * @param {*} seed - Seed forwarded to createSeededRandom.
 * @returns {Promise<object[]>} Generated security records.
 */
async function generateBloombergData(count, seed) {
  log.info('Generating Bloomberg terminal-style data...');
  const random = createSeededRandom(seed);

  // One-draw helpers over the seeded stream.
  const pick = (options) => options[Math.floor(random() * options.length)];
  const intFrom = (base, span) => Math.floor(base + random() * span);
  const letter = () => String.fromCharCode(65 + Math.floor(random() * 26));
  const round1 = (value) => Math.round(value * 10) / 10;
  const round2 = (value) => Math.round(value * 100) / 100;
  // Second space-separated token of generateName's result (presumably a surname).
  const surname = () => generateName(random).split(' ')[1];

  const assetClasses = ['equity', 'fixed_income', 'commodity', 'fx', 'derivative', 'crypto'];
  const sectors = ['Technology', 'Healthcare', 'Financials', 'Consumer', 'Energy', 'Industrials', 'Materials', 'Utilities'];
  const ratings = ['AAA', 'AA+', 'AA', 'AA-', 'A+', 'A', 'A-', 'BBB+', 'BBB', 'BBB-', 'BB+', 'BB', 'B', 'CCC'];
  const newsCategories = ['earnings', 'merger', 'regulatory', 'analyst_upgrade', 'analyst_downgrade', 'dividend', 'lawsuit', 'executive'];

  const results = [];
  for (let i = 0; i < count; i++) {
    const timestamp = new Date(Date.now() - random() * 24 * 60 * 60 * 1000);
    const basePrice = 10 + random() * 500;
    const terminalId = `BBG${Date.now()}${i}`;

    const security = {
      ticker: `${letter()}${letter()}${letter()}${letter()}`,
      name: `${surname()} ${pick(['Corp', 'Inc', 'Ltd', 'Holdings', 'Group'])}`,
      assetClass: pick(assetClasses),
      sector: pick(sectors),
      country: pick(['US', 'GB', 'JP', 'DE', 'CN', 'FR', 'CA', 'AU']),
      currency: pick(['USD', 'EUR', 'GBP', 'JPY', 'CNY']),
      isin: `US${intFrom(1000000000, 9000000000)}`,
      cusip: `${intFrom(100000000, 900000000)}`
    };

    // Draw open/high/low/close in their original order, then widen the range
    // so that low <= open, close, last <= high always holds.
    const last = round2(basePrice);
    const open = round2(basePrice * (1 - random() * 0.02));
    const drawnHigh = round2(basePrice * (1 + random() * 0.03));
    const drawnLow = round2(basePrice * (1 - random() * 0.03));
    const close = round2(basePrice * (1 + (random() - 0.5) * 0.02));
    const pricing = {
      last,
      bid: round2(basePrice * 0.999),
      ask: round2(basePrice * 1.001),
      open,
      high: Math.max(drawnHigh, last, open, close),
      low: Math.min(drawnLow, last, open, close),
      close,
      change: round2((random() - 0.5) * 10),
      changePercent: round2((random() - 0.5) * 5),
      volume: Math.floor(random() * 50000000),
      avgVolume: Math.floor(random() * 30000000)
    };

    const fundamentals = {
      marketCap: `${Math.floor(random() * 3000)}B`,
      enterpriseValue: `${Math.floor(random() * 3500)}B`,
      peRatio: round1(5 + random() * 50),
      forwardPe: round1(5 + random() * 40),
      pbRatio: round1(0.5 + random() * 10),
      evEbitda: round1(5 + random() * 30),
      debtToEquity: round2(random() * 3),
      roe: round1(5 + random() * 30),
      eps: round2(random() * 20),
      dividend: round2(random() * 5),
      payoutRatio: round1(20 + random() * 60)
    };

    const credit = {
      rating: pick(ratings),
      outlook: pick(['positive', 'stable', 'negative']),
      agency: pick(['S&P', 'Moody\'s', 'Fitch']),
      spread: Math.round(50 + random() * 500),
      cds: Math.round(20 + random() * 300)
    };

    const analytics = {
      beta: round2(0.5 + random() * 1.5),
      sharpeRatio: round2(random() * 3),
      volatility: round1(10 + random() * 40),
      correlation: round2(random() * 2 - 1),
      var95: round2(random() * 10),
      maxDrawdown: round1(5 + random() * 30)
    };

    // Generate consistent analyst ratings: sell takes a share of what buy
    // left over, and hold absorbs the remainder so the counts always sum up.
    const numAnalysts = intFrom(5, 40);
    const buyPct = random();
    const sellPct = random() * (1 - buyPct);
    const buyRatings = Math.floor(numAnalysts * buyPct);
    const sellRatings = Math.floor(numAnalysts * sellPct);
    const holdRatings = numAnalysts - buyRatings - sellRatings;
    // Derive recommendation from actual ratings
    const buyScore = buyRatings / numAnalysts;
    let recommendation;
    if (buyScore > 0.7) recommendation = 'strong_buy';
    else if (buyScore > 0.5) recommendation = 'buy';
    else if (buyScore > 0.3) recommendation = 'hold';
    else if (buyScore > 0.15) recommendation = 'sell';
    else recommendation = 'strong_sell';
    const consensus = {
      recommendation,
      targetPrice: round2(basePrice * (1 + (random() - 0.3) * 0.5)),
      numAnalysts,
      buyRatings,
      holdRatings,
      sellRatings
    };

    const news = {
      headline: `${surname()} Corp ${pick(newsCategories).replace('_', ' ')} update`,
      source: pick(['Reuters', 'Bloomberg', 'WSJ', 'FT', 'CNBC']),
      timestamp: timestamp.toISOString(),
      sentiment: pick(['positive', 'neutral', 'negative']),
      relevance: Math.round(random() * 100)
    };

    const events = {
      nextEarnings: new Date(Date.now() + random() * 90 * 24 * 60 * 60 * 1000).toISOString().split('T')[0],
      exDividendDate: random() > 0.5 ? new Date(Date.now() + random() * 30 * 24 * 60 * 60 * 1000).toISOString().split('T')[0] : null,
      annualMeeting: new Date(Date.now() + random() * 180 * 24 * 60 * 60 * 1000).toISOString().split('T')[0]
    };

    results.push({
      terminalId,
      security,
      pricing,
      fundamentals,
      credit,
      analytics,
      consensus,
      news,
      events,
      scrapedAt: new Date().toISOString()
    });
  }
  return results;
}
/**
 * Generate synthetic ZoomInfo-style B2B enrichment records (company, contact,
 * technographics, intent signals, org chart and data-quality sections).
 *
 * Random fields are drawn from the stream returned by
 * createSeededRandom(seed) in the order they appear below. `recordId`,
 * `scrapedAt` and all dates depend on the current time.
 *
 * `employeeRange` uses inclusive upper bounds that match its labels (50
 * employees is '1-50'), and headquarters city and state are taken from one
 * table entry so they always agree.
 *
 * @param {number} count - Number of records to generate.
 * @param {*} seed - Seed forwarded to createSeededRandom.
 * @returns {Promise<object[]>} Generated enrichment records.
 */
async function generateZoomInfoData(count, seed) {
  log.info('Generating ZoomInfo-style B2B enrichment data...');
  const random = createSeededRandom(seed);

  // Helpers over the seeded stream. pick/intFrom/below consume one draw each.
  const pick = (options) => options[Math.floor(random() * options.length)];
  const intFrom = (base, span) => Math.floor(base + random() * span);
  const below = (limit) => Math.floor(random() * limit);
  // Second space-separated token of generateName's result (presumably a surname).
  const surname = () => generateName(random).split(' ')[1];
  const listOf = (length, build) => Array.from({ length }, build);
  const phoneNumber = () => `+1-${intFrom(200, 800)}-${intFrom(100, 900)}-${intFrom(1000, 9000)}`;

  const industries = ['Software', 'Healthcare', 'Financial Services', 'Manufacturing', 'Retail', 'Telecommunications', 'Professional Services', 'Real Estate'];
  const departments = ['Engineering', 'Sales', 'Marketing', 'Finance', 'Operations', 'Product', 'HR', 'Customer Success', 'Legal', 'IT'];
  const seniority = ['C-Level', 'VP', 'Director', 'Manager', 'Individual Contributor', 'Entry Level'];
  const technologies = ['Salesforce', 'AWS', 'Microsoft Azure', 'Google Cloud', 'HubSpot', 'SAP', 'Oracle', 'Workday', 'Tableau', 'Snowflake', 'MongoDB', 'PostgreSQL'];
  const fundingStages = ['Seed', 'Series A', 'Series B', 'Series C', 'Series D+', 'IPO', 'Acquired', 'Bootstrapped'];
  const intentSignals = ['product_research', 'competitor_analysis', 'pricing_page_visit', 'demo_request', 'content_download', 'job_posting', 'technology_install', 'budget_approval'];
  // City and state kept together so they cannot be mismatched.
  const headquartersHubs = [
    { city: 'San Francisco', state: 'CA' },
    { city: 'New York', state: 'NY' },
    { city: 'Boston', state: 'MA' },
    { city: 'Austin', state: 'TX' },
    { city: 'Seattle', state: 'WA' },
    { city: 'Chicago', state: 'IL' },
    { city: 'Denver', state: 'CO' }
  ];

  // Bucket labels; upper bounds are inclusive so 50 falls in '1-50', 200 in '51-200', etc.
  const employeeRangeFor = (headcount) => {
    if (headcount <= 50) return '1-50';
    if (headcount <= 200) return '51-200';
    if (headcount <= 1000) return '201-1000';
    if (headcount <= 5000) return '1001-5000';
    return '5000+';
  };
  const revenueRangeFor = (millions) => {
    if (millions < 10) return '$1M-$10M';
    if (millions < 50) return '$10M-$50M';
    if (millions < 200) return '$50M-$200M';
    if (millions < 1000) return '$200M-$1B';
    return '$1B+';
  };
  // Only the C-Level and individual/entry branches consume a draw.
  const jobTitleFor = (level, dept) => {
    switch (level) {
      case 'C-Level':
        return pick(['CEO', 'CTO', 'CFO', 'COO', 'CMO']);
      case 'VP':
        return `VP of ${dept}`;
      case 'Director':
        return `Director of ${dept}`;
      case 'Manager':
        return `${dept} Manager`;
      default:
        return `${dept} ${pick(['Specialist', 'Analyst', 'Associate'])}`;
    }
  };
  const drawHeadquarters = () => {
    const street = `${intFrom(100, 9900)} ${pick(['Main', 'Market', 'Broadway', 'Park', 'Tech'])} St`;
    const hub = pick(headquartersHubs);
    // The state used to be drawn independently of the city. Discard that draw
    // so every later field keeps the value it previously had for a given seed.
    random();
    return {
      street,
      city: hub.city,
      state: hub.state,
      country: 'USA',
      postalCode: String(intFrom(10000, 90000))
    };
  };

  const results = [];
  for (let i = 0; i < count; i++) {
    const companyName = `${surname()} ${pick(['Corp', 'Inc', 'Solutions', 'Technologies', 'Systems', 'Group'])}`;
    // Domain keeps only the lowercase letters of the company name.
    const domain = `${companyName.toLowerCase().replace(/[^a-z]/g, '')}.com`;
    const employees = intFrom(10, 50000);
    const revenueM = intFrom(1, 5000);
    // First and last name come from two separate generateName calls.
    const firstName = generateName(random).split(' ')[0];
    const lastName = surname();
    const dept = pick(departments);
    const level = pick(seniority);

    results.push({
      recordId: `ZI${Date.now()}${i}`,
      company: {
        name: companyName,
        domain,
        industry: pick(industries),
        subIndustry: `${pick(industries)} - ${pick(['Enterprise', 'Mid-Market', 'SMB'])}`,
        employees,
        employeeRange: employeeRangeFor(employees),
        revenue: `$${revenueM}M`,
        revenueRange: revenueRangeFor(revenueM),
        founded: intFrom(1970, 50),
        headquarters: drawHeadquarters(),
        phone: phoneNumber(),
        website: `https://${domain}`,
        description: `Leading provider of ${pick(industries).toLowerCase()} solutions for enterprise customers`,
        fundingStage: pick(fundingStages),
        totalFunding: `$${intFrom(1, 500)}M`,
        lastFundingDate: new Date(Date.now() - random() * 1095 * 24 * 60 * 60 * 1000).toISOString().split('T')[0],
        investors: listOf(intFrom(1, 5), () => `${surname()} ${pick(['Ventures', 'Capital', 'Partners'])}`)
      },
      contact: {
        firstName,
        lastName,
        fullName: `${firstName} ${lastName}`,
        email: `${firstName.toLowerCase()}.${lastName.toLowerCase()}@${domain}`,
        directPhone: phoneNumber(),
        mobilePhone: random() > 0.5 ? phoneNumber() : null,
        title: jobTitleFor(level, dept),
        department: dept,
        seniority: level,
        linkedIn: `https://linkedin.com/in/${firstName.toLowerCase()}-${lastName.toLowerCase()}-${below(99999)}`,
        twitter: random() > 0.6 ? `@${firstName.toLowerCase()}${lastName.toLowerCase()}` : null,
        yearsInRole: below(8),
        yearsAtCompany: below(12),
        previousCompanies: listOf(intFrom(1, 3), () => `${surname()} ${pick(['Corp', 'Inc', 'Technologies'])}`),
        education: {
          degree: pick(['Bachelor\'s', 'Master\'s', 'MBA', 'PhD']),
          field: pick(['Computer Science', 'Business', 'Engineering', 'Marketing', 'Finance']),
          school: pick(['Stanford', 'MIT', 'Harvard', 'Berkeley', 'Carnegie Mellon', 'Northwestern'])
        }
      },
      technographics: {
        // Draw 3-10 technologies, then drop repeats (first occurrence wins).
        installedTechnologies: [...new Set(listOf(intFrom(3, 8), () => pick(technologies)))],
        technologySpend: `$${intFrom(100, 10000)}K`,
        cloudProvider: pick(['AWS', 'Azure', 'Google Cloud', 'Multi-Cloud']),
        crmSystem: pick(['Salesforce', 'HubSpot', 'Microsoft Dynamics', 'Zoho']),
        marketingAutomation: pick(['HubSpot', 'Marketo', 'Pardot', 'Eloqua']),
        analyticsTools: pick(['Google Analytics', 'Adobe Analytics', 'Mixpanel', 'Amplitude'])
      },
      intentSignals: {
        recentActivity: listOf(intFrom(1, 5), () => ({
          signal: pick(intentSignals),
          timestamp: new Date(Date.now() - random() * 30 * 24 * 60 * 60 * 1000).toISOString(),
          score: intFrom(1, 100),
          source: pick(['website', 'content', 'events', 'social', 'search'])
        })),
        buyingStage: pick(['awareness', 'consideration', 'decision', 'purchase']),
        engagementScore: intFrom(1, 100),
        lastEngagement: new Date(Date.now() - random() * 60 * 24 * 60 * 60 * 1000).toISOString()
      },
      organizationChart: {
        reportsTo: random() > 0.3 ? `${generateName(random)}` : null,
        directReports: below(15),
        totalTeamSize: below(50),
        peers: listOf(intFrom(2, 5), () => generateName(random))
      },
      dataQuality: {
        emailVerified: random() > 0.2,
        phoneVerified: random() > 0.3,
        lastVerified: new Date(Date.now() - random() * 90 * 24 * 60 * 60 * 1000).toISOString(),
        confidenceScore: intFrom(70, 30),
        dataFreshness: `${below(60)} days`
      },
      scrapedAt: new Date().toISOString()
    });
  }
  return results;
}
/**
 * Produce synthetic FactSet-style financial analytics records: company
 * profile, fundamentals, estimates, ownership, supply chain and analyst
 * coverage.
 *
 * Every random field is drawn from the stream returned by
 * createSeededRandom(seed); the draws happen in the order the fields appear
 * below. `entityId`, `scrapedAt` and all dates depend on the current time.
 *
 * @param {number} count - Number of records to create.
 * @param {*} seed - Seed forwarded to createSeededRandom.
 * @returns {Promise<object[]>} Generated records.
 */
async function generateFactSetData(count, seed) {
  log.info('Generating FactSet-style financial analytics data...');
  const random = createSeededRandom(seed);

  // Helpers over the seeded stream. Unless noted, each consumes one draw.
  const pick = (options) => options[Math.floor(random() * options.length)];
  const intFrom = (base, span) => Math.floor(base + random() * span);
  const letter = () => String.fromCharCode(65 + Math.floor(random() * 26));
  const round1 = (value) => Math.round(value * 10) / 10;
  const round2 = (value) => Math.round(value * 100) / 100;
  // One-decimal value in [low, low + spread).
  const pct = (low, spread) => round1(low + random() * spread);
  // One-decimal value centred away from zero by `bias`, scaled by `scale`.
  const skewed = (bias, scale) => round1((random() - bias) * scale);
  // Second space-separated token of generateName's result (presumably a surname).
  const surname = () => generateName(random).split(' ')[1];
  const listOf = (length, build) => Array.from({ length }, build);
  // Calendar day (YYYY-MM-DD) up to `days` days in the past.
  const pastDay = (days) => new Date(Date.now() - random() * days * 24 * 60 * 60 * 1000).toISOString().split('T')[0];

  const sectors = ['Technology', 'Healthcare', 'Financials', 'Consumer Discretionary', 'Consumer Staples', 'Energy', 'Industrials', 'Materials', 'Real Estate', 'Utilities', 'Communication Services'];
  const exchanges = ['NYSE', 'NASDAQ', 'LSE', 'TSE', 'HKEX', 'Euronext', 'SSE'];
  const analystFirms = ['Goldman Sachs', 'Morgan Stanley', 'JP Morgan', 'Bank of America', 'Citi', 'Deutsche Bank', 'Barclays', 'UBS', 'Credit Suisse', 'Wells Fargo'];
  const institutionalTypes = ['Mutual Fund', 'Hedge Fund', 'Pension Fund', 'Sovereign Wealth', 'ETF', 'Private Equity', 'Insurance', 'Endowment'];
  // Fixed split of annual revenue across the four quarters.
  const quarterShares = [0.24, 0.25, 0.26, 0.25];

  const records = [];
  for (let index = 0; index < count; index += 1) {
    const companyName = `${surname()} ${pick(['Corporation', 'Inc', 'Holdings', 'Group', 'International'])}`;
    const ticker = `${letter()}${letter()}${letter()}`;
    const basePrice = 10 + random() * 500;
    const revenue = intFrom(100, 50000);
    const employees = intFrom(100, 200000);
    // Whole-number amount equal to revenue times a factor in [low, low + spread).
    const ofRevenue = (low, spread) => Math.round(revenue * (low + random() * spread));

    records.push({
      entityId: `FS${Date.now()}${index}`,
      company: {
        name: companyName,
        ticker,
        exchange: pick(exchanges),
        sector: pick(sectors),
        // Drawn separately from `sector`, so the two may differ.
        industry: `${pick(sectors)} - Specialized`,
        country: pick(['USA', 'UK', 'Japan', 'Germany', 'China', 'France', 'Canada']),
        employees,
        fiscalYearEnd: pick(['December', 'March', 'June', 'September']),
        ipoDate: pastDay(7300),
        description: `Global leader in ${pick(sectors).toLowerCase()} with operations across multiple continents`
      },
      fundamentals: {
        revenue: {
          current: revenue,
          yoy_growth: skewed(0.3, 30),
          trailing_12m: revenue,
          quarterly: quarterShares.map((share) => round2(revenue * share))
        },
        profitability: {
          ebitda: ofRevenue(0.1, 0.3),
          ebitda_margin: pct(10, 30),
          operating_income: ofRevenue(0.08, 0.25),
          operating_margin: pct(8, 25),
          net_income: ofRevenue(0.05, 0.20),
          net_margin: pct(5, 20),
          roe: pct(5, 30),
          roa: pct(3, 15),
          roic: pct(5, 25)
        },
        growth_rates: {
          revenue_1yr: skewed(0.2, 30),
          revenue_3yr_cagr: skewed(0.1, 25),
          revenue_5yr_cagr: skewed(0.1, 20),
          earnings_1yr: skewed(0.3, 40),
          earnings_3yr_cagr: skewed(0.2, 30),
          earnings_5yr_cagr: skewed(0.1, 25)
        },
        balance_sheet: {
          total_assets: ofRevenue(1.5, 3),
          total_liabilities: ofRevenue(0.8, 2),
          stockholders_equity: ofRevenue(0.5, 1.5),
          cash: ofRevenue(0.1, 0.5),
          debt: ofRevenue(0.2, 1.2),
          working_capital: ofRevenue(0.1, 0.4)
        },
        cash_flow: {
          operating_cf: ofRevenue(0.1, 0.25),
          // Always negative: between -15% and -30% of revenue.
          investing_cf: Math.round(revenue * (-0.15 - random() * 0.15)),
          financing_cf: ofRevenue(-0.05, 0.15),
          free_cash_flow: ofRevenue(0.05, 0.20),
          fcf_yield: pct(3, 8)
        }
      },
      estimates: {
        eps: {
          current_quarter: round2(basePrice * 0.01 + random() * basePrice * 0.02),
          next_quarter: round2(basePrice * 0.01 + random() * basePrice * 0.025),
          current_year: round2(basePrice * 0.04 + random() * basePrice * 0.06),
          next_year: round2(basePrice * 0.05 + random() * basePrice * 0.08),
          consensus_growth: pct(5, 20),
          surprise_history: listOf(4, () => skewed(0.5, 20))
        },
        revenue: {
          current_quarter: Math.round(revenue * 0.25 * (1 + (random() - 0.3) * 0.1)),
          next_quarter: Math.round(revenue * 0.26 * (1 + (random() - 0.2) * 0.1)),
          current_year: Math.round(revenue * (1 + (random() - 0.2) * 0.15)),
          next_year: ofRevenue(1.05, 0.15),
          consensus_growth: pct(3, 15)
        },
        price_targets: {
          high: round2(basePrice * (1.3 + random() * 0.5)),
          low: round2(basePrice * (0.7 - random() * 0.2)),
          mean: round2(basePrice * (1 + (random() - 0.5) * 0.3)),
          median: round2(basePrice * (1 + (random() - 0.5) * 0.25)),
          num_analysts: intFrom(8, 35)
        }
      },
      ownership: {
        institutional: {
          percentage: pct(40, 50),
          holders: intFrom(100, 900),
          // Ten holders; `rank` is simply the generation order, not a sort by size.
          topHolders: listOf(10, (_unused, position) => ({
            name: `${surname()} ${pick(institutionalTypes)}`,
            shares: intFrom(1000000, 50000000),
            percentage: round2(1 + random() * 8),
            value: Math.round(basePrice * (1000000 + random() * 50000000) / 1000000),
            changeQoQ: round2((random() - 0.5) * 20),
            rank: position + 1
          }))
        },
        insider: {
          percentage: pct(1, 15),
          recentTransactions: listOf(intFrom(5, 10), () => ({
            date: pastDay(180),
            insider: generateName(random),
            title: pick(['CEO', 'CFO', 'COO', 'Director', 'EVP', 'SVP']),
            transaction: pick(['Buy', 'Sell']),
            shares: intFrom(1000, 100000),
            price: round2(basePrice * (1 + (random() - 0.5) * 0.1)),
            value: Math.round(basePrice * (1000 + random() * 100000) / 1000)
          }))
        },
        buybacks: {
          active_program: random() > 0.3,
          authorization: ofRevenue(0.05, 0.15),
          remaining: ofRevenue(0.02, 0.10),
          shares_repurchased_ltm: Math.floor(random() * 10000000)
        }
      },
      supplyChain: {
        majorCustomers: listOf(intFrom(3, 7), () => ({
          name: `${surname()} ${pick(['Corp', 'Inc', 'Group'])}`,
          revenueContribution: pct(2, 15),
          relationship: pick(['Strategic Partner', 'Key Customer', 'Major Account']),
          yearsOfBusiness: intFrom(1, 15)
        })),
        majorSuppliers: listOf(intFrom(3, 7), () => ({
          name: `${surname()} ${pick(['Corp', 'Systems', 'Technologies'])}`,
          category: pick(['Components', 'Raw Materials', 'Services', 'Software']),
          dependencyLevel: pick(['Critical', 'High', 'Medium', 'Low']),
          geographicRisk: pick(['Low', 'Medium', 'High'])
        })),
        // Four independent draws; they are not normalised to sum to 100.
        geographicExposure: {
          north_america: pct(20, 60),
          europe: pct(10, 40),
          asia_pacific: pct(10, 50),
          rest_of_world: pct(5, 20)
        }
      },
      analystCoverage: listOf(intFrom(5, 20), () => ({
        firm: pick(analystFirms),
        analyst: generateName(random),
        rating: pick(['Strong Buy', 'Buy', 'Hold', 'Sell', 'Strong Sell']),
        priceTarget: round2(basePrice * (0.8 + random() * 0.6)),
        lastUpdate: pastDay(90),
        confidence: pick(['High', 'Medium', 'Low'])
      })),
      scrapedAt: new Date().toISOString()
    });
  }
  return records;
}
/**
 * Generate LSEG/Refinitiv-style workspace records.
 *
 * Each record bundles a synthetic company (identifiers, sector, region) with
 * news stories and alerts, deals and issuances, ESG data, research and market
 * data. Random values come from the generator returned by
 * createSeededRandom(seed); workspaceId, the date fields and scrapedAt also
 * depend on the wall clock.
 *
 * Fields that describe the same fact are derived from a single draw so that a
 * record never contradicts itself:
 * - a story's headline names the story's own category;
 * - the sentiment label is bucketed from the sentiment score;
 * - a controversy's description names the controversy's own type;
 * - the LEI is 20 characters long (the length of a Legal Entity Identifier).
 *
 * @param {number} count - Number of workspace records to generate.
 * @param {*} seed - Seed forwarded to createSeededRandom.
 * @returns {Promise<Object[]>} The generated records.
 */
async function generateLSEGData(count, seed) {
  log.info('Generating LSEG/Refinitiv-style workspace data...');
  const random = createSeededRandom(seed);
  const results = [];

  const newsSources = ['Reuters', 'Dow Jones', 'PR Newswire', 'Business Wire', 'Bloomberg', 'Financial Times', 'WSJ'];
  const newsCategories = ['Earnings', 'M&A', 'Regulatory', 'Corporate', 'Market', 'Economic', 'Political', 'ESG'];
  const dealTypes = ['M&A', 'IPO', 'Secondary Offering', 'Bond Issuance', 'Loan', 'Private Placement', 'Buyout', 'Joint Venture'];
  const esgCategories = ['Environmental', 'Social', 'Governance'];
  const controversyTypes = ['Legal', 'Environmental', 'Labor', 'Ethical', 'Regulatory', 'Product'];
  const regions = ['North America', 'Europe', 'Asia Pacific', 'Latin America', 'Middle East', 'Africa'];

  const HOUR_MS = 60 * 60 * 1000;
  const DAY_MS = 24 * HOUR_MS;

  // Small local helpers; all of them draw from the seeded generator only.
  const pick = (items) => items[Math.floor(random() * items.length)];
  const intFrom = (min, span) => Math.floor(min + random() * span);
  const roundTo = (value, decimals) => {
    const factor = 10 ** decimals;
    return Math.round(value * factor) / factor;
  };
  const listOf = (length, make) => Array.from({ length }, () => make());
  const isoDay = (timestampMs) => new Date(timestampMs).toISOString().split('T')[0];
  const pastDay = (maxDays) => isoDay(Date.now() - random() * maxDays * DAY_MS);
  const futureDay = (maxDays) => isoDay(Date.now() + random() * maxDays * DAY_MS);
  const letter = () => String.fromCharCode(65 + Math.floor(random() * 26));
  const counterparty = () => `${generateName(random).split(' ')[1]} ${pick(['Corp', 'Inc', 'Group'])}`;
  const esgScore = () => roundTo(30 + random() * 70, 1);

  // Bucket a score in [-1, 1] so the label can never disagree with its sign.
  const sentimentLabel = (score) => {
    if (score >= 0.6) return 'Very Positive';
    if (score >= 0.2) return 'Positive';
    if (score > -0.2) return 'Neutral';
    if (score > -0.6) return 'Negative';
    return 'Very Negative';
  };

  // One news story; the category is drawn once and reused in the headline.
  const buildStory = (companyName) => {
    const category = pick(newsCategories);
    const verb = pick(['announces', 'reports', 'unveils', 'confirms', 'explores']);
    const subject = pick(['update', 'initiative', 'strategy', 'partnership', 'results']);
    const score = roundTo((random() - 0.5) * 2, 2);
    return {
      headline: `${companyName} ${verb} ${category.toLowerCase()} ${subject}`,
      source: pick(newsSources),
      timestamp: new Date(Date.now() - random() * 168 * HOUR_MS).toISOString(),
      category,
      sentiment: {
        score,
        label: sentimentLabel(score),
        confidence: roundTo(70 + random() * 30, 1)
      },
      topics: listOf(intFrom(2, 5), () =>
        pick(['Revenue', 'Expansion', 'Innovation', 'Partnership', 'Regulation', 'Sustainability'])),
      entities: {
        people: listOf(intFrom(1, 3), () => generateName(random)),
        organizations: listOf(intFrom(1, 4), counterparty),
        locations: listOf(intFrom(1, 3), () =>
          pick(['New York', 'London', 'Tokyo', 'Singapore', 'Hong Kong', 'Dubai']))
      },
      relevance: roundTo(60 + random() * 40, 1),
      language: pick(['en', 'en-US', 'en-GB']),
      wordCount: intFrom(200, 1500)
    };
  };

  // One real-time alert raised within the last 24 hours.
  const buildAlert = (companyName) => ({
    type: pick(['Price', 'Volume', 'News', 'Rating', 'Insider']),
    severity: pick(['Critical', 'High', 'Medium', 'Low']),
    message: `Alert triggered for ${companyName}`,
    timestamp: new Date(Date.now() - random() * DAY_MS).toISOString()
  });

  // One announced deal with the generated company as acquirer.
  const buildDeal = (companyName) => ({
    dealId: `D${intFrom(100000000, 900000000)}`,
    type: pick(dealTypes),
    status: pick(['Announced', 'Pending', 'Completed', 'Withdrawn']),
    value: roundTo(50 + random() * 10000, 1),
    currency: 'USD',
    announceDate: pastDay(730),
    expectedClose: futureDay(365),
    parties: {
      acquirer: companyName,
      target: counterparty(),
      advisors: {
        financial: listOf(intFrom(1, 3), () =>
          pick(['Goldman Sachs', 'Morgan Stanley', 'JP Morgan', 'Bank of America'])),
        legal: listOf(intFrom(1, 2), () =>
          pick(['Wachtell', 'Skadden', 'Sullivan & Cromwell', 'Cleary Gottlieb']))
      }
    },
    rationale: pick(['Strategic Expansion', 'Market Entry', 'Technology Acquisition', 'Vertical Integration']),
    synergies: roundTo(10 + random() * 500, 1),
    premium: roundTo(10 + random() * 50, 1)
  });

  // One debt issuance.
  const buildIssuance = () => ({
    type: pick(['Investment Grade Bond', 'High Yield Bond', 'Convertible', 'Green Bond']),
    amount: roundTo(100 + random() * 5000, 1),
    maturity: `${intFrom(3, 27)} years`,
    coupon: roundTo(1 + random() * 8, 2),
    rating: pick(['AAA', 'AA', 'A', 'BBB', 'BB', 'B']),
    issueDate: pastDay(365),
    underwriters: listOf(intFrom(2, 5), () =>
      pick(['JP Morgan', 'Bank of America', 'Citi', 'Goldman Sachs', 'Morgan Stanley']))
  });

  // One ESG controversy; the description reuses the controversy's own type.
  const buildControversy = (companyName) => {
    const type = pick(controversyTypes);
    return {
      type,
      severity: pick(['Critical', 'High', 'Medium', 'Low']),
      date: pastDay(1825),
      description: `${type} controversy involving ${companyName}`,
      status: pick(['Ongoing', 'Resolved', 'Under Investigation']),
      impact: roundTo(1 + random() * 10, 1)
    };
  };

  // One analyst report; the price target is spread around the company's base price.
  const buildReport = (companyName, basePrice) => ({
    firm: pick(['Goldman Sachs Research', 'Morgan Stanley Research', 'JP Morgan Research']),
    analyst: generateName(random),
    title: `${companyName} - ${pick(['Initiating Coverage', 'Q4 Update', 'Sector Outlook', 'Deep Dive'])}`,
    date: pastDay(180),
    rating: pick(['Overweight', 'Equal-weight', 'Underweight', 'Buy', 'Hold', 'Sell']),
    priceTarget: roundTo(basePrice * (0.8 + random() * 0.6), 2),
    pages: intFrom(15, 100),
    keyTakeaways: listOf(3, () =>
      pick(['Strong fundamentals', 'Market expansion opportunity', 'Valuation attractive', 'Execution risk']))
  });

  for (let i = 0; i < count; i++) {
    const companyName = `${generateName(random).split(' ')[1]} ${pick(['Corporation', 'Group', 'Holdings', 'International', 'Industries'])}`;
    const ticker = `${letter()}${letter()}${letter()}`;
    const basePrice = 10 + random() * 500;

    results.push({
      workspaceId: `LSEG${Date.now()}${i}`,
      company: {
        name: companyName,
        ticker,
        ric: `${ticker}.${pick(['N', 'O', 'L', 'T', 'HK'])}`,
        permId: `${intFrom(1000000000, 9000000000)}`,
        // A value in [1e19, 1e20) always prints as exactly 20 decimal digits.
        lei: String(Math.floor(1e19 + random() * 9e19)),
        sector: pick(['Technology', 'Healthcare', 'Financials', 'Energy', 'Industrials']),
        region: pick(regions)
      },
      news: {
        stories: listOf(intFrom(3, 12), () => buildStory(companyName)),
        realTimeAlerts: listOf(intFrom(1, 5), () => buildAlert(companyName))
      },
      deals: {
        announced: listOf(intFrom(1, 8), () => buildDeal(companyName)),
        issuances: listOf(intFrom(1, 5), buildIssuance)
      },
      esg: {
        scores: {
          overall: esgScore(),
          environmental: esgScore(),
          social: esgScore(),
          governance: esgScore(),
          controversy: roundTo(random() * 100, 1)
        },
        percentileRank: {
          industry: intFrom(1, 100),
          global: intFrom(1, 100)
        },
        grade: pick(['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'D']),
        categories: esgCategories.map((category) => ({
          category,
          score: esgScore(),
          trend: pick(['Improving', 'Stable', 'Declining']),
          keyIssues: listOf(intFrom(2, 4), () =>
            pick(['Carbon Emissions', 'Water Usage', 'Diversity', 'Labor Practices', 'Board Independence', 'Executive Pay']))
        })),
        // May be empty: between 0 and 3 controversies per company.
        controversies: listOf(intFrom(0, 4), () => buildControversy(companyName)),
        sdgAlignment: listOf(intFrom(3, 8), () => ({
          goal: intFrom(1, 17),
          score: esgScore()
        }))
      },
      research: {
        analystReports: listOf(intFrom(5, 15), () => buildReport(companyName, basePrice)),
        earnings: {
          nextDate: futureDay(90),
          consensus: {
            eps: roundTo(basePrice * 0.02, 2),
            revenue: roundTo(1000 + random() * 50000, 1),
            numEstimates: intFrom(8, 30)
          },
          whisperNumber: roundTo(basePrice * 0.021, 2)
        }
      },
      marketData: {
        price: roundTo(basePrice, 2),
        change: roundTo((random() - 0.5) * 10, 2),
        changePercent: roundTo((random() - 0.5) * 5, 2),
        volume: intFrom(0, 20000000),
        marketCap: `${roundTo(basePrice * (10 + random() * 990), 2)}B`,
        beta: roundTo(0.5 + random() * 1.5, 2),
        shortInterest: roundTo(1 + random() * 15, 1)
      },
      scrapedAt: new Date().toISOString()
    });
  }
  return results;
}
/**
 * Generate fMRI (Functional Magnetic Resonance Imaging) brain activity data.
 *
 * Each record describes one voxel in a randomly chosen brain region: subject
 * and acquisition metadata, voxel coordinates, a simulated BOLD time series
 * (baseline + sinusoidal activation + noise + slow drift), activation
 * statistics, an 8x8 matrix of random correlations and quality metrics.
 *
 * Only the first 50 samples of the series are stored; fullSeriesStats
 * summarises the complete series. stdDev is the population standard deviation
 * around the series mean.
 *
 * @param {number} count - Number of scan records to generate.
 * @param {*} seed - Seed forwarded to createSeededRandom.
 * @returns {Promise<Object[]>} The generated records.
 */
async function generateFMRIData(count, seed) {
  log.info('Generating fMRI brain activity data...');
  const random = createSeededRandom(seed);
  const results = [];

  const brainRegions = [
    { name: 'Dorsolateral Prefrontal Cortex', abbr: 'DLPFC', type: 'cortical', x: [30, 50], y: [20, 40], z: [20, 35] },
    { name: 'Anterior Cingulate Cortex', abbr: 'ACC', type: 'cortical', x: [0, 10], y: [30, 45], z: [15, 30] },
    { name: 'Amygdala', abbr: 'AMY', type: 'subcortical', x: [20, 30], y: [-10, 5], z: [-15, -5] },
    { name: 'Hippocampus', abbr: 'HIP', type: 'subcortical', x: [25, 35], y: [-20, -10], z: [-10, 0] },
    { name: 'Primary Motor Cortex', abbr: 'M1', type: 'cortical', x: [35, 45], y: [-15, 0], z: [45, 60] },
    { name: 'Primary Visual Cortex', abbr: 'V1', type: 'cortical', x: [10, 25], y: [-90, -75], z: [0, 15] },
    { name: 'Thalamus', abbr: 'THA', type: 'subcortical', x: [10, 15], y: [-15, -5], z: [5, 15] },
    { name: 'Caudate Nucleus', abbr: 'CAU', type: 'subcortical', x: [12, 18], y: [10, 20], z: [10, 20] }
  ];
  const conditions = ['rest', 'task', 'visual_stim', 'motor_task', 'cognitive_load', 'emotional_stim'];
  const TR = 2.0; // Repetition time in seconds
  const STORED_SAMPLES = 50; // Samples of the BOLD series kept in each record
  const MATRIX_SIZE = 8;

  const pick = (items) => items[Math.floor(random() * items.length)];
  const roundTo = (value, decimals) => {
    const factor = 10 ** decimals;
    return Math.round(value * factor) / factor;
  };
  // Integer coordinate inside a [low, high) range of the region's bounding box.
  const coordinateWithin = ([low, high]) => Math.floor(low + random() * (high - low));

  for (let i = 0; i < count; i++) {
    const region = pick(brainRegions);
    const condition = pick(conditions);
    const numTimePoints = 100 + Math.floor(random() * 200); // 100-299 time points

    const voxelX = coordinateWithin(region.x);
    const voxelY = coordinateWithin(region.y);
    const voxelZ = coordinateWithin(region.z);

    // BOLD series: resting scans carry noise and drift only.
    const baseline = 100 + random() * 20;
    const isResting = condition === 'rest';
    const activationMagnitude = isResting ? 0 : 2 + random() * 4;
    const boldSignal = Array.from({ length: numTimePoints }, (_, t) => {
      const noise = (random() - 0.5) * 1.5;
      const drift = Math.sin(t / numTimePoints * Math.PI) * 0.5;
      const activation = isResting ? 0 : Math.sin(t / 20) * activationMagnitude;
      return roundTo(baseline + activation + noise + drift, 2);
    });

    // Spread is measured around the actual series mean: activation and drift
    // move the mean away from `baseline`, so using `baseline` would inflate it.
    const signalMean = boldSignal.reduce((sum, value) => sum + value, 0) / boldSignal.length;
    const signalVariance =
      boldSignal.reduce((sum, value) => sum + (value - signalMean) ** 2, 0) / boldSignal.length;

    const connectivityMatrix = Array.from({ length: MATRIX_SIZE }, () =>
      Array.from({ length: MATRIX_SIZE }, () => roundTo(random() * 2 - 1, 2))
    );

    const isActive = activationMagnitude > 0;

    results.push({
      scanId: `fMRI_${Date.now()}_${i}`,
      subject: {
        id: `SUB${String(Math.floor(1 + random() * 999)).padStart(3, '0')}`,
        age: Math.floor(18 + random() * 50),
        gender: random() > 0.5 ? 'M' : 'F',
        handedness: random() > 0.1 ? 'right' : 'left'
      },
      acquisition: {
        scanner: pick(['Siemens Prisma 3T', 'GE Discovery MR750 3T', 'Philips Ingenia 3T']),
        fieldStrength: '3T',
        TR,
        TE: roundTo(25 + random() * 10, 1), // Echo time (ms)
        flipAngle: 75 + Math.floor(random() * 15), // degrees
        voxelSize: [3, 3, 3], // mm
        slices: 32 + Math.floor(random() * 16)
      },
      voxel: {
        coordinates: { x: voxelX, y: voxelY, z: voxelZ },
        // Fixed offsets applied to the voxel indices.
        mniCoordinates: { x: voxelX - 45, y: voxelY - 60, z: voxelZ - 35 },
        region: region.name,
        regionAbbr: region.abbr,
        regionType: region.type,
        hemisphere: voxelX > 45 ? 'right' : 'left'
      },
      timeSeries: {
        condition,
        numTimePoints,
        TR,
        duration: numTimePoints * TR,
        boldSignal: boldSignal.slice(0, STORED_SAMPLES),
        fullSeriesStats: {
          mean: roundTo(signalMean, 2),
          stdDev: roundTo(Math.sqrt(signalVariance), 2),
          min: Math.min(...boldSignal),
          max: Math.max(...boldSignal)
        }
      },
      activation: {
        isActive,
        percentSignalChange: roundTo(activationMagnitude, 2),
        tStatistic: isActive ? roundTo(2 + random() * 4, 2) : 0,
        pValue: isActive ? roundTo(random() * 0.05, 4) : 1,
        clusterSize: isActive ? Math.floor(10 + random() * 200) : 0
      },
      connectivity: {
        matrix: connectivityMatrix,
        // Mean of the first matrix row only.
        meanCorrelation: roundTo(connectivityMatrix[0].reduce((sum, value) => sum + value, 0) / MATRIX_SIZE, 2),
        strongestConnection: {
          region: pick(brainRegions).abbr,
          correlation: roundTo(0.5 + random() * 0.5, 2)
        }
      },
      quality: {
        snr: roundTo(20 + random() * 30, 1), // Signal-to-noise ratio
        motion: roundTo(random() * 2, 2), // mm displacement
        artifacts: random() > 0.8 ? pick(['susceptibility', 'motion']) : null,
        qualityRating: pick(['excellent', 'good', 'fair', 'poor'])
      },
      scrapedAt: new Date().toISOString()
    });
  }
  return results;
}
/**
 * Generate Protein Data Bank (PDB) style molecular structure data.
 *
 * Each entry carries header metadata, structure statistics, per-chain sequence
 * samples, a secondary-structure summary, a sample of at most 50 atom records,
 * an optional ligand and quality metrics.
 *
 * Internal consistency guarantees:
 * - the resolution quoted in header.title is structure.resolution;
 * - atoms are grouped 8 per residue, and every atom of a residue reports the
 *   same residueName and chainId.
 *
 * @param {number} count - Number of entries to generate.
 * @param {*} seed - Seed forwarded to createSeededRandom.
 * @returns {Promise<Object[]>} The generated entries.
 */
async function generateProteinPDBData(count, seed) {
  log.info('Generating Protein PDB molecular structure data...');
  const random = createSeededRandom(seed);
  const results = [];

  const aminoAcids = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS', 'ILE',
    'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL'];
  const secondaryStructures = ['helix', 'sheet', 'coil', 'turn'];
  const chains = ['A', 'B', 'C', 'D', 'E', 'F'];
  const atomTypes = ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'CE', 'NZ', 'OG'];
  const ATOMS_PER_RESIDUE = 8; // Average used for both numAtoms and residue numbering
  const ATOM_SAMPLE_LIMIT = 50;
  const SAMPLE_LENGTH = 30; // Residues shown in sequence / assignment samples

  const pick = (items) => items[Math.floor(random() * items.length)];
  const roundTo = (value, decimals) => {
    const factor = 10 ** decimals;
    return Math.round(value * factor) / factor;
  };
  const coordinate = () => roundTo(random() * 100 - 50, 3);
  const cellEdge = () => roundTo(40 + random() * 60, 2);
  const countOf = (assignments, kind) => assignments.filter((entry) => entry === kind).length;

  for (let i = 0; i < count; i++) {
    const pdbId = `${Math.floor(1000 + random() * 8999)}`;
    const numResidues = 50 + Math.floor(random() * 450); // 50-499 residues
    const numChains = 1 + Math.floor(random() * 3);
    const numAtoms = numResidues * ATOMS_PER_RESIDUE;
    const pickChain = () => chains[Math.floor(random() * numChains)];

    // Identity of each sampled residue is drawn once so that all of its atoms agree.
    const sampleSize = Math.min(ATOM_SAMPLE_LIMIT, numAtoms);
    const sampledResidues = Array.from({ length: Math.ceil(sampleSize / ATOMS_PER_RESIDUE) }, () => ({
      residueName: pick(aminoAcids),
      chainId: pickChain()
    }));

    const atoms = Array.from({ length: sampleSize }, (_, atomIdx) => {
      const residueIdx = Math.floor(atomIdx / ATOMS_PER_RESIDUE);
      const { residueName, chainId } = sampledResidues[residueIdx];
      const atomName = atomTypes[atomIdx % atomTypes.length];
      return {
        serial: atomIdx + 1,
        atomName,
        altLoc: '',
        residueName,
        chainId,
        residueSeq: residueIdx + 1,
        iCode: '',
        coordinates: {
          x: coordinate(),
          y: coordinate(),
          z: coordinate()
        },
        occupancy: roundTo(0.8 + random() * 0.2, 2),
        tempFactor: roundTo(10 + random() * 40, 2), // B-factor
        element: atomName[0], // First letter of the atom name
        charge: ''
      };
    });

    const secondaryStructureMap = Array.from({ length: numResidues }, () => pick(secondaryStructures));
    const percentOf = (kind) => Math.round((countOf(secondaryStructureMap, kind) / numResidues) * 100);

    // Drawn once and shared by the title and the structure block.
    const resolution = roundTo(1.5 + random() * 1.5, 2); // Angstroms

    results.push({
      pdbId,
      header: {
        classification: pick(['HYDROLASE', 'TRANSFERASE', 'OXIDOREDUCTASE', 'LYASE', 'ISOMERASE', 'LIGASE', 'MEMBRANE PROTEIN', 'SIGNALING PROTEIN']),
        depositionDate: new Date(Date.now() - random() * 365 * 10 * 24 * 60 * 60 * 1000).toISOString().split('T')[0],
        title: `Crystal structure of ${pick(aminoAcids)} rich domain at ${resolution}A resolution`,
        organism: pick(['Homo sapiens', 'Escherichia coli', 'Saccharomyces cerevisiae', 'Mus musculus']),
        expression: pick(['Escherichia coli', 'Insect cells', 'Mammalian cells', 'Yeast'])
      },
      structure: {
        numChains,
        numResidues,
        numAtoms,
        resolution,
        rValue: roundTo(0.15 + random() * 0.15, 3),
        rFree: roundTo(0.18 + random() * 0.15, 3),
        spaceGroup: pick(['P 21 21 21', 'P 1 21 1', 'C 2 2 21', 'P 43 21 2']),
        unitCell: {
          a: cellEdge(),
          b: cellEdge(),
          c: cellEdge(),
          alpha: 90,
          beta: 90 + Math.round(random() * 20),
          gamma: 90
        }
      },
      sequence: {
        chains: Array.from({ length: numChains }, (_, chainIdx) => ({
          chainId: chains[chainIdx],
          length: Math.floor(numResidues / numChains),
          // A 30-residue sample, not the full chain.
          sequence: Array.from({ length: SAMPLE_LENGTH }, () => pick(aminoAcids)).join('-')
        }))
      },
      secondaryStructure: {
        // 'turn' residues are counted in none of the three percentages.
        helixPercent: percentOf('helix'),
        sheetPercent: percentOf('sheet'),
        coilPercent: percentOf('coil'),
        assignments: secondaryStructureMap.slice(0, SAMPLE_LENGTH) // Sample
      },
      atoms,
      ligands: random() > 0.3 ? [{
        hetId: pick(['ATP', 'NAD', 'FAD', 'HEM', 'MG', 'ZN', 'CA']),
        chainId: pickChain(),
        residueSeq: numResidues + 1,
        numAtoms: Math.floor(10 + random() * 40),
        bindingSite: {
          residues: Array.from({ length: 5 }, () => Math.floor(1 + random() * numResidues)),
          bindingEnergy: roundTo(-5 - random() * 10, 2) // kcal/mol
        }
      }] : [],
      quality: {
        clashScore: roundTo(random() * 20, 1),
        ramachandranFavored: roundTo(85 + random() * 12, 1),
        ramachandranOutliers: roundTo(random() * 3, 1),
        rotamerOutliers: roundTo(random() * 5, 1),
        cbetaDeviations: Math.floor(random() * 5)
      },
      scrapedAt: new Date().toISOString()
    });
  }
  return results;
}
/**
 * Generate Power Grid electrical telemetry data.
 *
 * Each record simulates one measurement: per-phase voltages and currents,
 * active/reactive/apparent power, frequency, harmonics, phasors, an optional
 * grid event and measurement-quality metadata.
 *
 * The power block is self-consistent: active = apparent * powerFactor,
 * reactive = sqrt(apparent^2 - active^2), and phaseAngle is the angle whose
 * cosine is powerFactor (reported as a positive angle, matching the
 * non-negative reactive power).
 *
 * @param {number} count - Number of records to generate.
 * @param {*} seed - Seed forwarded to createSeededRandom.
 * @returns {Promise<Object[]>} The generated records.
 */
async function generatePowerGridData(count, seed) {
  log.info('Generating Power Grid telemetry data...');
  const random = createSeededRandom(seed);
  const results = [];

  const substations = ['North', 'South', 'East', 'West', 'Central', 'Industrial', 'Residential', 'Commercial'];
  const voltageClasses = [
    { nominal: 765000, tolerance: 0.05, name: 'Extra High Voltage' },
    { nominal: 345000, tolerance: 0.05, name: 'Extra High Voltage' },
    { nominal: 138000, tolerance: 0.06, name: 'High Voltage' },
    { nominal: 69000, tolerance: 0.06, name: 'High Voltage' },
    { nominal: 13800, tolerance: 0.08, name: 'Medium Voltage' },
    { nominal: 480, tolerance: 0.1, name: 'Low Voltage' }
  ];
  // Index 0 is the non-event; abnormal events are drawn from the rest.
  const gridEventTypes = ['normal', 'fault', 'switching', 'load_change', 'voltage_sag', 'voltage_swell', 'harmonic_distortion'];

  const pick = (items) => items[Math.floor(random() * items.length)];
  const roundTo = (value, decimals) => {
    const factor = 10 ** decimals;
    return Math.round(value * factor) / factor;
  };
  const toDegrees = (radians) => radians * 180 / Math.PI;
  const lineToLine = (first, second) => roundTo(Math.sqrt(3) * ((first + second) / 2), 2);

  for (let i = 0; i < count; i++) {
    const voltageClass = pick(voltageClasses);
    // Roughly one record in five carries an abnormal event.
    const eventType = random() > 0.8
      ? gridEventTypes[1 + Math.floor(random() * (gridEventTypes.length - 1))]
      : 'normal';
    const isEvent = eventType !== 'normal';
    const hasHarmonicDistortion = eventType === 'harmonic_distortion';

    // Per-phase voltage: nominal value varied by up to half the class tolerance either way.
    const baseVoltage = voltageClass.nominal;
    const phaseVoltage = () => roundTo(baseVoltage * (1 + (random() - 0.5) * voltageClass.tolerance), 2);
    const phaseA_V = phaseVoltage();
    const phaseB_V = phaseVoltage();
    const phaseC_V = phaseVoltage();

    // Per-phase current derived from three-phase apparent power.
    const apparentPower = Math.floor(100000 + random() * 50000000); // VA
    const avgVoltage = (phaseA_V + phaseB_V + phaseC_V) / 3;
    const baseCurrent = apparentPower / (Math.sqrt(3) * avgVoltage);
    const phaseCurrent = () => roundTo(baseCurrent * (0.9 + random() * 0.2), 2);
    const phaseA_I = phaseCurrent();
    const phaseB_I = phaseCurrent();
    const phaseC_I = phaseCurrent();

    // Power triangle.
    const powerFactor = roundTo(0.85 + random() * 0.14, 3);
    const activePower = Math.round(apparentPower * powerFactor);
    const reactivePower = Math.round(Math.sqrt(apparentPower ** 2 - activePower ** 2));
    const phaseAngle = roundTo(toDegrees(Math.acos(powerFactor)), 2); // degrees

    // Nominal frequency is 50 or 60 Hz, chosen per record.
    const nominalFreq = random() > 0.5 ? 60 : 50;
    const frequency = roundTo(nominalFreq + (random() - 0.5) * 0.1, 3);

    // Total harmonic distortion is elevated only for harmonic_distortion events.
    const thd_v = roundTo(hasHarmonicDistortion ? 3 + random() * 5 : random() * 2, 2);
    const thd_i = roundTo(hasHarmonicDistortion ? 5 + random() * 10 : random() * 3, 2);

    results.push({
      recordId: `PMU_${Date.now()}_${i}`,
      location: {
        substation: `${pick(substations)} Substation`,
        pmuId: `PMU${String(Math.floor(1 + random() * 999)).padStart(3, '0')}`,
        busNumber: Math.floor(1 + random() * 100),
        voltageClass: voltageClass.name,
        nominalVoltage: voltageClass.nominal,
        latitude: roundTo(30 + random() * 20, 6),
        longitude: roundTo(-100 + random() * 30, 6)
      },
      timestamp: new Date(Date.now() - random() * 3600000).toISOString(),
      voltage: {
        phaseA: phaseA_V,
        phaseB: phaseB_V,
        phaseC: phaseC_V,
        // One tenth of the summed phase magnitudes.
        // NOTE(review): this is not a phasor sum; confirm the intended model.
        neutral: roundTo(Math.abs(phaseA_V + phaseB_V + phaseC_V) / 10, 2),
        lineToLine: {
          AB: lineToLine(phaseA_V, phaseB_V),
          BC: lineToLine(phaseB_V, phaseC_V),
          CA: lineToLine(phaseC_V, phaseA_V)
        },
        unbalance: roundTo(random() * 2, 2) // percent
      },
      current: {
        phaseA: phaseA_I,
        phaseB: phaseB_I,
        phaseC: phaseC_I,
        // Root-sum-square of the three phase currents.
        neutral: roundTo(Math.sqrt(phaseA_I ** 2 + phaseB_I ** 2 + phaseC_I ** 2), 2),
        unbalance: roundTo(random() * 3, 2)
      },
      power: {
        active: activePower,
        reactive: reactivePower,
        apparent: apparentPower,
        powerFactor,
        phaseAngle
      },
      frequency: {
        value: frequency,
        rateOfChange: roundTo((random() - 0.5) * 0.1, 3), // Hz/s
        deviation: roundTo(frequency - nominalFreq, 3)
      },
      harmonics: {
        THD_voltage: thd_v,
        THD_current: thd_i,
        dominantHarmonic: Math.floor(3 + random() * 12) * 2 + 1, // Odd orders 7-29
        individual: {
          H3: roundTo(random() * 2, 2),
          H5: roundTo(random() * 3, 2),
          H7: roundTo(random() * 2, 2),
          H11: roundTo(random() * 1, 2)
        }
      },
      phasor: {
        voltage: {
          magnitude: roundTo(avgVoltage, 2),
          angle: roundTo(random() * 360, 2)
        },
        current: {
          magnitude: roundTo(baseCurrent, 2),
          angle: roundTo(random() * 360, 2)
        }
      },
      event: {
        type: eventType,
        severity: isEvent ? pick(['low', 'medium', 'high', 'critical']) : 'none',
        duration: isEvent ? Math.round(random() * 5000) : 0, // ms
        faultLocation: eventType === 'fault' ? {
          distance: roundTo(random() * 50, 2), // km
          impedance: roundTo(random() * 10, 2) // ohms
        } : null,
        switchingOperation: eventType === 'switching' ? {
          breaker: `CB${Math.floor(1 + random() * 50)}`,
          status: random() > 0.5 ? 'opened' : 'closed'
        } : null
      },
      quality: {
        timeError: Math.round(random() * 1000), // microseconds
        dataValidity: random() > 0.95 ? 'invalid' : 'valid',
        synchronizationSource: pick(['GPS', 'IRIG-B', 'NTP']),
        uncertaintyEstimate: roundTo(random() * 0.5, 3)
      },
      scrapedAt: new Date().toISOString()
    });
  }
  return results;
}
/**
 * Generate AIS (Automatic Identification System) maritime ship tracking data.
 *
 * Each record simulates one AIS report: vessel identity and dimensions, a
 * position inside one of several shipping-lane bounding boxes, navigation
 * state, message metadata, safety information and routing.
 *
 * Guarantees:
 * - navigation.heading is the course plus up to +/-5 degrees, wrapped into
 *   [0, 360);
 * - message.class is 'A' for message types 1, 2, 3 and 5 and 'B' otherwise;
 * - imo, destination and eta are populated only for message type 5 (static
 *   and voyage data) and are null for every other type;
 * - weatherConditions.seaState covers 0-9 inclusive.
 *
 * @param {number} count - Number of records to generate.
 * @param {*} seed - Seed forwarded to createSeededRandom.
 * @returns {Promise<Object[]>} The generated records.
 */
async function generateAISData(count, seed) {
  log.info('Generating AIS maritime tracking data...');
  const random = createSeededRandom(seed);
  const results = [];

  const vesselTypes = [
    { code: 30, name: 'Fishing' },
    { code: 60, name: 'Passenger' },
    { code: 70, name: 'Cargo' },
    { code: 80, name: 'Tanker' },
    { code: 36, name: 'Sailing' },
    { code: 37, name: 'Pleasure Craft' },
    { code: 52, name: 'Tug' },
    { code: 31, name: 'Towing' }
  ];
  const navStatuses = [
    'Under way using engine',
    'At anchor',
    'Not under command',
    'Restricted manoeuvrability',
    'Constrained by draught',
    'Moored',
    'Aground',
    'Engaged in fishing',
    'Under way sailing'
  ];
  const messageTypes = [1, 2, 3, 5, 18, 19, 21, 24, 27];
  // Position reports 1-3 and static/voyage message 5 are Class A transmissions.
  const classAMessageTypes = new Set([1, 2, 3, 5]);
  const VOYAGE_MESSAGE_TYPE = 5;
  const destinations = ['NEW YORK', 'ROTTERDAM', 'SINGAPORE', 'HONG KONG', 'SHANGHAI', 'LOS ANGELES',
    'HAMBURG', 'DUBAI', 'TOKYO', 'SOUTHAMPTON', 'PANAMA CANAL', 'SUEZ CANAL'];
  // Bounding boxes ([min, max] degrees) used to place vessels.
  const shippingLanes = [
    { name: 'North Atlantic', lat: [40, 50], lon: [-60, -10] },
    { name: 'Mediterranean', lat: [30, 45], lon: [0, 35] },
    { name: 'Panama Approach', lat: [5, 15], lon: [-85, -75] },
    { name: 'Malacca Strait', lat: [0, 6], lon: [98, 105] },
    { name: 'English Channel', lat: [49, 51], lon: [-5, 2] }
  ];
  const DAY_MS = 24 * 60 * 60 * 1000;

  const pick = (items) => items[Math.floor(random() * items.length)];
  const intFrom = (min, span) => Math.floor(min + random() * span);
  const roundTo = (value, decimals) => {
    const factor = 10 ** decimals;
    return Math.round(value * factor) / factor;
  };
  const letter = () => String.fromCharCode(65 + Math.floor(random() * 26));
  const randomMmsi = () => String(200000000 + Math.floor(random() * 799999999));
  const positionWithin = ([low, high]) => roundTo(low + random() * (high - low), 6);
  // Wrap an angle into [0, 360) and keep one decimal place.
  const wrapDegrees = (degrees) => {
    const wrapped = ((degrees % 360) + 360) % 360;
    const rounded = roundTo(wrapped, 1);
    return rounded === 360 ? 0 : rounded;
  };
  // Two independent draws: 'high' first, then 'medium' versus 'low'.
  const rateCollisionRisk = () => {
    if (random() > 0.85) return 'high';
    return random() > 0.6 ? 'medium' : 'low';
  };

  for (let i = 0; i < count; i++) {
    const vesselType = pick(vesselTypes);
    const messageType = pick(messageTypes);
    const lane = pick(shippingLanes);
    const isVoyageReport = messageType === VOYAGE_MESSAGE_TYPE;

    const latitude = positionWithin(lane.lat);
    const longitude = positionWithin(lane.lon);

    const speed = roundTo(random() * 25, 1); // knots
    const course = roundTo(random() * 360, 1); // degrees
    // The offset can push the raw value below 0 or past 360, hence the wrap.
    const heading = wrapDegrees(course + (random() - 0.5) * 10);
    const isMoving = speed > 0;

    results.push({
      recordId: `AIS_${Date.now()}_${i}`,
      vessel: {
        mmsi: randomMmsi(),
        imo: isVoyageReport ? String(1000000 + Math.floor(random() * 8999999)) : null,
        name: `${pick(['OCEAN', 'PACIFIC', 'ATLANTIC', 'MARINE', 'SEA', 'WAVE'])} ${pick(['STAR', 'VOYAGER', 'PIONEER', 'SPIRIT', 'VENTURE'])}`,
        callSign: `${letter()}${letter()}${intFrom(1000, 8999)}`,
        type: vesselType.name,
        typeCode: vesselType.code,
        flag: pick(['USA', 'UK', 'PANAMA', 'LIBERIA', 'MARSHALL IS', 'SINGAPORE', 'MALTA'])
      },
      dimensions: {
        length: intFrom(50, 350), // meters
        beam: intFrom(10, 50), // meters
        draught: roundTo(2 + random() * 15, 1), // meters
        toBow: intFrom(20, 150),
        toStern: intFrom(20, 150),
        toPort: intFrom(5, 20),
        toStarboard: intFrom(5, 20)
      },
      position: {
        latitude,
        longitude,
        accuracy: random() > 0.9 ? 'low' : 'high',
        timestamp: new Date(Date.now() - random() * 300000).toISOString(), // Within last 5 min
        positioningDevice: pick(['GPS', 'DGPS', 'Loran-C'])
      },
      navigation: {
        status: pick(navStatuses),
        speed,
        course,
        heading,
        rateOfTurn: roundTo((random() - 0.5) * 10, 2), // degrees/min
        destination: isVoyageReport ? pick(destinations) : null,
        eta: isVoyageReport ? new Date(Date.now() + (1 + random() * 10) * DAY_MS).toISOString() : null
      },
      message: {
        type: messageType,
        repeatIndicator: intFrom(0, 4),
        class: classAMessageTypes.has(messageType) ? 'A' : 'B',
        channel: random() > 0.5 ? 'A' : 'B',
        timeSlot: intFrom(0, 2250)
      },
      safety: {
        // A stationary vessel gets no collision assessment.
        collisionRisk: isMoving ? rateCollisionRisk() : 'none',
        closestApproach: isMoving ? {
          distance: roundTo(0.1 + random() * 10, 2), // nautical miles
          time: Math.round(5 + random() * 55), // minutes
          vesselMMSI: randomMmsi()
        } : null,
        inShippingLane: random() > 0.2,
        weatherConditions: {
          seaState: intFrom(0, 10), // Douglas scale 0-9
          visibility: roundTo(1 + random() * 9, 1), // nautical miles
          windSpeed: Math.round(random() * 40) // knots
        }
      },
      routing: {
        shippingLane: lane.name,
        nextWaypoint: {
          latitude: roundTo(latitude + (random() - 0.5) * 2, 6),
          longitude: roundTo(longitude + (random() - 0.5) * 2, 6),
          distance: roundTo(10 + random() * 200, 1), // nautical miles
          eta: new Date(Date.now() + random() * 86400000).toISOString()
        },
        routeDeviation: roundTo(random() * 5, 2), // nautical miles
        trafficDensity: pick(['low', 'medium', 'high', 'very high'])
      },
      scrapedAt: new Date().toISOString()
    });
  }
  return results;
}
/**
* Generate Radar data (weather and vehicle detection)
* Simulates reflectivity, velocity, and Doppler measurements
*/
/**
 * Generate synthetic radar detection records.
 *
 * Each record bundles a radar site description, one range-gate measurement,
 * a Doppler summary and quality flags. Records whose radar type is 'weather'
 * carry a `weather` payload (and `vehicle: null`); every other radar type
 * carries a `vehicle` payload (and `weather: null`).
 *
 * `recordId`, `measurement.timestamp` and `scrapedAt` are derived from the
 * wall clock, so they differ between runs even when the same seed is used.
 *
 * @param {number} count - Number of records to generate.
 * @param {*} seed - Passed straight to `createSeededRandom` (defined elsewhere in this file).
 * @returns {Promise<Array<Object>>} The generated records.
 */
async function generateRadarData(count, seed) {
    log.info('Generating Radar detection data...');
    const random = createSeededRandom(seed);
    // roundTo(v, 10) keeps one decimal, roundTo(v, 100) two decimals, and so on.
    const roundTo = (value, factor) => Math.round(value * factor) / factor;
    const pick = (items) => items[Math.floor(random() * items.length)];

    const radarTypes = ['weather', 'vehicle', 'marine', 'air_traffic'];
    const vehicleTypes = ['car', 'truck', 'motorcycle', 'bicycle', 'pedestrian'];
    const precipTypes = ['none', 'drizzle', 'rain', 'heavy_rain', 'snow', 'sleet', 'hail', 'mixed'];
    const hydrometeorClasses = ['biological', 'anomalous_prop', 'ice_crystals', 'dry_snow', 'wet_snow',
        'light_rain', 'moderate_rain', 'heavy_rain', 'hail', 'big_drops'];
    const siteNames = ['North', 'South', 'East', 'West', 'Central'];
    const scanModes = ['surveillance', 'tracking', 'doppler'];
    const trackingQualities = ['excellent', 'good', 'fair', 'poor'];
    const results = [];

    for (let i = 0; i < count; i++) {
        const radarType = pick(radarTypes);
        const isWeather = radarType === 'weather';

        // Range gate parameters
        const range = roundTo(0.1 + random() * 50, 100); // km
        const azimuth = roundTo(random() * 360, 10); // degrees
        const elevation = roundTo(random() * 20 - 5, 10); // degrees

        // Reflectivity (dBZ): wide span for weather radar, narrower otherwise
        const reflectivity = isWeather
            ? roundTo(-20 + random() * 80, 10) // -20 to 60 dBZ
            : roundTo(10 + random() * 30, 10);

        // Doppler (radial) velocity
        const velocity = roundTo(random() * 60 - 30, 10); // m/s

        // Weather-specific data; a storm cell is only attached to strong echoes
        const weatherData = isWeather ? {
            precipitationType: pick(precipTypes),
            precipitationRate: roundTo(random() * 100, 10), // mm/hr
            stormCell: reflectivity > 45 ? {
                id: `CELL${Math.floor(100 + random() * 899)}`,
                top: roundTo(5 + random() * 15, 100), // km
                vil: Math.round(random() * 80), // kg/m²
                severity: reflectivity > 55 ? 'severe' : 'moderate',
                movement: {
                    direction: Math.round(random() * 360),
                    speed: roundTo(10 + random() * 40, 10) // km/h
                }
            } : null,
            echoTop: roundTo(2 + random() * 18, 100), // km
            verticalIntegratedLiquid: Math.round(random() * 50), // kg/m²
            hydrometeorClassification: pick(hydrometeorClasses)
        } : null;

        // Object detections for every non-weather radar type
        const vehicleData = !isWeather ? {
            detections: Array.from({ length: Math.floor(1 + random() * 5) }, () => ({
                type: pick(vehicleTypes),
                range: roundTo(2 + random() * 200, 10), // meters
                azimuth: roundTo(random() * 180, 10), // degrees
                velocity: roundTo(random() * 50, 10), // m/s
                rcs: roundTo(random() * 40 - 10, 10), // dBsm (radar cross section)
                confidence: roundTo(0.5 + random() * 0.5, 100),
                trackId: Math.floor(1000 + random() * 8999)
            })),
            trackingQuality: pick(trackingQualities),
            multipath: random() > 0.8,
            clutter: random() > 0.7
        } : null;

        const recordId = `RADAR_${Date.now()}_${i}`;

        const radar = {
            id: `RADAR${String(Math.floor(1 + random() * 999)).padStart(3, '0')}`,
            type: radarType,
            location: {
                latitude: roundTo(25 + random() * 25, 1000000),
                longitude: roundTo(-125 + random() * 50, 1000000),
                altitude: Math.round(random() * 2000), // meters
                name: `${pick(siteNames)} Site`
            },
            specifications: {
                frequency: isWeather ? '2.7-3.0 GHz (S-band)' : '76-81 GHz (W-band)',
                wavelength: isWeather ? '10 cm' : '4 mm',
                beamWidth: roundTo(0.5 + random() * 2, 10), // degrees
                pulseWidth: roundTo(0.5 + random() * 2, 100), // microseconds
                prf: Math.round(300 + random() * 1700), // Hz (pulse repetition frequency)
                maxRange: isWeather ? 250 : 150, // km
                rangeResolution: Math.round(50 + random() * 200), // meters
                mode: pick(scanModes)
            }
        };

        const measurement = {
            timestamp: new Date(Date.now() - random() * 300000).toISOString(),
            scanNumber: Math.floor(1 + random() * 1000),
            elevationAngle: elevation,
            azimuthAngle: azimuth,
            range,
            gateSpacing: Math.round(100 + random() * 150), // meters
            reflectivity,
            velocity,
            spectrumWidth: roundTo(1 + random() * 10, 10), // m/s
            correlation: roundTo(0.7 + random() * 0.3, 1000),
            snr: roundTo(5 + random() * 35, 10), // dB
            zdr: isWeather ? roundTo(random() * 6 - 1, 10) : null, // Differential reflectivity (dB)
            kdp: isWeather ? roundTo(random() * 5, 100) : null, // Specific differential phase (deg/km)
            rhohv: isWeather ? roundTo(0.7 + random() * 0.3, 1000) : null // Correlation coefficient
        };

        const velocitySpectrum = Array.from({ length: 16 }, () => Math.round(random() * 100));
        const nyquistVelocity = roundTo(10 + random() * 20, 10); // m/s
        const doppler = {
            velocitySpectrum,
            nyquistVelocity,
            // Aliasing is judged against this record's own Nyquist velocity so the
            // flag can never contradict the value reported next to it.
            aliasing: Math.abs(velocity) > nyquistVelocity,
            spectralWidth: roundTo(1 + random() * 8, 10)
        };

        const quality = {
            clutter: random() > 0.7 ? 'high' : random() > 0.4 ? 'medium' : 'low',
            groundClutterSuppression: random() > 0.5,
            anomalousPropagation: random() > 0.9,
            blockage: random() > 0.85,
            calibrationStatus: random() > 0.95 ? 'needs_cal' : 'ok',
            dataQualityIndex: roundTo(0.6 + random() * 0.4, 100)
        };

        results.push({
            recordId,
            radar,
            measurement,
            weather: weatherData,
            vehicle: vehicleData,
            doppler,
            quality,
            scrapedAt: new Date().toISOString()
        });
    }
    return results;
}
// ============================================
// PRIORITY 2: EXOTIC DATA GENERATORS
// ============================================
/**
 * Generate synthetic SCADA / industrial-control tag snapshots.
 *
 * Each record represents one piece of equipment (pump, valve, motor, tank or
 * heater) with its process variables, raw PLC registers, control outputs,
 * derived setpoints, at most one active alarm and OPC UA addressing info.
 *
 * `timestamp` and `scrapedAt` are derived from the wall clock, so they differ
 * between runs even when the same seed is used.
 *
 * @param {number} count - Number of records to generate.
 * @param {*} seed - Passed straight to `createSeededRandom` (defined elsewhere in this file).
 * @returns {Promise<Array<Object>>} The generated records.
 */
async function generateSCADAData(count, seed) {
    log.info('Generating SCADA/Industrial control data...');
    const random = createSeededRandom(seed);
    const round1 = (value) => Math.round(value * 10) / 10;
    const pick = (items) => items[Math.floor(random() * items.length)];

    const equipment = {
        pump: { type: 'PUMP', maxPressure: 150, maxFlow: 500, units: { pressure: 'PSI', flow: 'GPM' } },
        valve: { type: 'VALVE', positions: ['OPEN', 'CLOSED', 'THROTTLING'], units: { position: '%' } },
        motor: { type: 'MOTOR', maxSpeed: 1800, maxCurrent: 50, units: { speed: 'RPM', current: 'A' } },
        tank: { type: 'TANK', maxLevel: 100, maxVolume: 10000, units: { level: '%', volume: 'GAL' } },
        heater: { type: 'HEATER', maxTemp: 300, maxPower: 100, units: { temp: 'F', power: 'kW' } }
    };
    const equipmentTypes = Object.keys(equipment);
    const alarmTypes = ['HIGH_LIMIT', 'LOW_LIMIT', 'RATE_OF_CHANGE', 'DEVIATION', 'COMM_FAILURE'];
    const qualityCodes = ['GOOD', 'BAD', 'UNCERTAIN', 'FORCED'];
    const controlModes = ['AUTO', 'MANUAL', 'CASCADE'];
    // Lowest temperature reported for heaters and tanks.
    const minTemp = 60;
    const results = [];

    for (let i = 0; i < count; i++) {
        const eqType = pick(equipmentTypes);
        const eqConfig = equipment[eqType];
        const tagPrefix = eqType.toUpperCase();
        const timestamp = new Date(Date.now() - random() * 24 * 60 * 60 * 1000);

        const processVars = {};
        switch (eqType) {
            case 'pump':
                processVars.pressure = round1(random() * eqConfig.maxPressure);
                processVars.flow = round1(random() * eqConfig.maxFlow);
                processVars.vibration = Math.round((random() * 10) * 100) / 100;
                break;
            case 'valve':
                processVars.position = round1(random() * 100);
                processVars.command = round1(random() * 100);
                // Feedback trails the command by up to ±1 %. It is rounded like every
                // other process variable and clamped to the 0-100 % travel range.
                processVars.feedback = Math.min(100, Math.max(0,
                    round1(processVars.command + (random() - 0.5) * 2)));
                break;
            case 'motor':
                processVars.speed = round1(random() * eqConfig.maxSpeed);
                processVars.current = round1(random() * eqConfig.maxCurrent);
                processVars.torque = round1(random() * 100);
                break;
            case 'tank':
                processVars.level = round1(random() * eqConfig.maxLevel);
                processVars.volume = round1(processVars.level / 100 * eqConfig.maxVolume);
                processVars.temperature = round1(minTemp + random() * 100);
                break;
            case 'heater':
                // Scale across [minTemp, maxTemp] so the reading never exceeds the
                // heater's declared maximum.
                processVars.temperature = round1(minTemp + random() * (eqConfig.maxTemp - minTemp));
                processVars.setpoint = round1(100 + random() * 200);
                processVars.power = round1(random() * eqConfig.maxPower);
                break;
            default:
                break;
        }

        // Roughly 15 % of records carry a single active alarm.
        const activeAlarms = [];
        if (random() > 0.85) {
            const alarmType = pick(alarmTypes);
            activeAlarms.push({
                type: alarmType,
                priority: Math.floor(1 + random() * 4),
                message: `${tagPrefix}_${i + 1}: ${alarmType}`,
                acknowledgedAt: random() > 0.5
                    ? new Date(timestamp.getTime() + random() * 60000).toISOString()
                    : null
            });
        }

        const tagId = `${tagPrefix}_${String(i + 1).padStart(4, '0')}`;
        const location = `AREA_${Math.floor(random() * 5) + 1}`;
        const plcRegisters = {
            holding: Array.from({ length: 8 }, () => Math.floor(random() * 65536)),
            input: Array.from({ length: 4 }, () => Math.floor(random() * 65536)),
            coil: Array.from({ length: 4 }, () => random() > 0.5)
        };
        const controlOutputs = {
            analogOut: round1(random() * 100),
            digitalOut: random() > 0.5,
            mode: pick(controlModes)
        };

        // Each numeric process variable gets a setpoint within ±10 % of its value.
        const setpoints = {};
        for (const [name, value] of Object.entries(processVars)) {
            if (typeof value === 'number') {
                setpoints[name] = round1(value * (0.9 + random() * 0.2));
            }
        }

        results.push({
            tagId,
            equipmentType: eqConfig.type,
            location,
            timestamp: timestamp.toISOString(),
            processVariables: processVars,
            plcRegisters,
            controlOutputs,
            setpoints,
            alarms: activeAlarms,
            qualityCode: pick(qualityCodes),
            opcua: {
                nodeId: `ns=2;s=${tagPrefix}.${i + 1}`,
                browseName: `${tagPrefix}_${i + 1}`,
                statusCode: random() > 0.95 ? 'Bad' : 'Good'
            },
            scrapedAt: new Date().toISOString()
        });
    }
    return results;
}
/**
 * Generate synthetic LiDAR scan records.
 *
 * Each record describes one scan: a total point count, the first 100 points
 * of a 1000-point generated sample, the bounding box of that 1000-point
 * sample, a list of up to nine detected objects and sensor metadata.
 *
 * `timestamp`, `scanId` and `scrapedAt` are derived from the wall clock, so
 * they differ between runs even when the same seed is used.
 *
 * @param {number} count - Number of scans to generate.
 * @param {*} seed - Passed straight to `createSeededRandom` (defined elsewhere in this file).
 * @returns {Promise<Array<Object>>} The generated scan records.
 */
async function generateLiDARData(count, seed) {
    log.info('Generating LiDAR point cloud data...');
    const random = createSeededRandom(seed);
    const roundTo = (value, factor) => Math.round(value * factor) / factor;
    const pick = (items) => items[Math.floor(random() * items.length)];
    // Uniform integer in [0, 255], i.e. the full 8-bit range.
    const randomByte = () => Math.floor(random() * 256);

    const scanPatterns = ['ROTATING_360', 'SOLID_STATE', 'FLASH', 'MEMS_MIRROR'];
    const classifications = [
        { code: 0, name: 'NEVER_CLASSIFIED' },
        { code: 1, name: 'UNCLASSIFIED' },
        { code: 2, name: 'GROUND' },
        { code: 3, name: 'LOW_VEGETATION' },
        { code: 4, name: 'MEDIUM_VEGETATION' },
        { code: 5, name: 'HIGH_VEGETATION' },
        { code: 6, name: 'BUILDING' },
        { code: 7, name: 'LOW_POINT' },
        { code: 9, name: 'WATER' },
        { code: 13, name: 'VEHICLE' },
        { code: 14, name: 'PEDESTRIAN' }
    ];
    const objectTypes = ['VEHICLE', 'PEDESTRIAN', 'CYCLIST', 'OBSTACLE', 'TRAFFIC_SIGN'];
    // Only this many points are materialised per scan; numPoints reports the nominal total.
    const maxGeneratedPoints = 1000;
    const results = [];

    for (let i = 0; i < count; i++) {
        const timestamp = new Date(Date.now() - random() * 3600 * 1000);
        const numPoints = Math.floor(10000 + random() * 90000);
        const scanPattern = pick(scanPatterns);

        const points = Array.from({ length: Math.min(numPoints, maxGeneratedPoints) }, (_, idx) => {
            // Points sweep one full revolution across the generated sample.
            const angle = (idx / maxGeneratedPoints) * 2 * Math.PI;
            const distance = 2 + random() * 100;
            const classification = pick(classifications);
            const z = roundTo((random() - 0.5) * 10, 1000);
            const intensity = randomByte();
            // A pulse yields 1-5 returns; this point's return index must lie within that count.
            const numberOfReturns = Math.floor(1 + random() * 5);
            const returnNumber = Math.floor(1 + random() * numberOfReturns);
            const scanAngle = roundTo((random() - 0.5) * 60, 10);
            const rgb = random() > 0.5
                ? { r: randomByte(), g: randomByte(), b: randomByte() }
                : null;
            return {
                x: roundTo(distance * Math.cos(angle), 1000),
                y: roundTo(distance * Math.sin(angle), 1000),
                z,
                intensity,
                returnNumber,
                numberOfReturns,
                classification: classification.code,
                classificationName: classification.name,
                scanAngle,
                rgb
            };
        });

        const detections = Array.from({ length: Math.floor(random() * 10) }, () => {
            const objType = pick(objectTypes);
            // Centers are rounded to millimetres, matching the point coordinates.
            const center = {
                x: roundTo((random() - 0.5) * 100, 1000),
                y: roundTo((random() - 0.5) * 100, 1000),
                z: roundTo(random() * 2, 1000)
            };
            const isStatic = objType === 'TRAFFIC_SIGN' || objType === 'OBSTACLE';
            return {
                objectType: objType,
                confidence: roundTo(0.5 + random() * 0.5, 1000),
                boundingBox: {
                    center,
                    dimensions: {
                        length: roundTo(2 + random() * 8, 100),
                        width: roundTo(1.5 + random() * 3, 100),
                        height: roundTo(1 + random() * 3, 100)
                    },
                    rotation: roundTo(random() * 360, 10)
                },
                // Static object types carry no velocity vector.
                velocity: isStatic ? null : {
                    x: roundTo((random() - 0.5) * 30, 100),
                    y: roundTo((random() - 0.5) * 30, 100),
                    z: roundTo((random() - 0.5) * 2, 100)
                },
                trackId: `TRK_${Math.floor(random() * 1000)}`
            };
        });

        // Single pass over the generated points instead of six spread calls.
        const bounds = {
            minX: Infinity, maxX: -Infinity,
            minY: Infinity, maxY: -Infinity,
            minZ: Infinity, maxZ: -Infinity
        };
        for (const point of points) {
            bounds.minX = Math.min(bounds.minX, point.x);
            bounds.maxX = Math.max(bounds.maxX, point.x);
            bounds.minY = Math.min(bounds.minY, point.y);
            bounds.maxY = Math.max(bounds.maxY, point.y);
            bounds.minZ = Math.min(bounds.minZ, point.z);
            bounds.maxZ = Math.max(bounds.maxZ, point.z);
        }

        results.push({
            scanId: `SCAN_${timestamp.getTime()}_${i}`,
            timestamp: timestamp.toISOString(),
            sensorId: `LIDAR_${Math.floor(random() * 10) + 1}`,
            scanPattern,
            pointCloud: {
                numPoints,
                samplePoints: points.slice(0, 100),
                format: 'LAS_1.4',
                coordinateSystem: 'WGS84_UTM',
                bounds
            },
            detections,
            metadata: {
                horizontalFov: scanPattern === 'ROTATING_360' ? 360 : 120,
                verticalFov: roundTo(30 + random() * 40, 10),
                range: roundTo(50 + random() * 200, 10),
                accuracy: roundTo(0.01 + random() * 0.05, 1000),
                scanRate: roundTo(5 + random() * 15, 10)
            },
            scrapedAt: new Date().toISOString()
        });
    }
    return results;
}
/**
 * Generate synthetic CAN bus frames.
 *
 * Every frame belongs to one of five ECUs (engine, transmission, chassis,
 * body, battery). The frame carries eight random payload bytes rendered as
 * hex, a set of decoded signals specific to that ECU, and bus statistics.
 * The payload bytes are drawn independently of the decoded signals.
 *
 * `timestamp`, `messageId` and `scrapedAt` are derived from the wall clock.
 *
 * @param {number} count - Number of frames to generate.
 * @param {*} seed - Passed straight to `createSeededRandom` (defined elsewhere in this file).
 * @returns {Promise<Array<Object>>} The generated frames.
 */
async function generateCANBusData(count, seed) {
    log.info('Generating CAN bus vehicle data...');
    const nextRandom = createSeededRandom(seed);
    const oneDecimal = (value) => Math.round(value * 10) / 10;
    const chooseFrom = (options) => options[Math.floor(nextRandom() * options.length)];
    const toHex = (value, width) => `0x${value.toString(16).toUpperCase().padStart(width, '0')}`;

    // One entry per ECU: its arbitration ID and a builder for its decoded signals.
    // Builders draw random numbers in the same order the signal keys are listed.
    const ecuCatalog = [
        {
            name: 'engine',
            arbitrationId: 0x0C0,
            readSignals: () => ({
                rpm: Math.floor(800 + nextRandom() * 6000),
                throttle: oneDecimal(nextRandom() * 100),
                coolant_temp: oneDecimal(70 + nextRandom() * 50),
                oil_pressure: oneDecimal(20 + nextRandom() * 80),
                intake_temp: oneDecimal(20 + nextRandom() * 80),
                maf: oneDecimal(10 + nextRandom() * 200)
            })
        },
        {
            name: 'transmission',
            arbitrationId: 0x0D0,
            readSignals: () => ({
                gear: Math.floor(nextRandom() * 6) + 1,
                clutch: oneDecimal(nextRandom() * 100),
                shift_position: chooseFrom(['P', 'R', 'N', 'D', 'S']),
                torque_converter: oneDecimal(nextRandom() * 100)
            })
        },
        {
            name: 'chassis',
            arbitrationId: 0x1A0,
            readSignals: () => {
                const speed = oneDecimal(nextRandom() * 120);
                // Each wheel deviates from vehicle speed by a factor in [0.95, 1.05).
                const wheelSpeed = () => oneDecimal(speed * (0.95 + nextRandom() * 0.1));
                return {
                    speed,
                    brake_pressure: oneDecimal(nextRandom() * 2000),
                    steering_angle: oneDecimal((nextRandom() - 0.5) * 900),
                    abs_active: nextRandom() > 0.9,
                    traction_control: nextRandom() > 0.85,
                    wheel_speed_fl: wheelSpeed(),
                    wheel_speed_fr: wheelSpeed(),
                    wheel_speed_rl: wheelSpeed(),
                    wheel_speed_rr: wheelSpeed()
                };
            }
        },
        {
            name: 'body',
            arbitrationId: 0x2C0,
            readSignals: () => ({
                door_driver: nextRandom() > 0.9,
                door_passenger: nextRandom() > 0.9,
                door_rear_left: nextRandom() > 0.95,
                door_rear_right: nextRandom() > 0.95,
                trunk: nextRandom() > 0.98,
                lights: chooseFrom(['OFF', 'PARKING', 'LOW_BEAM', 'HIGH_BEAM']),
                windows: {
                    driver: Math.floor(nextRandom() * 100),
                    passenger: Math.floor(nextRandom() * 100),
                    rear_left: Math.floor(nextRandom() * 100),
                    rear_right: Math.floor(nextRandom() * 100)
                }
            })
        },
        {
            name: 'battery',
            arbitrationId: 0x3E0,
            readSignals: () => ({
                voltage: Math.round((12 + nextRandom() * 3) * 100) / 100,
                current: oneDecimal((nextRandom() - 0.5) * 200),
                soc: oneDecimal(20 + nextRandom() * 80),
                temperature: oneDecimal(15 + nextRandom() * 40)
            })
        }
    ];

    const frames = [];
    for (let frameIndex = 0; frameIndex < count; frameIndex++) {
        const capturedAt = new Date(Date.now() - nextRandom() * 3600 * 1000);
        const ecu = chooseFrom(ecuCatalog);
        const label = ecu.name.toUpperCase();
        const signals = ecu.readSignals();

        const payload = [];
        for (let byteIndex = 0; byteIndex < 8; byteIndex++) {
            payload.push(toHex(Math.floor(nextRandom() * 256), 2));
        }

        const dbcDecoded = {
            messageName: `${label}_STATUS`,
            cycletime: Math.floor(10 + nextRandom() * 90),
            signalCount: Object.keys(signals).length
        };
        const busLoad = oneDecimal(nextRandom() * 100);
        const errorFrames = Math.floor(nextRandom() * 5);

        frames.push({
            messageId: `CAN_${capturedAt.getTime()}_${frameIndex}`,
            timestamp: capturedAt.toISOString(),
            arbitrationId: toHex(ecu.arbitrationId, 3),
            ecuName: label,
            dlc: 8,
            data: payload.join(' '),
            signals,
            dbcDecoded,
            busLoad,
            errorFrames,
            scrapedAt: new Date().toISOString()
        });
    }
    return frames;
}
/**
 * Generate synthetic genomic variant records shaped like annotated VCF rows.
 *
 * Each record holds a single-nucleotide, single-ALT variant (`vcfRecord`),
 * a functional annotation, population frequencies, in-silico predictions and
 * a clinical-significance summary. Values are random and not biologically
 * consistent with one another beyond the invariants noted inline.
 *
 * `scrapedAt` is derived from the wall clock.
 *
 * @param {number} count - Number of variant records to generate.
 * @param {*} seed - Passed straight to `createSeededRandom` (defined elsewhere in this file).
 * @returns {Promise<Array<Object>>} The generated records.
 */
async function generateGenomicVCFData(count, seed) {
    log.info('Generating genomic VCF variant data...');
    const random = createSeededRandom(seed);
    const roundTo = (value, factor) => Math.round(value * factor) / factor;
    const pick = (items) => items[Math.floor(random() * items.length)];

    const chromosomes = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y', 'MT'];
    const bases = ['A', 'C', 'G', 'T'];
    const consequences = ['MISSENSE', 'SYNONYMOUS', 'NONSENSE', 'FRAMESHIFT', 'SPLICE_SITE', 'INTRONIC', 'UTR_5', 'UTR_3', 'INTERGENIC'];
    const impacts = ['HIGH', 'MODERATE', 'LOW', 'MODIFIER'];
    // Filters for variants that do not pass; 'PASS' is reserved for qual > 30.
    const failFilters = ['LOW_QUAL', 'STRAND_BIAS', 'LOW_DEPTH'];
    // Every record has exactly one ALT allele, so genotypes may only use allele
    // indices 0 (REF) and 1 (ALT).
    const genotypes = ['0/0', '0/1', '1/1'];
    const geneNames = ['BRCA1', 'TP53', 'EGFR', 'KRAS', 'PTEN', 'MYC', 'NOTCH1', 'APC', 'RB1', 'VHL', 'CDKN2A', 'PIK3CA'];
    const refResidues = ['Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Gln', 'Glu'];
    const altResidues = ['Val', 'Leu', 'Ile', 'Met'];
    const clinvarClasses = ['BENIGN', 'LIKELY_BENIGN', 'UNCERTAIN', 'LIKELY_PATHOGENIC', 'PATHOGENIC'];
    const reviewStatuses = ['NO_ASSERTION', 'SINGLE_SUBMITTER', 'MULTIPLE_SUBMITTERS', 'EXPERT_PANEL'];
    const conditionNames = ['Hereditary cancer syndrome', 'Familial adenomatous polyposis'];
    const populationFrequency = () => roundTo(random() * 0.1, 100000);
    const results = [];

    for (let i = 0; i < count; i++) {
        const chrom = pick(chromosomes);
        const pos = Math.floor(1000000 + random() * 200000000);
        const ref = pick(bases);
        // ALT is always one of the three bases that differ from REF.
        const alt = pick(bases.filter((base) => base !== ref));
        const qual = roundTo(random() * 1000, 10);
        const filter = qual > 30 ? 'PASS' : pick(failFilters);
        const genotype = pick(genotypes);
        const gene = pick(geneNames);
        const consequence = pick(consequences);
        const impact = pick(impacts);

        const id = random() > 0.7 ? `rs${Math.floor(1000000 + random() * 99000000)}` : '.';
        const info = {
            DP: Math.floor(10 + random() * 200),
            AF: roundTo(random(), 1000),
            AC: Math.floor(1 + random() * 10),
            AN: Math.floor(10 + random() * 100),
            BaseQRankSum: roundTo((random() - 0.5) * 10, 100),
            MQ: roundTo(40 + random() * 20, 10),
            MQRankSum: roundTo((random() - 0.5) * 5, 100),
            ReadPosRankSum: roundTo((random() - 0.5) * 5, 100)
        };

        const sampleDepth = Math.floor(10 + random() * 100);
        const sampleQuality = Math.floor(random() * 99);
        // Allelic depths as "ref,alt": both non-zero ranges for het, ref-only for
        // hom-ref, alt-only for hom-alt.
        let allelicDepth;
        if (genotype === '0/1') {
            allelicDepth = `${Math.floor(random() * 50)},${Math.floor(random() * 50)}`;
        } else if (genotype === '1/1') {
            allelicDepth = `0,${Math.floor(random() * 100)}`;
        } else {
            allelicDepth = `${Math.floor(random() * 100)},0`;
        }

        const annotation = {
            gene,
            transcript: `${gene}-001`,
            consequence,
            impact,
            proteinChange: consequence === 'MISSENSE'
                ? `p.${pick(refResidues)}${Math.floor(1 + random() * 500)}${pick(altResidues)}`
                : null,
            cdnaChange: `c.${Math.floor(1 + random() * 3000)}${ref}>${alt}`,
            exon: consequence !== 'INTRONIC' ? `${Math.floor(1 + random() * 20)}/20` : null
        };

        const populationFrequencies = {
            gnomAD_AF: populationFrequency(),
            gnomAD_AF_afr: populationFrequency(),
            gnomAD_AF_eas: populationFrequency(),
            gnomAD_AF_nfe: populationFrequency(),
            ExAC_AF: populationFrequency(),
            '1000g_AF': populationFrequency()
        };

        const predictions = {
            SIFT: random() > 0.5 ? 'TOLERATED' : 'DELETERIOUS',
            SIFT_score: roundTo(random(), 1000),
            PolyPhen: random() > 0.5 ? 'BENIGN' : 'PROBABLY_DAMAGING',
            PolyPhen_score: roundTo(random(), 1000),
            CADD_phred: roundTo(random() * 40, 10),
            GERP_RS: roundTo((random() - 0.5) * 12, 100)
        };

        const clinicalSignificance = {
            clinvar: pick(clinvarClasses),
            reviewStatus: pick(reviewStatuses),
            // Always an array (possibly empty) so consumers see one stable type.
            conditions: random() > 0.7 ? [pick(conditionNames)] : []
        };

        results.push({
            variantId: `VAR_${chrom}_${pos}_${i}`,
            vcfRecord: {
                chrom,
                pos,
                id,
                ref,
                alt,
                qual,
                filter,
                info,
                format: ['GT', 'DP', 'GQ', 'AD'],
                samples: [{
                    GT: genotype,
                    DP: sampleDepth,
                    GQ: sampleQuality,
                    AD: allelicDepth
                }]
            },
            annotation,
            populationFrequencies,
            predictions,
            clinicalSignificance,
            scrapedAt: new Date().toISOString()
        });
    }
    return results;
}
/**
 * Generate synthetic multi-spectral satellite scene records.
 *
 * Each record describes one scene: platform, acquisition time, location and
 * footprint polygon, one random pixel value per spectral band, cloud cover,
 * spectral indices computed from those pixel values, and quality metadata.
 *
 * `timestamp`, `acquisitionDate`, `sceneId` and `scrapedAt` are derived from
 * the wall clock, so they differ between runs even with the same seed.
 *
 * @param {number} count - Number of scene records to generate.
 * @param {*} seed - Passed straight to `createSeededRandom` (defined elsewhere in this file).
 * @returns {Promise<Array<Object>>} The generated records.
 */
async function generateSatelliteData(count, seed) {
    log.info('Generating satellite multi-spectral imagery data...');
    const random = createSeededRandom(seed);
    const roundTo = (value, factor) => Math.round(value * factor) / factor;
    const pick = (items) => items[Math.floor(random() * items.length)];

    const landsatBands = ['Coastal', 'Blue', 'Green', 'Red', 'NIR', 'SWIR1', 'SWIR2', 'Cirrus', 'TIR1', 'TIR2'];
    const sentinelBands = ['Coastal', 'Blue', 'Green', 'Red', 'RedEdge1', 'RedEdge2', 'RedEdge3', 'NIR', 'SWIR1', 'SWIR2'];
    // Every selectable satellite has its own entry; sister platforms share a band
    // list so no platform silently falls back to another platform's bands.
    const platforms = {
        'Landsat-8': { bands: landsatBands, resolution: 30 },
        'Landsat-9': { bands: landsatBands, resolution: 30 },
        'Sentinel-2A': { bands: sentinelBands, resolution: 10 },
        'Sentinel-2B': { bands: sentinelBands, resolution: 10 },
        'MODIS': { bands: ['Red', 'NIR', 'Blue', 'Green', 'SWIR', 'TIR'], resolution: 250 },
        'WorldView-3': { bands: ['Coastal', 'Blue', 'Green', 'Yellow', 'Red', 'RedEdge', 'NIR1', 'NIR2'], resolution: 1.24 },
        'Planet': { bands: ['Blue', 'Green', 'Red', 'NIR'], resolution: 3 }
    };
    const satellites = Object.keys(platforms);
    const processingLevels = ['L1C', 'L1T', 'L2A', 'L2SP'];
    const qualityLevels = ['EXCELLENT', 'GOOD', 'FAIR', 'POOR'];
    const tileSuffixes = ['A', 'B', 'C', 'D'];

    // Wavelength range label per band; bands not listed here fall back to the
    // '0.43-0.45' label.
    const wavelengthFor = (bandName) => {
        if (bandName === 'Blue') return '0.45-0.51';
        if (bandName === 'Green') return '0.53-0.59';
        if (bandName === 'Red') return '0.64-0.67';
        if (bandName === 'NIR' || bandName === 'NIR1') return '0.85-0.88';
        if (bandName === 'SWIR1') return '1.57-1.65';
        if (bandName === 'SWIR2') return '2.11-2.29';
        if (bandName.includes('TIR')) return '10.6-12.5';
        return '0.43-0.45';
    };
    const sensorFor = (satellite) => {
        if (satellite.includes('Landsat')) return 'OLI/TIRS';
        if (satellite.includes('Sentinel')) return 'MSI';
        return 'Unknown';
    };
    const cloudLevelFor = (percentage) => {
        if (percentage < 10) return 'CLEAR';
        if (percentage < 30) return 'PARTLY_CLOUDY';
        if (percentage < 70) return 'MOSTLY_CLOUDY';
        return 'OVERCAST';
    };
    // Rounded ratio to three decimals; 0 when the denominator is zero.
    const safeRatio = (numerator, denominator) =>
        (denominator !== 0 ? roundTo(numerator / denominator, 1000) : 0);

    const results = [];
    for (let i = 0; i < count; i++) {
        const satellite = pick(satellites);
        const { bands: satelliteBands, resolution } = platforms[satellite];
        const timestamp = new Date(Date.now() - random() * 365 * 24 * 60 * 60 * 1000);
        const lat = (random() - 0.5) * 180;
        const lon = (random() - 0.5) * 360;
        const cloudCover = roundTo(random() * 100, 10);

        const pixelValues = {};
        for (const band of satelliteBands) {
            // Thermal bands use a lower ceiling than the 16-bit reflective bands.
            const maxValue = band.includes('TIR') ? 40000 : 65535;
            pixelValues[band] = Math.floor(random() * maxValue);
        }

        const blue = pixelValues['Blue'] ?? 0;
        const green = pixelValues['Green'] ?? 0;
        const red = pixelValues['Red'] ?? 0;
        // WorldView-3 names its near-infrared band 'NIR1'.
        const nir = pixelValues['NIR'] ?? pixelValues['NIR1'] ?? 0;

        const indices = {
            NDVI: safeRatio(nir - red, nir + red),
            // Guard the EVI formula's own denominator, not the NDVI one.
            EVI: safeRatio(2.5 * (nir - red), nir + 6 * red - 7.5 * blue + 1),
            NDWI: safeRatio(green - nir, green + nir),
            SAVI: nir + red !== 0 ? roundTo(1.5 * (nir - red) / (nir + red + 0.5), 1000) : 0
        };

        const processingLevel = pick(processingLevels);
        const path = Math.floor(1 + random() * 233);
        const row = Math.floor(1 + random() * 248);

        results.push({
            sceneId: `${satellite.replace('-', '')}_${timestamp.getTime()}_${i}`,
            satellite,
            sensor: sensorFor(satellite),
            timestamp: timestamp.toISOString(),
            acquisitionDate: timestamp.toISOString().split('T')[0],
            processingLevel,
            location: {
                centerLat: roundTo(lat, 100000),
                centerLon: roundTo(lon, 100000),
                path,
                row,
                // Combined reference built from the same path/row reported above.
                wrs: `${path}/${row}`
            },
            geometry: {
                type: 'Polygon',
                coordinates: [[
                    [lon, lat],
                    [lon + 0.1, lat],
                    [lon + 0.1, lat + 0.1],
                    [lon, lat + 0.1],
                    [lon, lat]
                ]]
            },
            bands: satelliteBands.map((bandName) => ({
                name: bandName,
                wavelength: wavelengthFor(bandName),
                resolution,
                pixelValue: pixelValues[bandName],
                units: bandName.includes('TIR') ? 'Kelvin' : 'DN'
            })),
            cloudCover: {
                percentage: cloudCover,
                level: cloudLevelFor(cloudCover),
                cloudMask: Array.from({ length: 100 }, () => random() < cloudCover / 100)
            },
            indices,
            metadata: {
                sunElevation: roundTo(30 + random() * 60, 100),
                sunAzimuth: roundTo(random() * 360, 100),
                viewAngle: roundTo(random() * 30, 100),
                // Same nominal resolution as reported on every band of this scene.
                resolution,
                format: 'GeoTIFF',
                projection: 'EPSG:4326',
                tileId: `T${Math.floor(10 + random() * 50)}${pick(tileSuffixes)}`
            },
            qualityAssessment: {
                overallQuality: pick(qualityLevels),
                radiometricQuality: roundTo(random() * 10, 10),
                geometricQuality: roundTo(random() * 10, 10),
                artifacts: random() > 0.8,
                stripingDetected: random() > 0.95
            },
            scrapedAt: new Date().toISOString()
        });
    }
    return results;
}