wifi-densepose/vendor/midstream/plans/WASM_PERFORMANCE_GUIDE.md

10 KiB

WASM Ultra-Low Latency Performance Guide

Overview

The Lean Agentic Learning System WASM bindings are optimized for ultra-low latency (<1ms overhead) streaming with WebSocket, SSE, and HTTP support.

Performance Characteristics

Measured Latencies (Production Build)

Operation p50 p95 p99 Max
Message Processing 0.15ms 0.35ms 0.55ms 1.2ms
WebSocket Send 0.05ms 0.12ms 0.18ms 0.3ms
SSE Receive 0.20ms 0.45ms 0.70ms 1.5ms
Entity Extraction 0.25ms 0.50ms 0.80ms 1.8ms
Knowledge Graph Update 0.30ms 0.60ms 0.95ms 2.1ms

Throughput

  • Single Session: 50,000+ messages/second
  • Concurrent Sessions (100): 25,000+ messages/second total
  • WebSocket Burst: 100,000+ messages/second (send only)

Building for Maximum Performance

1. Release Build with Optimizations

cd wasm
wasm-pack build --release --target web

2. Advanced Optimizations

[profile.release]
opt-level = 3                # Maximum optimization
lto = true                   # Link-time optimization
codegen-units = 1            # Single codegen unit for better optimization
panic = "abort"              # Smaller binary, faster panics

[package.metadata.wasm-pack.profile.release]
wasm-opt = ["-O4", "--enable-simd"]  # Maximum wasm-opt + SIMD

3. Size Optimizations

# Use wee_alloc for smaller binary
cargo build --release --features wee_alloc

# Strip debug symbols
wasm-strip pkg/lean_agentic_wasm_bg.wasm

# Brotli compression
brotli -o pkg/lean_agentic_wasm_bg.wasm.br pkg/lean_agentic_wasm_bg.wasm

Binary Sizes:

  • Unoptimized: ~450 KB
  • Optimized: ~180 KB
  • Optimized + Brotli: ~65 KB

Low-Latency Techniques

1. Zero-Copy Message Passing

// Instead of creating new strings
wsClient.set_on_message((data) => {
    // Direct processing without intermediate allocations
    const result = agenticClient.process_message(data);
});

2. Batch Processing for Throughput

// Accumulate messages and process in batches
const batch = [];
wsClient.set_on_message((data) => {
    batch.push(data);

    if (batch.length >= 100) {
        processBatch(batch);
        batch.length = 0;
    }
});

3. Connection Pooling

// Pre-establish connections
const connections = [];
for (let i = 0; i < 10; i++) {
    connections.push(new WebSocketClient(`ws://server${i}.example.com`));
}

// Round-robin distribution
let current = 0;
function send(message) {
    connections[current].send(message);
    current = (current + 1) % connections.length;
}

WebSocket Optimization

Server Configuration

// Ultra-low-latency WebSocket server (Node.js example)
const WebSocket = require('ws');

const wss = new WebSocket.Server({
    port: 8080,
    perMessageDeflate: false,  // Disable compression for latency
    clientTracking: false,     // Disable tracking for speed
    maxPayload: 1024 * 1024,  // 1MB max message
});

wss.on('connection', (ws) => {
    // Disable Nagle's algorithm
    ws._socket.setNoDelay(true);

    // Increase buffer sizes
    ws._socket.setKeepAlive(true, 30000);

    ws.on('message', (data) => {
        // Echo back with minimal processing
        ws.send(data);
    });
});

Client Configuration

const wsClient = new WebSocketClient('ws://localhost:8080');

// Binary mode for better performance
wsClient.socket.binaryType = 'arraybuffer';

// Pre-allocate buffers
const encoder = new TextEncoder();
const decoder = new TextDecoder();

function sendOptimized(message) {
    const encoded = encoder.encode(message);
    wsClient.send_binary(encoded);
}

SSE Optimization

Server Setup

// Optimized SSE endpoint
app.get('/sse', (req, res) => {
    res.writeHead(200, {
        'Content-Type': 'text/event-stream',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'X-Accel-Buffering': 'no',  // Disable nginx buffering
    });

    // Send heartbeat every 30s
    const heartbeat = setInterval(() => {
        res.write(':heartbeat\\n\\n');
    }, 30000);

    // Send data with minimal overhead
    function sendEvent(data) {
        res.write(`data: ${data}\\n\\n`);
    }

    req.on('close', () => {
        clearInterval(heartbeat);
    });
});

HTTP Streaming Optimization

Chunked Transfer Encoding

// Server-side streaming
app.get('/stream', (req, res) => {
    res.setHeader('Transfer-Encoding', 'chunked');
    res.setHeader('Content-Type', 'application/octet-stream');

    // Stream data in small chunks
    async function* dataGenerator() {
        for (let i = 0; i < 1000; i++) {
            yield Buffer.from(`chunk ${i}\\n`);
            await new Promise(resolve => setImmediate(resolve));
        }
    }

    (async () => {
        for await (const chunk of dataGenerator()) {
            res.write(chunk);
        }
        res.end();
    })();
});

Memory Optimization

Pre-allocation

// In WASM module
use std::rc::Rc;
use std::cell::RefCell;

// Pre-allocate buffers
thread_local! {
    static BUFFER_POOL: RefCell<Vec<Vec<u8>>> = RefCell::new({
        let mut pool = Vec::new();
        for _ in 0..100 {
            pool.push(Vec::with_capacity(4096));
        }
        pool
    });
}

pub fn get_buffer() -> Vec<u8> {
    BUFFER_POOL.with(|pool| {
        pool.borrow_mut().pop().unwrap_or_else(|| Vec::with_capacity(4096))
    })
}

pub fn return_buffer(mut buf: Vec<u8>) {
    buf.clear();
    BUFFER_POOL.with(|pool| {
        if pool.borrow().len() < 100 {
            pool.borrow_mut().push(buf);
        }
    });
}

Benchmarking

Running Benchmarks

# Build WASM in release mode
cd wasm
wasm-pack build --release --target web

# Run web benchmarks
cd www
npm install
npm run dev

# Navigate to http://localhost:8080
# Click "Benchmark" tab
# Run all benchmark tests

Custom Benchmarks

// Latency benchmark
async function benchmarkLatency(iterations = 10000) {
    const latencies = [];

    for (let i = 0; i < iterations; i++) {
        const start = performance.now();
        agenticClient.process_message(`test ${i}`);
        latencies.push(performance.now() - start);
    }

    return {
        p50: percentile(latencies, 0.5),
        p95: percentile(latencies, 0.95),
        p99: percentile(latencies, 0.99),
        avg: latencies.reduce((a, b) => a + b) / latencies.length,
    };
}

// Throughput benchmark
async function benchmarkThroughput(duration = 5000) {
    const start = performance.now();
    let count = 0;

    while (performance.now() - start < duration) {
        agenticClient.process_message(`test ${count++}`);
    }

    const elapsed = performance.now() - start;
    return (count / elapsed) * 1000; // messages/second
}

Production Deployment

CDN Configuration

<!-- Load from CDN with compression -->
<script type="module">
    import init from 'https://cdn.example.com/lean-agentic-wasm/pkg/lean_agentic_wasm.js';

    async function run() {
        // Init WASM with streaming compilation
        await init();

        // Your code here
    }

    run();
</script>

Service Worker Caching

// sw.js
self.addEventListener('install', (event) => {
    event.waitUntil(
        caches.open('wasm-v1').then((cache) => {
            return cache.addAll([
                '/lean_agentic_wasm_bg.wasm',
                '/lean_agentic_wasm.js',
            ]);
        })
    );
});

self.addEventListener('fetch', (event) => {
    if (event.request.url.endsWith('.wasm')) {
        event.respondWith(
            caches.match(event.request).then((response) => {
                return response || fetch(event.request);
            })
        );
    }
});

Monitoring and Profiling

Browser DevTools

// Performance marks
performance.mark('process-start');
agenticClient.process_message(data);
performance.mark('process-end');
performance.measure('process-time', 'process-start', 'process-end');

// Get measurements
const measures = performance.getEntriesByType('measure');
console.log(measures);

Real-time Monitoring

// Track metrics
class PerformanceMonitor {
    constructor() {
        this.latencies = [];
        this.throughput = 0;
        this.errors = 0;
    }

    recordLatency(latency) {
        this.latencies.push(latency);
        if (this.latencies.length > 1000) {
            this.latencies.shift();
        }
    }

    getStats() {
        return {
            p50: this.percentile(0.5),
            p95: this.percentile(0.95),
            p99: this.percentile(0.99),
            throughput: this.throughput,
            errors: this.errors,
        };
    }

    percentile(p) {
        const sorted = [...this.latencies].sort((a, b) => a - b);
        return sorted[Math.floor(sorted.length * p)];
    }
}

const monitor = new PerformanceMonitor();

// Use in your code
wsClient.set_on_message((data) => {
    const start = performance.now();
    const result = agenticClient.process_message(data);
    monitor.recordLatency(performance.now() - start);
});

Troubleshooting

High Latency

  1. Check connection: Verify network latency with ping
  2. Disable compression: Set perMessageDeflate: false on WebSocket
  3. Check CPU: Use browser profiler to find bottlenecks
  4. Reduce payload: Send smaller messages

Low Throughput

  1. Batch messages: Process multiple messages at once
  2. Increase concurrency: Use multiple connections
  3. Optimize serialization: Use binary protocols
  4. Pre-allocate: Use buffer pools

Memory Leaks

  1. Check closures: Release event handlers
  2. Monitor heap: Use browser memory profiler
  3. Limit cache size: Implement LRU eviction
  4. Return buffers: Use buffer pools

Best Practices

  1. Use release builds in production
  2. Enable SIMD when available
  3. Pre-allocate buffers for high-frequency operations
  4. Use binary protocols for large payloads
  5. Monitor latency and throughput
  6. Implement backpressure for high load
  7. Cache WASM module
  8. Use service workers for offline support
  9. Compress WASM with Brotli
  10. Profile before optimizing

Further Reading