#!/usr/bin/env node
/**
 * WiFlow Pose Estimation Architecture (arXiv:2602.08661)
 *
 * Pure JavaScript implementation for ruvllm-based CSI-to-pose inference.
 * Adapted from the published WiFlow paper for single TX/RX ESP32 deployment:
 *   - Stage 1: Temporal Convolutional Network (dilated causal convolutions)
 *   - Stage 2: Asymmetric Convolution Encoder (subcarrier-dimension spatial)
 *   - Stage 3: Axial Self-Attention (width + height, O(H^2W + HW^2))
 *   - Decoder: Adaptive average pooling + linear projection to 17 COCO keypoints
 *
 * Input:  [batch, 128 subcarriers, 20 time steps] (CSI amplitude)
 * Output: [batch, 17 keypoints, 2 coordinates] normalized to [0,1]
 *
 * ADR: docs/adr/ADR-072-wiflow-architecture.md
 */

'use strict';

// ---------------------------------------------------------------------------
// Deterministic PRNG (xorshift32)
// ---------------------------------------------------------------------------

/**
 * Create a seeded xorshift32 generator.
 *
 * The middle step uses the unsigned shift (`>>>`). With the sign-propagating
 * `>>` the step is not invertible and maps the state 0xFFFFFFFF to 0, after
 * which the generator would emit 0 forever.
 *
 * @param {number} seed - integer seed; 0 / NaN / undefined fall back to 42
 * @returns {function(): number} generator returning values in [0, 1)
 */
function createRng(seed) {
  let s = seed | 0 || 42;
  return () => {
    s ^= s << 13;
    s ^= s >>> 17;
    s ^= s << 5;
    return (s >>> 0) / 4294967296;
  };
}

/**
 * Box-Muller transform for Gaussian samples.
 * @param {function(): number} rng - uniform generator in [0, 1)
 * @returns {function(): number} generator of standard-normal samples
 */
function gaussianRng(rng) {
  return () => {
    // Guard against log(0) when the uniform draw is exactly 0.
    const u1 = rng() || 1e-10;
    const u2 = rng();
    return Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math.PI * u2);
  };
}

// ---------------------------------------------------------------------------
// Tensor utility functions (Float32Array based)
// ---------------------------------------------------------------------------

/**
 * Initialize weight array with Kaiming He (fan_in) for ReLU layers.
 * @param {number} fanIn
 * @param {number} fanOut
 * @param {function(): number} rng - uniform generator
 * @returns {Float32Array} fanIn * fanOut samples with std sqrt(2 / fanIn)
 */
function initKaiming(fanIn, fanOut, rng) {
  const std = Math.sqrt(2.0 / fanIn);
  const gauss = gaussianRng(rng);
  const arr = new Float32Array(fanIn * fanOut);
  for (let i = 0; i < arr.length; i++) arr[i] = gauss() * std;
  return arr;
}

/**
 * Initialize weight array with Xavier/Glorot.
 * @param {number} fanIn
 * @param {number} fanOut
 * @param {function(): number} rng - uniform generator
 * @returns {Float32Array} fanIn * fanOut samples with std sqrt(2 / (fanIn + fanOut))
 */
function initXavier(fanIn, fanOut, rng) {
  const std = Math.sqrt(2.0 / (fanIn + fanOut));
  const gauss = gaussianRng(rng);
  const arr = new Float32Array(fanIn * fanOut);
  for (let i = 0; i < arr.length; i++) arr[i] = gauss() * std;
  return arr;
}

/**
 * ReLU activation in-place.
 * @param {Float32Array|number[]} arr
 * @returns {Float32Array|number[]} the same array, negatives clamped to 0
 */
function relu(arr) {
  for (let i = 0; i < arr.length; i++) {
    if (arr[i] < 0) arr[i] = 0;
  }
  return arr;
}

/**
 * In-place softmax over arr[offset, offset + length).
 *
 * @param {Float32Array|number[]} arr
 * @param {number} [offset=0] - first index of the window
 * @param {number} [length] - window size; defaults to the rest of the array
 *   after `offset` (not `arr.length`, which would run past the end)
 * @returns {Float32Array|number[]} the same array
 */
function softmax(arr, offset, length) {
  const start = offset || 0;
  const count = length == null ? arr.length - start : length;
  const end = start + count;

  // Subtract the window maximum before exponentiating for numerical stability.
  let maxVal = -Infinity;
  for (let i = start; i < end; i++) {
    if (arr[i] > maxVal) maxVal = arr[i];
  }

  let sum = 0;
  for (let i = start; i < end; i++) {
    arr[i] = Math.exp(arr[i] - maxVal);
    sum += arr[i];
  }

  if (sum > 0) {
    for (let i = start; i < end; i++) arr[i] /= sum;
  }
  return arr;
}

/**
 * SmoothL1 loss (Huber loss with beta), averaged over the compared elements.
 * @param {ArrayLike<number>} predicted
 * @param {ArrayLike<number>} target
 * @param {number} [beta=0.1] - quadratic/linear switch point
 * @returns {number} mean loss over min(predicted.length, target.length)
 *   elements; 0 when there is nothing to compare
 */
function smoothL1(predicted, target, beta) {
  beta = beta || 0.1;
  const n = Math.min(predicted.length, target.length);
  if (n === 0) return 0; // avoid 0 / 0 -> NaN

  let loss = 0;
  for (let i = 0; i < n; i++) {
    const diff = Math.abs(predicted[i] - target[i]);
    if (diff < beta) {
      loss += 0.5 * diff * diff / beta;
    } else {
      loss += diff - 0.5 * beta;
    }
  }
  return loss / n;
}

/**
 * SmoothL1 gradient with respect to `predicted` (already divided by n).
 * @param {ArrayLike<number>} predicted
 * @param {ArrayLike<number>} target
 * @param {number} [beta=0.1]
 * @returns {Float32Array} gradient of length min(predicted.length, target.length)
 */
function smoothL1Grad(predicted, target, beta) {
  beta = beta || 0.1;
  const n = Math.min(predicted.length, target.length);
  const grad = new Float32Array(n);
  for (let i = 0; i < n; i++) {
    const diff = predicted[i] - target[i];
    const absDiff = Math.abs(diff);
    if (absDiff < beta) {
      grad[i] = diff / beta / n;
    } else {
      grad[i] = (diff > 0 ? 1 : -1) / n;
    }
  }
  return grad;
}

// ---------------------------------------------------------------------------
// 1D Convolution (causal and non-causal)
// ---------------------------------------------------------------------------
/**
 * Conv1D: [channels_in, time] -> [channels_out, time]
 * Weight shape: [out_ch, in_ch, kernel]
 * Supports dilation and causal (left-only) padding.
 */
class Conv1d {
  /**
   * @param {number} inCh
   * @param {number} outCh
   * @param {number} kernel
   * @param {object} opts - { dilation, stride, causal, bias, seed }
   */
  constructor(inCh, outCh, kernel, opts = {}) {
    this.inCh = inCh;
    this.outCh = outCh;
    this.kernel = kernel;
    this.dilation = opts.dilation || 1;
    this.stride = opts.stride || 1;
    this.causal = opts.causal !== undefined ? opts.causal : false;
    this.hasBias = opts.bias !== false;

    // Without an explicit seed, derive one from the layer shape so that
    // construction stays deterministic.
    const rng = createRng(opts.seed || (inCh * 1000 + outCh * 7 + kernel * 31));

    // Kaiming init for ReLU
    this.weight = initKaiming(inCh * kernel, outCh, rng);
    this.bias = this.hasBias ? new Float32Array(outCh) : null;

    // Gradient accumulators
    this.weightGrad = new Float32Array(this.weight.length);
    this.biasGrad = this.hasBias ? new Float32Array(outCh) : null;
  }

  /** Count parameters */
  numParams() {
    return this.weight.length + (this.hasBias ? this.bias.length : 0);
  }

  /**
   * Forward pass.
   * @param {Float32Array} input - shape [inCh, T]
   * @param {number} T - temporal length
   * @returns {{ output: Float32Array, T_out: number }}
   */
  forward(input, T) {
    const effectiveK = this.kernel + (this.kernel - 1) * (this.dilation - 1);

    // Causal: all padding on the left so output[t] never reads input after t.
    // Non-causal: split the padding, the extra sample (if any) on the right.
    let padLeft;
    let padRight;
    if (this.causal) {
      padLeft = effectiveK - 1;
      padRight = 0;
    } else {
      padLeft = Math.floor((effectiveK - 1) / 2);
      padRight = Math.ceil((effectiveK - 1) / 2);
    }

    const T_padded = T + padLeft + padRight;
    const T_out = Math.floor((T_padded - effectiveK) / this.stride) + 1;

    // Pad input with zeros
    const padded = new Float32Array(this.inCh * T_padded);
    for (let c = 0; c < this.inCh; c++) {
      for (let t = 0; t < T; t++) {
        padded[c * T_padded + (t + padLeft)] = input[c * T + t];
      }
    }

    // Convolution
    const output = new Float32Array(this.outCh * T_out);
    const weightsPerOut = this.inCh * this.kernel;
    for (let oc = 0; oc < this.outCh; oc++) {
      for (let t = 0; t < T_out; t++) {
        let sum = this.hasBias ? this.bias[oc] : 0;
        const tStart = t * this.stride;
        for (let ic = 0; ic < this.inCh; ic++) {
          for (let k = 0; k < this.kernel; k++) {
            const tIdx = tStart + k * this.dilation;
            if (tIdx >= 0 && tIdx < T_padded) {
              const wIdx = oc * weightsPerOut + ic * this.kernel + k;
              sum += this.weight[wIdx] * padded[ic * T_padded + tIdx];
            }
          }
        }
        output[oc * T_out + t] = sum;
      }
    }

    return { output, T_out };
  }
}

// ---------------------------------------------------------------------------
// Batch Normalization 1D
// ---------------------------------------------------------------------------

/**
 * Per-channel normalization of a single [channels, T] sample: statistics are
 * taken over the T axis, and running estimates are kept for inference.
 */
class BatchNorm1d {
  /**
   * @param {number} numFeatures - channel count
   * @param {object} opts - { eps, momentum }; defaults 1e-5 and 0.1. An
   *   explicit 0 is honoured (momentum 0 freezes the running statistics once
   *   they have been seeded by the first training call).
   */
  constructor(numFeatures, opts = {}) {
    this.numFeatures = numFeatures;
    this.eps = opts.eps !== undefined ? opts.eps : 1e-5;
    this.momentum = opts.momentum !== undefined ? opts.momentum : 0.1;
    this.gamma = new Float32Array(numFeatures).fill(1.0);
    this.beta = new Float32Array(numFeatures);
    this.runMean = new Float32Array(numFeatures);
    this.runVar = new Float32Array(numFeatures).fill(1.0);
    this.initialized = false;
    this.training = true;
  }

  numParams() {
    return this.numFeatures * 2; // gamma + beta
  }

  /**
   * Forward: normalize across time dimension.
   * @param {Float32Array} input - [channels, T]
   * @param {number} T - time steps
   * @returns {Float32Array} - [channels, T]
   */
  forward(input, T) {
    const output = new Float32Array(input.length);

    if (this.training && T > 1) {
      // Compute batch stats per channel
      for (let c = 0; c < this.numFeatures; c++) {
        let mean = 0;
        for (let t = 0; t < T; t++) mean += input[c * T + t];
        mean /= T;

        let variance = 0;
        for (let t = 0; t < T; t++) variance += (input[c * T + t] - mean) ** 2;
        variance /= T;

        // Update running stats; the very first call seeds them directly.
        if (this.initialized) {
          this.runMean[c] = (1 - this.momentum) * this.runMean[c] + this.momentum * mean;
          this.runVar[c] = (1 - this.momentum) * this.runVar[c] + this.momentum * variance;
        } else {
          this.runMean[c] = mean;
          this.runVar[c] = variance;
        }

        // Normalize
        const invStd = 1.0 / Math.sqrt(variance + this.eps);
        for (let t = 0; t < T; t++) {
          output[c * T + t] = this.gamma[c] * (input[c * T + t] - mean) * invStd + this.beta[c];
        }
      }
      this.initialized = true;
    } else {
      // Use running stats (inference mode, or a single time step in training)
      for (let c = 0; c < this.numFeatures; c++) {
        const invStd = 1.0 / Math.sqrt(this.runVar[c] + this.eps);
        for (let t = 0; t < T; t++) {
          output[c * T + t] = this.gamma[c] * (input[c * T + t] - this.runMean[c]) * invStd + this.beta[c];
        }
      }
    }

    return output;
  }
}

// ---------------------------------------------------------------------------
// Stage 1: Temporal Convolutional Network (TCN)
// ---------------------------------------------------------------------------

/**
 * Single TCN block: DilatedCausalConv1d -> BN -> ReLU -> residual
 */
class TCNBlock {
  /**
   * @param {number} inCh
   * @param {number} outCh
   * @param {number} kernel
   * @param {number} dilation
   * @param {number} [seed] - falls back to a value derived from the shape
   */
  constructor(inCh, outCh, kernel, dilation, seed) {
    this.conv = new Conv1d(inCh, outCh, kernel, {
      dilation,
      causal: true,
      seed: seed || (inCh * 100 + dilation * 13),
    });
    this.bn = new BatchNorm1d(outCh);

    // 1x1 residual projection if channels differ
    this.residual = null;
    if (inCh !== outCh) {
      this.residual = new Conv1d(inCh, outCh, 1, {
        seed: seed ? seed + 999 : inCh * 200 + outCh * 7,
      });
    }
  }

  numParams() {
    let p = this.conv.numParams() + this.bn.numParams();
    if (this.residual) p += this.residual.numParams();
    return p;
  }

  /**
   * @param {Float32Array} input - [inCh, T]
   * @param {number} T
   * @returns {{ output: Float32Array, T_out: number }} output is [outCh, T_out]
   */
  forward(input, T) {
    const { output: convOut, T_out } = this.conv.forward(input, T);
    const bnOut = this.bn.forward(convOut, T_out);
    relu(bnOut);

    // Residual connection
    let res;
    if (this.residual) {
      const { output: resOut } = this.residual.forward(input, T);
      res = resOut;
    } else {
      res = input;
    }

    // Add residual (T_out should equal T for causal conv with same stride)
    const outCh = this.conv.outCh;
    for (let c = 0; c < outCh; c++) {
      for (let t = 0; t < T_out; t++) {
        bnOut[c * T_out + t] += res[c * T_out + t] || 0;
      }
    }

    return { output: bnOut, T_out };
  }
}

/**
 * Full TCN: 4 blocks with dilation (1, 2, 4, 8), kernel=7
 * Channel progression: inputCh -> 256 -> 192 -> 128 -> 128
 * Scaled to reach ~2.5M total model parameters with 128-subcarrier input.
 */
class TemporalConvNet {
  /**
   * @param {number} inputCh
   * @param {number} [seed=42]
   */
  constructor(inputCh, seed) {
    seed = seed || 42;
    this.blocks = [
      new TCNBlock(inputCh, 256, 7, 1, seed),
      new TCNBlock(256, 192, 7, 2, seed + 100),
      new TCNBlock(192, 128, 7, 4, seed + 200),
      new TCNBlock(128, 128, 7, 8, seed + 300),
    ];
    this.outCh = 128;
  }

  numParams() {
    return this.blocks.reduce((s, b) => s + b.numParams(), 0);
  }

  /**
   * @param {Float32Array} input - [inputCh, T]
   * @param {number} T
   * @returns {{ output: Float32Array, T_out: number, channels: number }}
   */
  forward(input, T) {
    let x = input;
    let t = T;
    for (const block of this.blocks) {
      const result = block.forward(x, t);
      x = result.output;
      t = result.T_out;
    }
    return { output: x, T_out: t, channels: this.outCh };
  }
}

// ---------------------------------------------------------------------------
// Stage 2: Asymmetric Convolution Encoder
// ---------------------------------------------------------------------------
/**
 * Single asymmetric conv block: 1xk conv in subcarrier dim + BN + ReLU + residual
 * Operates on [channels, H, W] where H = subcarrier features, W = time
 *
 * After TCN, data is [C_tcn, T] (C_tcn = TemporalConvNet.outCh, 128 as
 * configured in this file). The encoder reshapes it to [1, C_tcn, T] and treats
 * dim-1 as "subcarrier features" and dim-2 as "time".
 * Each block convolves along H only, with kernel `kernel` and stride `strideH`;
 * the time axis W is neither convolved nor downsampled.
 */
class AsymmetricConvBlock {
  /**
   * @param {number} inCh
   * @param {number} outCh
   * @param {number} kernel - kernel size along H
   * @param {number} [strideH=1] - stride along H
   * @param {number} [seed] - falls back to a value derived from the shape
   */
  constructor(inCh, outCh, kernel, strideH, seed) {
    this.inCh = inCh;
    this.outCh = outCh;
    this.kernel = kernel;
    this.strideH = strideH || 1;

    const rng = createRng(seed || (inCh * 37 + outCh * 11));
    // Weight: [outCh, inCh, kernel] applied along H dimension
    this.weight = initKaiming(inCh * kernel, outCh, rng);
    this.bias = new Float32Array(outCh);
    this.bn = new BatchNorm1d(outCh);

    // Residual 1x1 + stride. The projection lives in residualWeight /
    // residualBias; both stay null when the input can be added directly.
    // `residual` is never read in this class and is kept only so the object
    // shape seen by existing callers does not change.
    this.residual = null;
    this.residualWeight = null;
    this.residualBias = null;
    if (inCh !== outCh || strideH > 1) {
      this.residualWeight = initKaiming(inCh, outCh, createRng(seed ? seed + 500 : inCh * 53));
      this.residualBias = new Float32Array(outCh);
    }
  }

  numParams() {
    let p = this.weight.length + this.bias.length + this.bn.numParams();
    if (this.residualWeight) p += this.residualWeight.length + this.residualBias.length;
    return p;
  }

  /**
   * Forward pass.
   * @param {Float32Array} input - [inCh, H, W] flattened
   * @param {number} H - height (subcarrier features)
   * @param {number} W - width (time)
   * @returns {{ output: Float32Array, H_out: number, W_out: number }}
   */
  forward(input, H, W) {
    const pad = Math.floor((this.kernel - 1) / 2);
    const H_out = Math.floor((H + 2 * pad - this.kernel) / this.strideH) + 1;
    const W_out = W;
    const inPlane = H * W;
    const outPlane = H_out * W_out;
    const weightsPerOut = this.inCh * this.kernel;

    // 1xk conv along H dimension (rows outside [0, H) act as zero padding)
    const convOut = new Float32Array(this.outCh * outPlane);
    for (let oc = 0; oc < this.outCh; oc++) {
      for (let h = 0; h < H_out; h++) {
        const hStart = h * this.strideH - pad;
        for (let w = 0; w < W_out; w++) {
          let sum = this.bias[oc];
          for (let ic = 0; ic < this.inCh; ic++) {
            for (let k = 0; k < this.kernel; k++) {
              const hIdx = hStart + k;
              if (hIdx >= 0 && hIdx < H) {
                const wIdx = oc * weightsPerOut + ic * this.kernel + k;
                sum += this.weight[wIdx] * input[ic * inPlane + hIdx * W + w];
              }
            }
          }
          convOut[oc * outPlane + h * W_out + w] = sum;
        }
      }
    }

    // BN across H_out * W_out as "time" dimension
    const bnOut = this.bn.forward(convOut, outPlane);
    relu(bnOut);

    // Residual
    if (this.residualWeight) {
      // 1x1 conv + stride for residual: sample the input row the output row
      // is centred on.
      for (let oc = 0; oc < this.outCh; oc++) {
        for (let h = 0; h < H_out; h++) {
          const hSrc = h * this.strideH;
          if (hSrc >= H) continue;
          for (let w = 0; w < W_out; w++) {
            let resVal = this.residualBias[oc];
            for (let ic = 0; ic < this.inCh; ic++) {
              resVal += this.residualWeight[oc * this.inCh + ic] * input[ic * inPlane + hSrc * W + w];
            }
            bnOut[oc * outPlane + h * W_out + w] += resVal;
          }
        }
      }
    } else {
      // Direct residual add
      const minH = Math.min(H_out, H);
      const minCh = Math.min(this.outCh, this.inCh);
      for (let c = 0; c < minCh; c++) {
        for (let h = 0; h < minH; h++) {
          for (let w = 0; w < W_out; w++) {
            bnOut[c * outPlane + h * W_out + w] += input[c * inPlane + h * W + w];
          }
        }
      }
    }

    return { output: bnOut, H_out, W_out };
  }
}

/**
 * Full asymmetric encoder: 4 blocks
 * Channel progression: 1 -> 32 -> 64 -> 128 -> 256
 * H progression (with stride 2): 128 -> 64 -> 32 -> 16 -> 8
 */
class AsymmetricConvEncoder {
  /** @param {number} [seed=1000] */
  constructor(seed) {
    seed = seed || 1000;
    this.blocks = [
      new AsymmetricConvBlock(1, 32, 3, 2, seed),
      new AsymmetricConvBlock(32, 64, 3, 2, seed + 100),
      new AsymmetricConvBlock(64, 128, 3, 2, seed + 200),
      new AsymmetricConvBlock(128, 256, 3, 2, seed + 300),
    ];
    this.outCh = 256;
  }

  numParams() {
    return this.blocks.reduce((s, b) => s + b.numParams(), 0);
  }

  /**
   * Forward: takes TCN output [channels, T] and processes spatially.
   * Reshapes to [1, channels, T], then applies 4 blocks.
   * @param {Float32Array} input - [channels, T] from TCN
   * @param {number} channels - TCN output channels (becomes the H axis)
   * @param {number} T - time steps
   * @returns {{ output: Float32Array, channels: number, H: number, W: number }}
   */
  forward(input, channels, T) {
    // Reshape [channels, T] -> [1, channels, T]
    // block input: [inCh, H, W] where inCh=1, H=channels, W=T
    // (same memory layout, so this is a straight copy into a fresh buffer)
    let x = new Float32Array(1 * channels * T);
    for (let h = 0; h < channels; h++) {
      for (let w = 0; w < T; w++) {
        x[0 * channels * T + h * T + w] = input[h * T + w];
      }
    }

    let H = channels;
    let W = T;
    let ch = 1;

    for (const block of this.blocks) {
      const result = block.forward(x, H, W);
      x = result.output;
      H = result.H_out;
      W = result.W_out;
      ch = block.outCh;
    }

    return { output: x, channels: ch, H, W };
  }
}

// ---------------------------------------------------------------------------
// Stage 3: Axial Self-Attention
// ---------------------------------------------------------------------------
/**
 * Single-axis attention: Q, K, V linear projections + scaled dot-product.
 * Operates along one axis (width or height) of [channels, H, W] tensor.
 */
class AxialAttention {
  /**
   * @param {number} channels
   * @param {number} numHeads - heads of size floor(channels / numHeads); if
   *   channels is not a multiple of numHeads, the trailing
   *   channels - numHeads * headDim channels of the attention output stay 0
   * @param {string} axis - 'width' (temporal); any other value selects the
   *   height (feature) axis
   * @param {number} [seed]
   */
  constructor(channels, numHeads, axis, seed) {
    this.channels = channels;
    this.numHeads = numHeads;
    this.headDim = Math.floor(channels / numHeads);
    this.axis = axis; // 'width' (temporal) or 'height' (feature)

    const rng = createRng(seed || (channels * 17 + numHeads * 3));

    // Q, K, V projections: channels -> channels
    this.Wq = initXavier(channels, channels, rng);
    this.Wk = initXavier(channels, channels, createRng((seed || 0) + 1));
    this.Wv = initXavier(channels, channels, createRng((seed || 0) + 2));
    this.Wo = initXavier(channels, channels, createRng((seed || 0) + 3));

    // Biases
    this.bq = new Float32Array(channels);
    this.bk = new Float32Array(channels);
    this.bv = new Float32Array(channels);
    this.bo = new Float32Array(channels);

    // Learnable positional encoding (max length 128)
    this.maxLen = 128;
    const posRng = createRng((seed || 0) + 10);
    this.posEnc = new Float32Array(this.maxLen * channels);
    const posScale = 0.02;
    for (let i = 0; i < this.posEnc.length; i++) {
      this.posEnc[i] = (posRng() - 0.5) * posScale;
    }
  }

  numParams() {
    return this.Wq.length + this.Wk.length + this.Wv.length + this.Wo.length +
           this.bq.length + this.bk.length + this.bv.length + this.bo.length +
           this.posEnc.length;
  }

  /**
   * Linear projection: x [N, C] @ W [C, C] + b [C] -> [N, C]
   */
  _project(x, N, C, W, b) {
    const out = new Float32Array(N * C);
    for (let n = 0; n < N; n++) {
      for (let j = 0; j < C; j++) {
        let sum = b[j];
        for (let i = 0; i < C; i++) {
          sum += x[n * C + i] * W[i * C + j];
        }
        out[n * C + j] = sum;
      }
    }
    return out;
  }

  /**
   * Q/K/V projection, multi-head attention and output projection for one
   * sequence of tokens.
   * @param {Float32Array} tokens - [N, C]
   * @param {number} N - sequence length
   * @returns {Float32Array} - [N, C]
   */
  _attendSequence(tokens, N) {
    const C = this.channels;
    const Q = this._project(tokens, N, C, this.Wq, this.bq);
    const K = this._project(tokens, N, C, this.Wk, this.bk);
    const V = this._project(tokens, N, C, this.Wv, this.bv);
    const attnOut = this._multiheadAttention(Q, K, V, N);
    return this._project(attnOut, N, C, this.Wo, this.bo);
  }

  /**
   * Forward: applies attention along the specified axis.
   * @param {Float32Array} input - [channels, H, W] flattened
   * @param {number} H
   * @param {number} W
   * @returns {Float32Array} - same shape (input + attention, i.e. residual)
   */
  forward(input, H, W) {
    const C = this.channels;
    const plane = H * W;
    const output = new Float32Array(input.length);

    // Width attention treats every row h as a sequence of W positions; height
    // attention treats every column w as a sequence of H positions. Both are
    // the same loop with different strides into the [H, W] plane.
    const alongWidth = this.axis === 'width';
    const seqLen = alongWidth ? W : H;
    const numLines = alongWidth ? H : W;
    const posStride = alongWidth ? 1 : W;
    const lineStride = alongWidth ? W : 1;

    for (let line = 0; line < numLines; line++) {
      const base = line * lineStride;

      // Gather the line as [seqLen, C] tokens.
      const tokens = new Float32Array(seqLen * C);
      for (let p = 0; p < seqLen; p++) {
        const src = base + p * posStride;
        for (let c = 0; c < C; c++) {
          tokens[p * C + c] = input[c * plane + src];
        }
        // Add positional encoding (positions beyond maxLen get none)
        if (p < this.maxLen) {
          for (let c = 0; c < C; c++) {
            tokens[p * C + c] += this.posEnc[p * C + c];
          }
        }
      }

      const projected = this._attendSequence(tokens, seqLen);

      // Write back + residual (the residual uses the raw input, without the
      // positional encoding)
      for (let p = 0; p < seqLen; p++) {
        const dst = base + p * posStride;
        for (let c = 0; c < C; c++) {
          output[c * plane + dst] = input[c * plane + dst] + projected[p * C + c];
        }
      }
    }

    return output;
  }

  /**
   * Multi-head scaled dot-product attention.
   * @param {Float32Array} Q - [N, C]
   * @param {Float32Array} K - [N, C]
   * @param {Float32Array} V - [N, C]
   * @param {number} N - sequence length
   * @returns {Float32Array} - [N, C]
   */
  _multiheadAttention(Q, K, V, N) {
    const C = this.channels;
    const headCount = this.numHeads;
    const D = this.headDim;
    const scale = 1.0 / Math.sqrt(D);
    const output = new Float32Array(N * C);

    for (let head = 0; head < headCount; head++) {
      const dOff = head * D;

      // Compute attention scores: [N, N]
      const scores = new Float32Array(N * N);
      for (let i = 0; i < N; i++) {
        for (let j = 0; j < N; j++) {
          let dot = 0;
          for (let d = 0; d < D; d++) {
            dot += Q[i * C + dOff + d] * K[j * C + dOff + d];
          }
          scores[i * N + j] = dot * scale;
        }
        // Softmax over j for this row i
        softmax(scores, i * N, N);
      }

      // Apply attention to V: [N, D]
      for (let i = 0; i < N; i++) {
        for (let d = 0; d < D; d++) {
          let sum = 0;
          for (let j = 0; j < N; j++) {
            sum += scores[i * N + j] * V[j * C + dOff + d];
          }
          output[i * C + dOff + d] = sum;
        }
      }
    }

    return output;
  }
}
*/ class AxialSelfAttention { constructor(channels, numHeads, seed) { seed = seed || 2000; this.widthAttn = new AxialAttention(channels, numHeads, 'width', seed); this.heightAttn = new AxialAttention(channels, numHeads, 'height', seed + 500); this.channels = channels; } numParams() { return this.widthAttn.numParams() + this.heightAttn.numParams(); } forward(input, H, W) { const afterWidth = this.widthAttn.forward(input, H, W); const afterHeight = this.heightAttn.forward(afterWidth, H, W); return afterHeight; } } // --------------------------------------------------------------------------- // Decoder: Adaptive Average Pooling + Linear -> 17 COCO keypoints x 2 // --------------------------------------------------------------------------- /** * COCO skeleton: 17 keypoints * 0=nose, 1=left_eye, 2=right_eye, 3=left_ear, 4=right_ear, * 5=left_shoulder, 6=right_shoulder, 7=left_elbow, 8=right_elbow, * 9=left_wrist, 10=right_wrist, 11=left_hip, 12=right_hip, * 13=left_knee, 14=right_knee, 15=left_ankle, 16=right_ankle */ const COCO_KEYPOINTS = [ 'nose', 'left_eye', 'right_eye', 'left_ear', 'right_ear', 'left_shoulder', 'right_shoulder', 'left_elbow', 'right_elbow', 'left_wrist', 'right_wrist', 'left_hip', 'right_hip', 'left_knee', 'right_knee', 'left_ankle', 'right_ankle', ]; const BONE_CONNECTIONS = [ [0, 1], [0, 2], // nose -> eyes [1, 3], [2, 4], // eyes -> ears [5, 7], [7, 9], // left arm [6, 8], [8, 10], // right arm [5, 11], [6, 12], // torso [11, 13], [13, 15], // left leg [12, 14], [14, 16], // right leg [5, 6], // shoulder width ]; /** Bone length priors normalized to person height */ const BONE_LENGTH_PRIORS = [ 0.06, 0.06, // nose-eye (x2) 0.06, 0.06, // eye-ear (x2) 0.15, 0.13, // left shoulder-elbow, elbow-wrist 0.15, 0.13, // right shoulder-elbow, elbow-wrist 0.26, 0.26, // shoulder-hip (x2) 0.25, 0.25, // left hip-knee, knee-ankle 0.25, 0.25, // right hip-knee, knee-ankle 0.20, // shoulder width ]; class PoseDecoder { constructor(inFeatures, numKeypoints, 
seed) { this.inFeatures = inFeatures; this.numKeypoints = numKeypoints || 17; this.outDim = this.numKeypoints * 2; const rng = createRng(seed || 3000); // Linear: inFeatures -> numKeypoints * 2 this.weight = initXavier(inFeatures, this.outDim, rng); this.bias = new Float32Array(this.outDim); // Initialize bias to center of room (0.5, 0.5) for each keypoint for (let k = 0; k < this.numKeypoints; k++) { this.bias[k * 2] = 0.5; // x this.bias[k * 2 + 1] = 0.5; // y } } numParams() { return this.weight.length + this.bias.length; } /** * Forward: adaptive average pooling over temporal dim, then linear. * @param {Float32Array} input - [channels, H, W] * @param {number} channels * @param {number} H * @param {number} W * @returns {Float32Array} - [numKeypoints * 2] keypoint coordinates */ forward(input, channels, H, W) { // Adaptive average pooling: [channels, H, W] -> [channels * H] // Average over W (temporal dimension) const pooled = new Float32Array(channels * H); for (let c = 0; c < channels; c++) { for (let h = 0; h < H; h++) { let sum = 0; for (let w = 0; w < W; w++) { sum += input[c * H * W + h * W + w]; } pooled[c * H + h] = sum / W; } } // Linear projection: [channels * H] -> [numKeypoints * 2] const featureDim = channels * H; const out = new Float32Array(this.outDim); // If featureDim != inFeatures, truncate or zero-pad const useDim = Math.min(featureDim, this.inFeatures); for (let j = 0; j < this.outDim; j++) { let sum = this.bias[j]; for (let i = 0; i < useDim; i++) { sum += pooled[i] * this.weight[i * this.outDim + j]; } // Sigmoid to normalize output to [0, 1] out[j] = 1.0 / (1.0 + Math.exp(-sum)); } return out; } } // --------------------------------------------------------------------------- // WiFlow Model: Full Pipeline // --------------------------------------------------------------------------- class WiFlowModel { /** * @param {object} config * @param {number} config.inputChannels - CSI subcarrier count (default: 128) * @param {number} 
config.timeSteps - temporal window (default: 20) * @param {number} config.numKeypoints - COCO keypoints (default: 17) * @param {number} config.numHeads - attention heads (default: 8) * @param {number} config.seed - random seed (default: 42) */ constructor(config = {}) { this.inputChannels = config.inputChannels || 128; this.timeSteps = config.timeSteps || 20; this.numKeypoints = config.numKeypoints || 17; this.numHeads = config.numHeads || 8; this.seed = config.seed || 42; this.training = true; // Stage 1: TCN (inputChannels -> 128 channels, preserves time) this.tcn = new TemporalConvNet(this.inputChannels, this.seed); // Stage 2: Asymmetric Conv (128 TCN features -> 8 via stride-2 downsampling) // Input: [1, 128, T] -> [256, 8, T] this.spatialEncoder = new AsymmetricConvEncoder(this.seed + 1000); // Stage 3: Axial Self-Attention on [256, 8, T] this.axialAttention = new AxialSelfAttention(256, this.numHeads, this.seed + 2000); // Decoder: [256, 8, T] -> 17 * 2 // After pooling over T: feature dim = 256 * 8 = 2048 this.decoder = new PoseDecoder(2048, this.numKeypoints, this.seed + 3000); } /** Total parameter count */ numParams() { return this.tcn.numParams() + this.spatialEncoder.numParams() + this.axialAttention.numParams() + this.decoder.numParams(); } /** Parameter breakdown by stage */ paramBreakdown() { return { tcn: this.tcn.numParams(), spatialEncoder: this.spatialEncoder.numParams(), axialAttention: this.axialAttention.numParams(), decoder: this.decoder.numParams(), total: this.numParams(), }; } /** Set training/eval mode */ setTraining(mode) { this.training = mode; // Propagate to BatchNorm layers const setBnMode = (obj) => { if (obj && obj.bn) obj.bn.training = mode; if (obj && obj.blocks) obj.blocks.forEach(b => setBnMode(b)); if (obj && obj.conv && obj.conv.bn) obj.conv.bn = mode; }; setBnMode(this.tcn); setBnMode(this.spatialEncoder); } /** * Forward pass: CSI amplitude -> 17 keypoint coordinates. 
* * @param {Float32Array} csiAmplitude - [inputChannels, timeSteps] flattened * or [batch, inputChannels, timeSteps] for batched inference. * @param {number} [batchSize=1] * @returns {Float32Array|Float32Array[]} - [numKeypoints * 2] or array of them */ forward(csiAmplitude, batchSize) { batchSize = batchSize || 1; if (batchSize === 1) { return this._forwardSingle(csiAmplitude); } // Batched inference const results = []; const singleSize = this.inputChannels * this.timeSteps; for (let b = 0; b < batchSize; b++) { const slice = csiAmplitude.slice(b * singleSize, (b + 1) * singleSize); results.push(this._forwardSingle(slice)); } return results; } /** * Single-sample forward pass. * @param {Float32Array} input - [inputChannels, timeSteps] * @returns {Float32Array} - [numKeypoints * 2] */ _forwardSingle(input) { // Stage 1: TCN const tcnResult = this.tcn.forward(input, this.timeSteps); // Stage 2: Asymmetric Conv const spatialResult = this.spatialEncoder.forward( tcnResult.output, tcnResult.channels, tcnResult.T_out ); // Stage 3: Axial Attention const attnOutput = this.axialAttention.forward( spatialResult.output, spatialResult.H, spatialResult.W ); // Decoder const keypoints = this.decoder.forward( attnOutput, spatialResult.channels, spatialResult.H, spatialResult.W ); return keypoints; } /** * Compute WiFlow loss: L = L_H + 0.2 * L_B * L_H = SmoothL1(predicted, target, beta=0.1) * L_B = bone length constraint violation * * @param {Float32Array} predicted - [numKeypoints * 2] * @param {Float32Array} target - [numKeypoints * 2] * @param {boolean} boneConstraints - include bone length loss * @returns {{ total: number, smoothL1: number, boneLoss: number }} */ computeLoss(predicted, target, boneConstraints) { if (boneConstraints === undefined) boneConstraints = true; const lH = smoothL1(predicted, target, 0.1); let lB = 0; if (boneConstraints) { for (let b = 0; b < BONE_CONNECTIONS.length; b++) { const [i, j] = BONE_CONNECTIONS[b]; const prior = BONE_LENGTH_PRIORS[b]; 
const dx = predicted[i * 2] - predicted[j * 2]; const dy = predicted[i * 2 + 1] - predicted[j * 2 + 1]; const boneLen = Math.sqrt(dx * dx + dy * dy); // Penalty for deviation from prior (squared difference) const deviation = boneLen - prior; lB += deviation * deviation; } lB /= BONE_CONNECTIONS.length; } return { total: lH + 0.2 * lB, smoothL1: lH, boneLoss: lB, }; } /** * Compute loss gradient w.r.t. predicted keypoints. * @param {Float32Array} predicted - [numKeypoints * 2] * @param {Float32Array} target - [numKeypoints * 2] * @returns {Float32Array} - gradient [numKeypoints * 2] */ computeLossGrad(predicted, target) { const n = predicted.length; const grad = smoothL1Grad(predicted, target, 0.1); // Bone constraint gradient for (let b = 0; b < BONE_CONNECTIONS.length; b++) { const [i, j] = BONE_CONNECTIONS[b]; const prior = BONE_LENGTH_PRIORS[b]; const dx = predicted[i * 2] - predicted[j * 2]; const dy = predicted[i * 2 + 1] - predicted[j * 2 + 1]; const boneLen = Math.sqrt(dx * dx + dy * dy) || 1e-8; const deviation = boneLen - prior; const scale = 0.2 * 2 * deviation / (boneLen * BONE_CONNECTIONS.length); grad[i * 2] += scale * dx; grad[i * 2 + 1] += scale * dy; grad[j * 2] -= scale * dx; grad[j * 2 + 1] -= scale * dy; } return grad; } /** * Compute PCK@threshold (Percentage of Correct Keypoints). * @param {Float32Array} predicted - [numKeypoints * 2] * @param {Float32Array} target - [numKeypoints * 2] * @param {number} threshold - distance threshold (normalized coords) * @returns {number} - fraction of keypoints within threshold */ static pck(predicted, target, threshold) { threshold = threshold || 0.2; let correct = 0; const nk = Math.floor(predicted.length / 2); for (let k = 0; k < nk; k++) { const dx = predicted[k * 2] - target[k * 2]; const dy = predicted[k * 2 + 1] - target[k * 2 + 1]; const dist = Math.sqrt(dx * dx + dy * dy); if (dist <= threshold) correct++; } return correct / nk; } /** * Compute bone length violation rate. 
* @param {Float32Array} predicted - [numKeypoints * 2] * @param {number} tolerance - allowed deviation as fraction of prior * @returns {{ violationRate: number, violations: number[] }} */ static boneViolations(predicted, tolerance) { tolerance = tolerance || 0.5; // 50% deviation tolerance const violations = []; for (let b = 0; b < BONE_CONNECTIONS.length; b++) { const [i, j] = BONE_CONNECTIONS[b]; const prior = BONE_LENGTH_PRIORS[b]; const dx = predicted[i * 2] - predicted[j * 2]; const dy = predicted[i * 2 + 1] - predicted[j * 2 + 1]; const boneLen = Math.sqrt(dx * dx + dy * dy); if (Math.abs(boneLen - prior) > prior * tolerance) { violations.push(b); } } return { violationRate: violations.length / BONE_CONNECTIONS.length, violations, }; } /** * Get all weights as a flat Float32Array (for quantization / export). */ getAllWeights() { const arrays = []; // Collect all weight arrays from each stage const collectConv = (conv) => { arrays.push(conv.weight); if (conv.bias) arrays.push(conv.bias); }; const collectBN = (bn) => { arrays.push(bn.gamma); arrays.push(bn.beta); }; // TCN for (const block of this.tcn.blocks) { collectConv(block.conv); collectBN(block.bn); if (block.residual) collectConv(block.residual); } // Spatial encoder for (const block of this.spatialEncoder.blocks) { arrays.push(block.weight); arrays.push(block.bias); collectBN(block.bn); if (block.residualWeight) { arrays.push(block.residualWeight); arrays.push(block.residualBias); } } // Axial attention for (const attn of [this.axialAttention.widthAttn, this.axialAttention.heightAttn]) { arrays.push(attn.Wq, attn.Wk, attn.Wv, attn.Wo); arrays.push(attn.bq, attn.bk, attn.bv, attn.bo); arrays.push(attn.posEnc); } // Decoder arrays.push(this.decoder.weight); arrays.push(this.decoder.bias); // Flatten let totalLen = 0; for (const a of arrays) totalLen += a.length; const flat = new Float32Array(totalLen); let offset = 0; for (const a of arrays) { flat.set(a, offset); offset += a.length; } return flat; } /** 
* Export model as a named tensor map (for SafeTensors). * @returns {Map} */ toTensorMap() { const tensors = new Map(); // TCN for (let i = 0; i < this.tcn.blocks.length; i++) { const b = this.tcn.blocks[i]; tensors.set(`tcn.block${i}.conv.weight`, b.conv.weight); if (b.conv.bias) tensors.set(`tcn.block${i}.conv.bias`, b.conv.bias); tensors.set(`tcn.block${i}.bn.gamma`, b.bn.gamma); tensors.set(`tcn.block${i}.bn.beta`, b.bn.beta); tensors.set(`tcn.block${i}.bn.runMean`, b.bn.runMean); tensors.set(`tcn.block${i}.bn.runVar`, b.bn.runVar); if (b.residual) { tensors.set(`tcn.block${i}.residual.weight`, b.residual.weight); if (b.residual.bias) tensors.set(`tcn.block${i}.residual.bias`, b.residual.bias); } } // Spatial encoder for (let i = 0; i < this.spatialEncoder.blocks.length; i++) { const b = this.spatialEncoder.blocks[i]; tensors.set(`spatial.block${i}.weight`, b.weight); tensors.set(`spatial.block${i}.bias`, b.bias); tensors.set(`spatial.block${i}.bn.gamma`, b.bn.gamma); tensors.set(`spatial.block${i}.bn.beta`, b.bn.beta); tensors.set(`spatial.block${i}.bn.runMean`, b.bn.runMean); tensors.set(`spatial.block${i}.bn.runVar`, b.bn.runVar); if (b.residualWeight) { tensors.set(`spatial.block${i}.residual.weight`, b.residualWeight); tensors.set(`spatial.block${i}.residual.bias`, b.residualBias); } } // Axial attention for (const [name, attn] of [['width', this.axialAttention.widthAttn], ['height', this.axialAttention.heightAttn]]) { tensors.set(`axial.${name}.Wq`, attn.Wq); tensors.set(`axial.${name}.Wk`, attn.Wk); tensors.set(`axial.${name}.Wv`, attn.Wv); tensors.set(`axial.${name}.Wo`, attn.Wo); tensors.set(`axial.${name}.bq`, attn.bq); tensors.set(`axial.${name}.bk`, attn.bk); tensors.set(`axial.${name}.bv`, attn.bv); tensors.set(`axial.${name}.bo`, attn.bo); tensors.set(`axial.${name}.posEnc`, attn.posEnc); } // Decoder tensors.set('decoder.weight', this.decoder.weight); tensors.set('decoder.bias', this.decoder.bias); return tensors; } /** * Load weights from a 
tensor map (from SafeTensors). * @param {Map} tensors */ fromTensorMap(tensors) { const load = (key, target) => { const src = tensors.get(key); if (src && src.length === target.length) { target.set(src); } }; for (let i = 0; i < this.tcn.blocks.length; i++) { const b = this.tcn.blocks[i]; load(`tcn.block${i}.conv.weight`, b.conv.weight); if (b.conv.bias) load(`tcn.block${i}.conv.bias`, b.conv.bias); load(`tcn.block${i}.bn.gamma`, b.bn.gamma); load(`tcn.block${i}.bn.beta`, b.bn.beta); load(`tcn.block${i}.bn.runMean`, b.bn.runMean); load(`tcn.block${i}.bn.runVar`, b.bn.runVar); if (b.residual) { load(`tcn.block${i}.residual.weight`, b.residual.weight); if (b.residual.bias) load(`tcn.block${i}.residual.bias`, b.residual.bias); } } for (let i = 0; i < this.spatialEncoder.blocks.length; i++) { const b = this.spatialEncoder.blocks[i]; load(`spatial.block${i}.weight`, b.weight); load(`spatial.block${i}.bias`, b.bias); load(`spatial.block${i}.bn.gamma`, b.bn.gamma); load(`spatial.block${i}.bn.beta`, b.bn.beta); load(`spatial.block${i}.bn.runMean`, b.bn.runMean); load(`spatial.block${i}.bn.runVar`, b.bn.runVar); if (b.residualWeight) { load(`spatial.block${i}.residual.weight`, b.residualWeight); load(`spatial.block${i}.residual.bias`, b.residualBias); } } for (const [name, attn] of [['width', this.axialAttention.widthAttn], ['height', this.axialAttention.heightAttn]]) { load(`axial.${name}.Wq`, attn.Wq); load(`axial.${name}.Wk`, attn.Wk); load(`axial.${name}.Wv`, attn.Wv); load(`axial.${name}.Wo`, attn.Wo); load(`axial.${name}.bq`, attn.bq); load(`axial.${name}.bk`, attn.bk); load(`axial.${name}.bv`, attn.bv); load(`axial.${name}.bo`, attn.bo); load(`axial.${name}.posEnc`, attn.posEnc); } load('decoder.weight', this.decoder.weight); load('decoder.bias', this.decoder.bias); } } // --------------------------------------------------------------------------- // FLOPs estimation // --------------------------------------------------------------------------- /** * Estimate FLOPs 
/* per forward pass for each stage. */

/**
 * Estimate the FLOPs of one forward pass, broken down by stage.
 *
 * The layer tables below are fixed constants of this estimator; only the
 * input channel count, the temporal window and the number of keypoints are
 * taken from `config`.
 *
 * @param {object} [config]
 * @param {number} [config.inputChannels=128] - channels fed to the first TCN layer
 * @param {number} [config.timeSteps=20] - temporal window T
 * @param {number} [config.numKeypoints=17] - keypoints predicted by the decoder
 * @returns {{ tcn: number, spatialEncoder: number, axialAttention: number, decoder: number, total: number }}
 */
function estimateFLOPs(config) {
  const cfg = config || {};
  const C = cfg.inputChannels || 128;
  const T = cfg.timeSteps || 20;
  const numKeypoints = cfg.numKeypoints || 17;
  const K = 7; // TCN kernel

  const flops = {};

  // Stage 1: TCN - 4 dilated causal conv blocks
  // Each conv: 2 * inCh * outCh * K * T
  const tcnLayers = [
    { inCh: C, outCh: 256 },
    { inCh: 256, outCh: 192 },
    { inCh: 192, outCh: 128 },
    { inCh: 128, outCh: 128 },
  ];
  flops.tcn = 0;
  for (const l of tcnLayers) {
    flops.tcn += 2 * l.inCh * l.outCh * K * T;
    // BN: 4 * outCh * T
    flops.tcn += 4 * l.outCh * T;
    // Residual 1x1 if channels differ
    if (l.inCh !== l.outCh) flops.tcn += 2 * l.inCh * l.outCh * T;
  }

  // Stage 2: Asymmetric conv (kernel size 3 along the height axis)
  const spatialLayers = [
    { inCh: 1, outCh: 32, Hin: 128, Hout: 64 },
    { inCh: 32, outCh: 64, Hin: 64, Hout: 32 },
    { inCh: 64, outCh: 128, Hin: 32, Hout: 16 },
    { inCh: 128, outCh: 256, Hin: 16, Hout: 8 },
  ];
  flops.spatialEncoder = 0;
  for (const l of spatialLayers) {
    flops.spatialEncoder += 2 * l.inCh * l.outCh * 3 * l.Hout * T;
    flops.spatialEncoder += 4 * l.outCh * l.Hout * T;
    flops.spatialEncoder += 2 * l.inCh * l.outCh * l.Hout * T; // residual
  }

  // Stage 3: Axial attention
  const attnC = 256;
  const attnH = 8;
  const attnW = T;
  flops.axialAttention = 0;
  // Width: for each of H rows, project W tokens (Q, K, V), compute W*W
  // attention, then apply the output projection.
  flops.axialAttention += attnH * (3 * attnW * attnC * attnC + attnW * attnW * attnC + attnW * attnC * attnC);
  // Height: for each of W cols, project H tokens (Q, K, V), compute H*H
  // attention, then apply the output projection.
  flops.axialAttention += attnW * (3 * attnH * attnC * attnC + attnH * attnH * attnC + attnH * attnC * attnC);

  // Decoder: linear projection from the pooled features to (x, y) per keypoint
  const featureDim = 256 * 8; // after pooling
  flops.decoder = 2 * featureDim * (numKeypoints * 2);

  flops.total = flops.tcn + flops.spatialEncoder + flops.axialAttention + flops.decoder;

  return flops;
}

// ---------------------------------------------------------------------------
// Exports
--------------------------------------------------------------------------- module.exports = { // Core model classes WiFlowModel, TemporalConvNet, AsymmetricConvEncoder, AxialSelfAttention, AxialAttention, PoseDecoder, Conv1d, BatchNorm1d, TCNBlock, AsymmetricConvBlock, // Constants COCO_KEYPOINTS, BONE_CONNECTIONS, BONE_LENGTH_PRIORS, // Utility functions smoothL1, smoothL1Grad, softmax, relu, initKaiming, initXavier, createRng, gaussianRng, estimateFLOPs, };