diff --git a/firmware/esp32-csi-node/components/ruv_temporal/README.md b/firmware/esp32-csi-node/components/ruv_temporal/README.md index 0488757c..4b8ad163 100644 --- a/firmware/esp32-csi-node/components/ruv_temporal/README.md +++ b/firmware/esp32-csi-node/components/ruv_temporal/README.md @@ -9,10 +9,20 @@ C ABI declared in `include/ruv_temporal.h`. | Phase | Scope | State | |-------|-------|-------| -| 4 — Scaffold | Cargo.toml, src/{lib.rs,window.rs}, include/ruv_temporal.h, CMakeLists.txt, .cargo/config.toml | **Done.** Source compiles host-side syntax check; not yet cross-compiled to xtensa. | +| 4 — Scaffold | Cargo.toml, src/{lib.rs,window.rs,weights.rs}, include/ruv_temporal.h, CMakeLists.txt, .cargo/config.toml | **Done.** | | 5 — Cross-compile | `cargo +esp build --release --target xtensa-esp32s3-none-elf` produces `libruv_temporal.a`. | **Blocked** — see below. | -| 6 — Wire from edge_processing.c | FreeRTOS task on Core 1, queue from adaptive_controller fast loop, push() in fast tick, classify() at 1 Hz, emit `0xC5110007` packet. | Not started. | -| 7 — COM8 validation | Flash 8MB build with `CONFIG_CSI_TEMPORAL_HEAD_ENABLED=y`, soak ≥5 min, check no Tmr Svc / task_wdt overflow. | Not started. | +| 6 — Wire from edge_processing.c | FreeRTOS task on Core 1, queue from adaptive_controller fast loop, push() in fast tick, classify() at 1 Hz, emit `0xC5110007` packet. | **Done** in `main/temporal_task.c` (no-op shim path verified by 8MB firmware build with feature off). | +| 7 — COM8 validation | Flash 8MB build with `CONFIG_CSI_TEMPORAL_HEAD_ENABLED=y`, soak ≥5 min, check no Tmr Svc / task_wdt overflow. | Pending board reattach. | + +## Module map + +| File | Purpose | +|------|---------| +| `src/lib.rs` | C ABI: `ruv_temporal_init / push / classify / destroy / kernel_self_test` | +| `src/window.rs` | `FrameRing` rolling buffer used by `ruv_temporal_push` | +| `src/weights.rs` | Loader-side mirror of host `wifi_densepose_temporal::weights`. 
Parses the `.rvne` blob format (magic `RVNE`, version 1, FP32/FP16, CRC32-IEEE). Bit-exact with the host crate; a blob produced by the host's `WeightBlob::serialize()` parses here byte-for-byte. | +| `include/ruv_temporal.h` | Public C header consumed by `main/temporal_task.c` | +| `shim.c` | Empty C shim for `idf_component_register` | ## Phase 5 blocker — esp toolchain rust-src bug diff --git a/firmware/esp32-csi-node/components/ruv_temporal/src/lib.rs b/firmware/esp32-csi-node/components/ruv_temporal/src/lib.rs index 24fd6a07..8a64b7c3 100644 --- a/firmware/esp32-csi-node/components/ruv_temporal/src/lib.rs +++ b/firmware/esp32-csi-node/components/ruv_temporal/src/lib.rs @@ -24,7 +24,9 @@ extern crate alloc; use alloc::boxed::Box; use core::ffi::c_void; +mod weights; mod window; +use weights::{WeightBlobView, WeightLoadError}; use window::FrameRing; // ---- ESP-IDF compatible error codes --------------------------------------- @@ -87,8 +89,35 @@ pub extern "C" fn ruv_temporal_init( if out_ctx.is_null() || input_dim == 0 || window_len == 0 || n_classes == 0 { return ESP_ERR_INVALID_ARG; } - // Phase 5: deserialize weights blob; Phase 4 just records the size. - let _ = (weights, weights_len); + + // Optional weights blob: when caller passes a non-NULL pointer, + // parse and validate it. Caller can pass NULL during the Phase 4/5 + // bring-up window when the kernel forward isn't actually consuming + // weights yet — we just want the parse path itself proven on the + // device. Once Phase 5 unblocks and the kernel is wired, Phase 6 + // makes a non-NULL weights argument required. + if !weights.is_null() && weights_len > 0 { + // SAFETY: caller asserts the buffer covers `weights_len` bytes + // and outlives this call. Borrowed-slice parse — no copy. + let buf = unsafe { core::slice::from_raw_parts(weights, weights_len) }; + match WeightBlobView::parse(buf) { + Ok(view) => { + // Sanity-check that the blob's declared shape matches + // the runtime arguments. 
A blob with input_dim=32 in + // a context configured for input_dim=16 is a deploy bug + // we want to catch at init() not at first classify(). + if view.header.input_dim as u32 != input_dim + || view.header.n_classes as u32 != n_classes + { + return ESP_ERR_INVALID_ARG; + } + // Phase 5+: stash view into the context for the kernel + // to consume. For now the parse itself is the proof + // that the format crossed the host/firmware boundary. + } + Err(e) => return weights::weight_load_err_to_esp(&e), + } + } let ring = match FrameRing::new(window_len as usize, input_dim as usize) { Some(r) => r, diff --git a/firmware/esp32-csi-node/components/ruv_temporal/src/weights.rs b/firmware/esp32-csi-node/components/ruv_temporal/src/weights.rs new file mode 100644 index 00000000..2bf0169a --- /dev/null +++ b/firmware/esp32-csi-node/components/ruv_temporal/src/weights.rs @@ -0,0 +1,194 @@ +// Firmware-side mirror of `wifi-densepose-temporal::weights`. Same wire +// format, same magic, same CRC polynomial — a blob produced by the +// host's `WeightBlob::serialize()` parses here byte-for-byte. +// +// no_std + alloc. The host side keeps weights as `Vec` because it +// owns the buffer; the firmware loader takes a borrowed `&[u8]` slice +// (the blob lives in flash via EMBED_FILES, or a heap mmap from NVS, +// neither of which the loader should re-allocate). +// +// Stays *byte-exact* in lockstep with `v2/crates/wifi-densepose-temporal/src/weights.rs`. +// When the host format changes, this file changes in the same commit +// and bumps `BLOB_VERSION`; mismatched versions refuse to load. 
use core::fmt;

/// Blob magic. Read as a hex literal the bytes spell "RVNE"; `parse`
/// decodes the field little-endian, so the on-wire byte order is
/// `45 4E 56 52`.
pub const BLOB_MAGIC: u32 = 0x5256_4E45; // "RVNE"
/// The only format version this loader accepts.
pub const BLOB_VERSION: u16 = 1;
/// Size in bytes of the fixed header that precedes the weight payload.
pub const BLOB_HEADER_LEN: usize = 24;
/// Size in bytes of the trailing CRC32 footer.
pub const BLOB_FOOTER_LEN: usize = 4;

// `esp_err_t` values returned through the C ABI, as defined by ESP-IDF's
// `esp_err.h`. Kept private to this module so the error mapping below is
// self-contained.
const ESP_ERR_INVALID_ARG: i32 = 0x102;
const ESP_ERR_INVALID_SIZE: i32 = 0x104;
const ESP_ERR_INVALID_CRC: i32 = 0x109;

/// Element type of the weight payload, selected by bit 0 of the header
/// flags byte.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum WeightDtype {
    F32,
    F16,
}

/// Decoded fixed-size header of a weights blob.
#[derive(Clone, Copy, Debug)]
pub struct WeightBlobHeader {
    pub dtype: WeightDtype,
    pub input_dim: u16,
    pub n_q_heads: u16,
    pub n_kv_heads: u16,
    pub head_dim: u16,
    pub n_layers: u16,
    pub n_classes: u16,
}

impl WeightBlobHeader {
    /// Size in bytes of one weight element for this blob's dtype.
    pub fn elem_bytes(&self) -> usize {
        match self.dtype {
            WeightDtype::F32 => 4,
            WeightDtype::F16 => 2,
        }
    }

    /// Reject shapes that cannot describe a usable model.
    ///
    /// Checks run in a fixed order, so a header with several problems
    /// always reports the same error: zero dimensions first, then the
    /// query/KV head ratio, then layer and class counts.
    fn validate(&self) -> Result<(), WeightLoadError> {
        if self.input_dim == 0
            || self.n_q_heads == 0
            || self.n_kv_heads == 0
            || self.head_dim == 0
        {
            return Err(WeightLoadError::ZeroDim);
        }
        // `n_kv_heads` is known to be non-zero here, so `%` cannot trap.
        if self.n_q_heads % self.n_kv_heads != 0 {
            return Err(WeightLoadError::InvalidGqaRatio);
        }
        if self.n_layers == 0 || self.n_classes < 2 {
            return Err(WeightLoadError::DegenerateShape);
        }
        Ok(())
    }
}

/// A parsed view into a weights blob. Holds borrowed slices into the
/// caller-owned buffer — no allocation, no copy. The firmware's
/// kernel reads weights directly from this view.
#[derive(Clone, Copy)]
pub struct WeightBlobView<'a> {
    pub header: WeightBlobHeader,
    pub weights: &'a [u8],
}

impl<'a> WeightBlobView<'a> {
    /// Parse a blob, validating magic / version / size / CRC / shape.
    /// Returns a borrowed view; the input `buf` must outlive the view.
    ///
    /// Layout (all integers little-endian):
    ///
    /// | Offset | Size | Field                                   |
    /// |--------|------|-----------------------------------------|
    /// | 0      | 4    | magic (`BLOB_MAGIC`)                    |
    /// | 4      | 2    | version (`BLOB_VERSION`)                |
    /// | 6      | 1    | flags (bit 0: 0 = F32, 1 = F16)         |
    /// | 7      | 1    | not read by this parser                 |
    /// | 8      | 12   | input_dim, n_q_heads, n_kv_heads, head_dim, n_layers, n_classes (`u16` each) |
    /// | 20     | 4    | payload length in bytes                 |
    /// | 24     | n    | weight payload                          |
    /// | 24 + n | 4    | CRC32 over every preceding byte         |
    ///
    /// # Errors
    ///
    /// Checks run in this order and the first failure is returned:
    /// `TooShort`, `BadMagic`, `WrongVersion`, `SizeMismatch`,
    /// `CrcMismatch`, then the shape errors from header validation.
    pub fn parse(buf: &'a [u8]) -> Result<Self, WeightLoadError> {
        if buf.len() < BLOB_HEADER_LEN + BLOB_FOOTER_LEN {
            return Err(WeightLoadError::TooShort);
        }

        if read_u32_le(buf, 0) != BLOB_MAGIC {
            return Err(WeightLoadError::BadMagic);
        }
        let version = read_u16_le(buf, 4);
        if version != BLOB_VERSION {
            return Err(WeightLoadError::WrongVersion(version));
        }

        // Only bit 0 of the flags byte is interpreted; other bits are ignored.
        let dtype = if buf[6] & 0x01 == 0 {
            WeightDtype::F32
        } else {
            WeightDtype::F16
        };
        let header = WeightBlobHeader {
            dtype,
            input_dim: read_u16_le(buf, 8),
            n_q_heads: read_u16_le(buf, 10),
            n_kv_heads: read_u16_le(buf, 12),
            head_dim: read_u16_le(buf, 14),
            n_layers: read_u16_le(buf, 16),
            n_classes: read_u16_le(buf, 18),
        };
        let declared_len = read_u32_le(buf, 20);

        // Derive the payload size from the real buffer length instead of
        // adding the untrusted `declared_len` to the framing sizes: on a
        // 32-bit target that sum can overflow `usize`. The length check
        // above guarantees these subtractions cannot underflow.
        let crc_at = buf.len() - BLOB_FOOTER_LEN;
        let payload_len = crc_at - BLOB_HEADER_LEN;
        if payload_len as u64 != u64::from(declared_len) {
            return Err(WeightLoadError::SizeMismatch);
        }

        // The CRC covers header + payload, i.e. everything before the footer.
        if read_u32_le(buf, crc_at) != crc32_ieee(&buf[..crc_at]) {
            return Err(WeightLoadError::CrcMismatch);
        }

        header.validate()?;

        Ok(Self {
            header,
            weights: &buf[BLOB_HEADER_LEN..crc_at],
        })
    }
}

/// Read a little-endian `u16` at byte offset `at`.
/// Panics if `at + 2` exceeds `buf.len()`; callers bounds-check first.
fn read_u16_le(buf: &[u8], at: usize) -> u16 {
    u16::from_le_bytes([buf[at], buf[at + 1]])
}

/// Read a little-endian `u32` at byte offset `at`.
/// Panics if `at + 4` exceeds `buf.len()`; callers bounds-check first.
fn read_u32_le(buf: &[u8], at: usize) -> u32 {
    u32::from_le_bytes([buf[at], buf[at + 1], buf[at + 2], buf[at + 3]])
}

/// Loader-side error. Distinct from the host-side `TemporalError` so
/// the firmware can map specific cases to specific `esp_err_t` codes.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum WeightLoadError {
    TooShort,
    BadMagic,
    WrongVersion(u16),
    SizeMismatch,
    CrcMismatch,
    ZeroDim,
    InvalidGqaRatio,
    DegenerateShape,
}

impl fmt::Display for WeightLoadError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::TooShort => write!(f, "weight blob too short"),
            Self::BadMagic => write!(f, "weight blob: bad magic"),
            Self::WrongVersion(v) => write!(f, "weight blob: unsupported version {}", v),
            Self::SizeMismatch => write!(f, "weight blob: declared length doesn't match buffer"),
            Self::CrcMismatch => write!(f, "weight blob: CRC32 mismatch"),
            Self::ZeroDim => write!(f, "weight blob: zero-valued dimension(s)"),
            Self::InvalidGqaRatio => write!(f, "weight blob: n_q_heads not divisible by n_kv_heads"),
            Self::DegenerateShape => write!(f, "weight blob: n_layers=0 or n_classes<2"),
        }
    }
}

/// Map loader errors to esp_err_t-style codes for the C ABI. Defined
/// here rather than in lib.rs so the mapping stays adjacent to the
/// error type and can't drift.
///
/// Framing problems map to `ESP_ERR_INVALID_ARG` (0x102), a checksum
/// failure to `ESP_ERR_INVALID_CRC` (0x109), and shape problems to
/// `ESP_ERR_INVALID_SIZE` (0x104).
pub const fn weight_load_err_to_esp(err: &WeightLoadError) -> i32 {
    match err {
        WeightLoadError::TooShort
        | WeightLoadError::BadMagic
        | WeightLoadError::WrongVersion(_)
        | WeightLoadError::SizeMismatch => ESP_ERR_INVALID_ARG,
        WeightLoadError::CrcMismatch => ESP_ERR_INVALID_CRC,
        WeightLoadError::ZeroDim
        | WeightLoadError::InvalidGqaRatio
        | WeightLoadError::DegenerateShape => ESP_ERR_INVALID_SIZE,
    }
}

/// Same polynomial as `temporal_task.c::crc32_ieee` and the host-side
/// `wifi_densepose_temporal::weights::crc32_ieee`. The whole point of
/// keeping it bit-for-bit identical across all three sites is so a
/// blob round-trips without re-computing.
///
/// Bitwise (table-free) reflected CRC-32: initial value `0xFFFF_FFFF`,
/// polynomial `0xEDB8_8320`, final bitwise inversion.
fn crc32_ieee(data: &[u8]) -> u32 {
    let mut crc = 0xFFFF_FFFFu32;
    for &b in data {
        crc ^= b as u32;
        for _ in 0..8 {
            // All-ones when the low bit is set, zero otherwise: applies
            // the polynomial without a branch.
            let mask = 0u32.wrapping_sub(crc & 1);
            crc = (crc >> 1) ^ (0xEDB8_8320 & mask);
        }
    }
    !crc
}