This commit is contained in:
Locke Werks 2026-06-04 05:15:34 +08:00 committed by GitHub
commit 5041344604
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 285 additions and 4 deletions

View File

@ -176,9 +176,9 @@ huggingface-cli download ruvnet/wifi-densepose-pretrained --local-dir models/wif
|----------|-------------|--------| |----------|-------------|--------|
| Python training / evaluation / embedding extraction | `model.safetensors` | ✅ Works — load with `safetensors.torch.load_file` | | Python training / evaluation / embedding extraction | `model.safetensors` | ✅ Works — load with `safetensors.torch.load_file` |
| Inspect / re-export the bundle | `model.rvf.jsonl` (line-by-line JSON) | ✅ Works — plain JSONL | | Inspect / re-export the bundle | `model.rvf.jsonl` (line-by-line JSON) | ✅ Works — plain JSONL |
| Sensing-server `--model <PATH>` flag | binary RVF (`RVFS` magic) | ⚠️ Loader does not yet accept the JSONL container | | Sensing-server `--model <PATH>` flag | binary RVF (`RVFS` magic) or JSONL container | ✅ Works — loader sniffs the input and maps JSONL lines onto in-memory binary segments |
**Known gap:** the HF model ships in JSONL RVF format, but `v2/crates/wifi-densepose-sensing-server/src/rvf_container.rs` only parses the binary RVF segment format. Pointing `--model` at `model.rvf.jsonl` currently errors with `invalid magic at offset 0: expected 0x52564653, got 0x7974227B` and the live pipeline degrades to null output rather than falling back to heuristic mode — so for the live sensing-server, run **without** `--model` until a JSONL adapter lands (or the model is re-published as binary RVF). Use the weights from Python / training in the meantime. **JSONL loader:** `RvfReader::from_bytes` (and `from_file`) in `v2/crates/wifi-densepose-sensing-server/src/rvf_container.rs` sniff the leading byte and dispatch to a JSONL parser when the file starts with `{`. The parser maps `metadata` lines onto a synthetic manifest segment (so `ProgressiveLoader::load_layer_a` reports the real model id and version), `quantization` lines onto a `SEG_QUANT` segment, and everything else (`encoder`, `lora`, `ewc`, `wiflow`, …) into a `SEG_META` payload preserved verbatim. Unrecognised inputs return an explicit error rather than degrading to null output. Note that the JSONL bundle deliberately does **not** carry the f32 weight matrix — those ship separately as `model.safetensors` / `model-qN.bin` — so the live inference path still needs one of those companion files when convolution weights are required.
**Quantization choices** (all in the HF repo): `model-q2.bin` (4 KB) · `model-q4.bin` ⭐ recommended (8 KB) · `model-q8.bin` (16 KB) · `model.safetensors` full (48 KB) **Quantization choices** (all in the HF repo): `model-q2.bin` (4 KB) · `model-q4.bin` ⭐ recommended (8 KB) · `model-q8.bin` (16 KB) · `model.safetensors` full (48 KB)

View File

@ -438,7 +438,34 @@ pub struct RvfReader {
impl RvfReader { impl RvfReader {
/// Parse an RVF container from a byte slice. /// Parse an RVF container from a byte slice.
///
/// Sniffs the input to support two on-disk encodings:
/// - Binary segment format (default): 64-byte segment headers + payloads,
/// identified by the leading `RVFS` magic (0x52564653, little-endian).
/// - Line-delimited JSON ("RVF JSONL"): one JSON object per line, identified
/// by a leading `{` after stripping ASCII whitespace. Used by the
/// HuggingFace release bundle (`model.rvf.jsonl`). Lines are mapped to
/// equivalent binary segments in-memory so downstream code can use the
/// same `RvfReader`/`ProgressiveLoader` plumbing.
///
/// Returns a clear error if neither format is recognised, so callers can
/// surface that to the operator instead of silently degrading.
pub fn from_bytes(data: &[u8]) -> Result<Self, String> { pub fn from_bytes(data: &[u8]) -> Result<Self, String> {
// Sniff: first non-whitespace byte tells us which encoding to use.
let first_non_ws = data.iter().copied().find(|b| !b.is_ascii_whitespace());
if let Some(b) = first_non_ws {
if b == b'{' || b == b'[' {
return Self::from_jsonl_bytes(data);
}
}
Self::from_binary_bytes(data)
}
/// Parse the binary RVF segment format. Most callers should use
/// [`RvfReader::from_bytes`], which dispatches between formats based on a
/// magic-byte sniff.
fn from_binary_bytes(data: &[u8]) -> Result<Self, String> {
let mut segments = Vec::new(); let mut segments = Vec::new();
let mut offset = 0; let mut offset = 0;
@ -454,7 +481,9 @@ impl RvfReader {
if header.magic != SEGMENT_MAGIC { if header.magic != SEGMENT_MAGIC {
return Err(format!( return Err(format!(
"invalid magic at offset {offset}: expected 0x{SEGMENT_MAGIC:08X}, \ "invalid magic at offset {offset}: expected 0x{SEGMENT_MAGIC:08X}, \
got 0x{:08X}", got 0x{:08X} (tip: if this file starts with '{{', it is the JSONL \
RVF format pass the raw bytes through RvfReader::from_bytes so it \
can be sniffed and converted)",
header.magic header.magic
)); ));
} }
@ -504,13 +533,131 @@ impl RvfReader {
}) })
} }
/// Read an RVF container from a file. /// Read an RVF container from a file. Format is sniffed automatically; both
/// the binary `RVFS` container and the JSONL container shipped by the
/// HuggingFace release bundle are accepted.
pub fn from_file(path: &std::path::Path) -> Result<Self, String> { pub fn from_file(path: &std::path::Path) -> Result<Self, String> {
let data = let data =
std::fs::read(path).map_err(|e| format!("failed to read {}: {e}", path.display()))?; std::fs::read(path).map_err(|e| format!("failed to read {}: {e}", path.display()))?;
Self::from_bytes(&data) Self::from_bytes(&data)
} }
/// Parse the JSONL RVF container format produced by the HuggingFace release
/// bundle (`model.rvf.jsonl`).
///
/// Each non-blank line must be a JSON object with a string `"type"` field.
/// Known line types are mapped onto equivalent binary segments so the rest
/// of the pipeline (manifest lookup, progressive loader, etc.) keeps working
/// unmodified:
///
/// | JSONL `type`     | Binary segment      | Notes                                     |
/// |------------------|---------------------|-------------------------------------------|
/// | `metadata`       | `SEG_MANIFEST`      | `name` → model id, `version`,             |
/// |                  |                     | `architecture` → description; the full    |
/// |                  |                     | object is also kept in the metadata list  |
/// | `quantization`   | `SEG_QUANT`         | JSON payload; `quant_type` is filled in   |
/// |                  |                     | as `"rvf-jsonl"` when absent              |
/// | (anything else)  | metadata list       | `wiflow`, `encoder`, `lora`, `ewc`, …     |
///
/// All non-quantization lines are bundled, in file order, into one metadata
/// segment shaped as `{"source_format": "rvf-jsonl", "lines": [...]}`.
///
/// The JSONL container does not carry the float weight matrix itself (the
/// HF bundle ships those separately in `model.safetensors` /
/// `model-qN.bin`), so no weights segment is emitted here. Downstream code
/// that requires weights must read them from one of the sibling artifacts.
///
/// # Errors
///
/// Returns a message prefixed with `JSONL RVF:` when the input is not valid
/// UTF-8, when a non-blank line is not valid JSON, is not a JSON object, or
/// lacks a string `type` field (each with its 1-based line number), when a
/// quantization record cannot be re-encoded, or when the input contains no
/// records at all. Errors from re-parsing the generated binary container are
/// passed through unchanged.
fn from_jsonl_bytes(data: &[u8]) -> Result<Self, String> {
    let text = std::str::from_utf8(data)
        .map_err(|e| format!("JSONL RVF: file is not valid UTF-8: {e}"))?;

    let mut builder = RvfBuilder::new();
    // Counts every accepted record, including `quantization` lines, so that a
    // bundle consisting solely of quantization info is not misreported as
    // containing no typed objects.
    let mut record_count = 0usize;
    let mut extra_meta: Vec<serde_json::Value> = Vec::new();

    for (index, raw_line) in text.lines().enumerate() {
        let line_no = index + 1;
        let line = raw_line.trim();
        if line.is_empty() {
            continue;
        }

        let value: serde_json::Value = serde_json::from_str(line)
            .map_err(|e| format!("JSONL RVF: line {line_no} is not valid JSON: {e}"))?;
        let obj = value.as_object().ok_or_else(|| {
            format!("JSONL RVF: line {line_no} must be a JSON object, got {value}")
        })?;
        let ty = obj.get("type").and_then(|v| v.as_str()).ok_or_else(|| {
            format!("JSONL RVF: line {line_no} is missing required string field `type`")
        })?;
        record_count += 1;

        match ty {
            "metadata" => {
                // Map to a SEG_MANIFEST so manifest()/load_layer_a() find it.
                let model_id = obj
                    .get("name")
                    .and_then(|v| v.as_str())
                    .unwrap_or("unknown");
                let version = obj
                    .get("version")
                    .and_then(|v| v.as_str())
                    .unwrap_or("0.0.0");
                let description = obj
                    .get("architecture")
                    .and_then(|v| v.as_str())
                    .unwrap_or("wifi-densepose JSONL container");
                builder.add_manifest(model_id, version, description);
                // The manifest only carries three fields; keep the complete
                // object (training stats, custom dims, ...) in the metadata
                // list so nothing from the line is lost.
                extra_meta.push(value.clone());
            }
            "quantization" => {
                // Stored as JSON in a SEG_QUANT segment. `quant_type` may be
                // absent in the bundle, so a marker value is filled in.
                let mut quant = obj.clone();
                quant
                    .entry("quant_type".to_string())
                    .or_insert(serde_json::Value::String("rvf-jsonl".to_string()));
                // Surface an encoding failure rather than silently writing an
                // empty SEG_QUANT payload.
                let payload = serde_json::to_vec(&quant).map_err(|e| {
                    format!(
                        "JSONL RVF: line {line_no}: failed to encode quantization record: {e}"
                    )
                })?;
                builder.add_raw_segment(SEG_QUANT, &payload);
            }
            // Everything else (encoder, lora, ewc, wiflow, ...) is preserved
            // verbatim in the metadata list for downstream introspection.
            _ => extra_meta.push(value.clone()),
        }
    }

    if record_count == 0 {
        return Err(
            "JSONL RVF: file contained no JSON objects with a `type` field".to_string(),
        );
    }

    // Bundle every non-quantization line into a single metadata segment so
    // callers can recover the full picture via metadata() without a richer
    // segment vocabulary just for the JSONL adapter.
    let meta_payload = serde_json::json!({
        "source_format": "rvf-jsonl",
        "lines": extra_meta,
    });
    builder.add_metadata(&meta_payload);

    // Round-trip through the binary parser so JSONL-derived containers get
    // the same validation as native binary ones.
    let bytes = builder.build();
    Self::from_binary_bytes(&bytes)
}
/// Find the first segment with the given type and return its payload. /// Find the first segment with the given type and return its payload.
pub fn find_segment(&self, seg_type: u8) -> Option<&[u8]> { pub fn find_segment(&self, seg_type: u8) -> Option<&[u8]> {
self.segments self.segments
@ -1074,6 +1221,140 @@ mod tests {
assert!(reader.lora_profile("nonexistent").is_none()); assert!(reader.lora_profile("nonexistent").is_none());
} }
// ── JSONL RVF container tests (HuggingFace bundle compatibility) ────────
/// JSONL fixture modelled on `model.rvf.jsonl` from the
/// `ruvnet/wifi-densepose-pretrained` HuggingFace bundle (v1.0.0).
///
/// Five newline-terminated records, in order: `metadata`, `encoder`, `lora`,
/// `ewc`, `quantization`. Written as one literal; each trailing `\` joins
/// the next source line without inserting any bytes.
const SAMPLE_HF_JSONL: &str = "\
    {\"type\":\"metadata\",\"name\":\"wifi-densepose-csi-embedding\",\
    \"version\":\"1.0.0\",\"architecture\":\"csi-encoder-8-64-128\",\
    \"training\":{\"steps\":12212300,\"loss\":0.065,\"learningRate\":0.001},\
    \"custom\":{\"inputDim\":8,\"hiddenDim\":64,\"embeddingDim\":128}}\n\
    {\"type\":\"encoder\",\"w1_shape\":[8,64],\"w2_shape\":[64,128]}\n\
    {\"type\":\"lora\",\"config\":{\"rank\":8,\"alpha\":16}}\n\
    {\"type\":\"ewc\",\"stats\":{\"tasksLearned\":4}}\n\
    {\"type\":\"quantization\",\"default_bits\":4,\"variants\":[2,4,8]}\n\
    ";
/// A payload whose first byte is `{` must be routed to the JSONL parser and
/// yield a manifest built from the `metadata` line.
#[test]
fn from_bytes_dispatches_to_jsonl_on_brace() {
    let parsed = RvfReader::from_bytes(SAMPLE_HF_JSONL.as_bytes());
    let reader = parsed.expect("HF JSONL bundle should load via sniff");
    let manifest = reader.manifest().expect("manifest should be synthesised");

    // `name` -> model_id, `version` -> version, `architecture` -> description.
    assert_eq!(manifest["description"], "csi-encoder-8-64-128");
    assert_eq!(manifest["version"], "1.0.0");
    assert_eq!(manifest["model_id"], "wifi-densepose-csi-embedding");
}
/// ASCII whitespace ahead of the first `{` must not defeat format detection.
#[test]
fn jsonl_sniff_tolerates_leading_whitespace() {
    let mut padded = String::from("\n \t");
    padded.push_str(SAMPLE_HF_JSONL);

    let reader = RvfReader::from_bytes(padded.as_bytes()).expect("whitespace prefix ok");
    assert!(reader.manifest().is_some());
}
/// The `quantization` line must surface through `quant_info()` with its
/// original fields intact.
#[test]
fn jsonl_quantization_becomes_quant_segment() {
    let bytes = SAMPLE_HF_JSONL.as_bytes();
    let reader = RvfReader::from_bytes(bytes).unwrap();
    let quant = reader
        .quant_info()
        .expect("quantization line should map to SEG_QUANT");

    assert_eq!(quant["variants"], serde_json::json!([2, 4, 8]));
    assert_eq!(quant["default_bits"], 4);
}
/// Every non-quantization line must be preserved, in file order, under
/// `lines` in the aggregated metadata; the quantization line must not appear
/// there because it is stored in its own SEG_QUANT segment.
#[test]
fn jsonl_preserves_other_lines_in_metadata() {
    let reader = RvfReader::from_bytes(SAMPLE_HF_JSONL.as_bytes()).unwrap();
    let meta = reader.metadata().expect("aggregated metadata present");
    assert_eq!(meta["source_format"], "rvf-jsonl");

    let lines = meta["lines"]
        .as_array()
        .expect("lines must be an array");
    // Pin the exact contents rather than a lower bound: a lower bound would
    // still pass if the quantization line leaked into the metadata list.
    let types: Vec<&str> = lines
        .iter()
        .filter_map(|v| v["type"].as_str())
        .collect();
    assert_eq!(
        types,
        vec!["metadata", "encoder", "lora", "ewc"],
        "unexpected metadata line types"
    );
    assert_eq!(lines.len(), 4, "got {} lines", lines.len());
}
/// The JSONL bundle keeps its f32 matrices in companion safetensors / qN
/// files, so a reader built from it must report that it has no weights.
#[test]
fn jsonl_no_weights_segment_present() {
    let bytes = SAMPLE_HF_JSONL.as_bytes();
    let reader = RvfReader::from_bytes(bytes).unwrap();

    let has_weights = reader.info().has_weights;
    assert!(reader.weights().is_none(), "JSONL must not synthesise weights");
    assert!(!has_weights);
}
/// Integration check: `ProgressiveLoader` must accept raw JSONL bytes and
/// report the real model name and version from Layer A.
#[test]
fn jsonl_progressive_loader_layer_a_works() {
    use crate::rvf_pipeline::ProgressiveLoader;

    // This is the path that failed before the loader could sniff JSONL.
    let created = ProgressiveLoader::new(SAMPLE_HF_JSONL.as_bytes());
    let mut loader = created.expect("progressive loader accepts JSONL bytes");
    let layer_a = loader
        .load_layer_a()
        .expect("layer A must populate from synthesised manifest");

    assert_eq!(layer_a.model_name, "wifi-densepose-csi-embedding");
    assert_eq!(layer_a.version, "1.0.0");
    assert!(layer_a.n_segments > 0);
}
/// A malformed line must produce an error that names the JSONL parser and the
/// 1-based number of the offending line.
#[test]
fn jsonl_invalid_json_line_is_explicit() {
    let bad = "{\"type\":\"metadata\",\"name\":\"x\"}\nthis is not json\n".as_bytes();
    let err = RvfReader::from_bytes(bad).unwrap_err();

    for needle in ["JSONL RVF", "line 2"] {
        assert!(err.contains(needle), "got: {err}");
    }
}
/// A JSON object without a `type` field must be rejected with a message that
/// names the missing field.
#[test]
fn jsonl_missing_type_field_is_explicit() {
    let untyped = "{\"name\":\"no-type-here\"}\n".as_bytes();
    let err = RvfReader::from_bytes(untyped).unwrap_err();

    let mentions_type = err.contains("missing required string field `type`");
    assert!(mentions_type, "got: {err}");
}
/// Non-JSON input with a wrong magic number must fail on the binary path, and
/// the error must carry the JSONL hint so operators can grep for it.
#[test]
fn binary_error_mentions_jsonl_hint() {
    // 128 bytes: a bogus little-endian magic followed by zero padding.
    let mut data = 0xDEAD_BEEFu32.to_le_bytes().to_vec();
    data.resize(128, 0u8);

    let err = RvfReader::from_bytes(&data).unwrap_err();
    assert!(err.contains("invalid magic"));
    assert!(err.contains("JSONL"), "expected JSONL hint, got: {err}");
}
/// A single `metadata` line with only `name` and `version` is enough to
/// produce a manifest.
#[test]
fn jsonl_minimal_metadata_only() {
    let minimal = "{\"type\":\"metadata\",\"name\":\"tiny\",\"version\":\"0.0.1\"}\n";
    let reader = RvfReader::from_bytes(minimal.as_bytes()).unwrap();
    let manifest = reader.manifest().unwrap();

    assert_eq!(manifest["version"], "0.0.1");
    assert_eq!(manifest["model_id"], "tiny");
}
/// A file that opens with `{` but never completes a JSON object must be
/// rejected with an explicit JSONL error rather than yielding an empty
/// reader.
#[test]
fn jsonl_blank_lines_only_rejected() {
    // Whitespace-only input never reaches the JSONL parser, because the sniff
    // needs a leading `{` or `[`. So the smallest JSONL-routed input with no
    // usable record is a lone `{` followed by blank lines. That `{` is its
    // own line and is not valid JSON, so the failure is reported against
    // line 1.
    let truncated = b"{\n\n";
    let err = RvfReader::from_bytes(truncated).unwrap_err();
    assert!(err.contains("JSONL RVF"), "got: {err}");
    assert!(err.contains("line 1"), "got: {err}");
    assert!(err.contains("not valid JSON"), "got: {err}");
}
#[test] #[test]
fn test_rvf_multiple_lora_profiles() { fn test_rvf_multiple_lora_profiles() {
let w1: Vec<f32> = vec![1.0, 2.0, 3.0]; let w1: Vec<f32> = vec![1.0, 2.0, 3.0];