|
|
|
|
@ -438,7 +438,34 @@ pub struct RvfReader {
|
|
|
|
|
|
|
|
|
|
impl RvfReader {
|
|
|
|
|
/// Parse an RVF container from a byte slice.
|
|
|
|
|
///
|
|
|
|
|
/// Sniffs the input to support two on-disk encodings:
|
|
|
|
|
/// - Binary segment format (default): 64-byte segment headers + payloads,
|
|
|
|
|
/// identified by the leading `RVFS` magic (0x52564653, little-endian).
|
|
|
|
|
/// - Line-delimited JSON ("RVF JSONL"): one JSON object per line, identified
|
|
|
|
|
/// by a leading `{` after stripping ASCII whitespace. Used by the
|
|
|
|
|
/// HuggingFace release bundle (`model.rvf.jsonl`). Lines are mapped to
|
|
|
|
|
/// equivalent binary segments in-memory so downstream code can use the
|
|
|
|
|
/// same `RvfReader`/`ProgressiveLoader` plumbing.
|
|
|
|
|
///
|
|
|
|
|
/// Returns a clear error if neither format is recognised, so callers can
|
|
|
|
|
/// surface that to the operator instead of silently degrading.
|
|
|
|
|
pub fn from_bytes(data: &[u8]) -> Result<Self, String> {
|
|
|
|
|
// Sniff: first non-whitespace byte tells us which encoding to use.
|
|
|
|
|
let first_non_ws = data.iter().copied().find(|b| !b.is_ascii_whitespace());
|
|
|
|
|
if let Some(b) = first_non_ws {
|
|
|
|
|
if b == b'{' || b == b'[' {
|
|
|
|
|
return Self::from_jsonl_bytes(data);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Self::from_binary_bytes(data)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Parse the binary RVF segment format. Most callers should use
|
|
|
|
|
/// [`RvfReader::from_bytes`], which dispatches between formats based on a
|
|
|
|
|
/// magic-byte sniff.
|
|
|
|
|
fn from_binary_bytes(data: &[u8]) -> Result<Self, String> {
|
|
|
|
|
let mut segments = Vec::new();
|
|
|
|
|
let mut offset = 0;
|
|
|
|
|
|
|
|
|
|
@ -454,7 +481,9 @@ impl RvfReader {
|
|
|
|
|
if header.magic != SEGMENT_MAGIC {
|
|
|
|
|
return Err(format!(
|
|
|
|
|
"invalid magic at offset {offset}: expected 0x{SEGMENT_MAGIC:08X}, \
|
|
|
|
|
got 0x{:08X}",
|
|
|
|
|
got 0x{:08X} (tip: if this file starts with '{{', it is the JSONL \
|
|
|
|
|
RVF format — pass the raw bytes through RvfReader::from_bytes so it \
|
|
|
|
|
can be sniffed and converted)",
|
|
|
|
|
header.magic
|
|
|
|
|
));
|
|
|
|
|
}
|
|
|
|
|
@ -504,13 +533,131 @@ impl RvfReader {
|
|
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Read an RVF container from a file.
|
|
|
|
|
/// Read an RVF container from a file. Format is sniffed automatically; both
|
|
|
|
|
/// the binary `RVFS` container and the JSONL container shipped by the
|
|
|
|
|
/// HuggingFace release bundle are accepted.
|
|
|
|
|
pub fn from_file(path: &std::path::Path) -> Result<Self, String> {
|
|
|
|
|
let data =
|
|
|
|
|
std::fs::read(path).map_err(|e| format!("failed to read {}: {e}", path.display()))?;
|
|
|
|
|
Self::from_bytes(&data)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Parse the JSONL RVF container format produced by the HuggingFace release
|
|
|
|
|
/// bundle (`model.rvf.jsonl`).
|
|
|
|
|
///
|
|
|
|
|
/// Each non-blank line must be a JSON object with a `"type"` field. The
|
|
|
|
|
/// loader maps known line types onto equivalent binary segments so the rest
|
|
|
|
|
/// of the pipeline (manifest lookup, progressive loader, etc.) keeps
|
|
|
|
|
/// working unmodified:
|
|
|
|
|
///
|
|
|
|
|
/// | JSONL `type` | Binary segment | Notes |
|
|
|
|
|
/// |------------------|---------------------|----------------------------------------|
|
|
|
|
|
/// | `metadata` | `SEG_MANIFEST` | `name` → `model_id`, `version` carried |
|
|
|
|
|
/// | `quantization` | `SEG_QUANT` | Stored verbatim as JSON payload |
|
|
|
|
|
/// | (anything else) | `SEG_META` | Stored verbatim — `wiflow`, `encoder`, |
|
|
|
|
|
/// | | | `lora`, `ewc`, etc. are all preserved |
|
|
|
|
|
///
|
|
|
|
|
/// The JSONL container does not carry the float weight matrix itself (the
|
|
|
|
|
/// HF bundle ships those separately in `model.safetensors` /
|
|
|
|
|
/// `model-qN.bin`), so the resulting `RvfReader` returns `None` from
|
|
|
|
|
/// [`RvfReader::weights`]. Downstream code that requires weights must read
|
|
|
|
|
/// them from one of the sibling artifacts; this matches the documented
|
|
|
|
|
/// behaviour of the HF bundle.
|
|
|
|
|
fn from_jsonl_bytes(data: &[u8]) -> Result<Self, String> {
|
|
|
|
|
let text = std::str::from_utf8(data)
|
|
|
|
|
.map_err(|e| format!("JSONL RVF: file is not valid UTF-8: {e}"))?;
|
|
|
|
|
|
|
|
|
|
let mut builder = RvfBuilder::new();
|
|
|
|
|
let mut saw_metadata = false;
|
|
|
|
|
let mut extra_meta: Vec<serde_json::Value> = Vec::new();
|
|
|
|
|
let mut line_no = 0usize;
|
|
|
|
|
|
|
|
|
|
for raw_line in text.lines() {
|
|
|
|
|
line_no += 1;
|
|
|
|
|
let line = raw_line.trim();
|
|
|
|
|
if line.is_empty() {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let value: serde_json::Value = serde_json::from_str(line).map_err(|e| {
|
|
|
|
|
format!("JSONL RVF: line {line_no} is not valid JSON: {e}")
|
|
|
|
|
})?;
|
|
|
|
|
|
|
|
|
|
let obj = value.as_object().ok_or_else(|| {
|
|
|
|
|
format!("JSONL RVF: line {line_no} must be a JSON object, got {value}")
|
|
|
|
|
})?;
|
|
|
|
|
|
|
|
|
|
let ty = obj
|
|
|
|
|
.get("type")
|
|
|
|
|
.and_then(|v| v.as_str())
|
|
|
|
|
.ok_or_else(|| {
|
|
|
|
|
format!("JSONL RVF: line {line_no} is missing required string field `type`")
|
|
|
|
|
})?
|
|
|
|
|
.to_string();
|
|
|
|
|
|
|
|
|
|
match ty.as_str() {
|
|
|
|
|
"metadata" => {
|
|
|
|
|
// Map to a SEG_MANIFEST so manifest()/load_layer_a() find it.
|
|
|
|
|
let model_id = obj
|
|
|
|
|
.get("name")
|
|
|
|
|
.and_then(|v| v.as_str())
|
|
|
|
|
.unwrap_or("unknown");
|
|
|
|
|
let version = obj
|
|
|
|
|
.get("version")
|
|
|
|
|
.and_then(|v| v.as_str())
|
|
|
|
|
.unwrap_or("0.0.0");
|
|
|
|
|
let description = obj
|
|
|
|
|
.get("architecture")
|
|
|
|
|
.and_then(|v| v.as_str())
|
|
|
|
|
.unwrap_or("wifi-densepose JSONL container");
|
|
|
|
|
builder.add_manifest(model_id, version, description);
|
|
|
|
|
|
|
|
|
|
// Preserve any extra metadata fields (training stats, custom
|
|
|
|
|
// dims, etc.) by also emitting a SEG_META segment with the
|
|
|
|
|
// full object.
|
|
|
|
|
extra_meta.push(value.clone());
|
|
|
|
|
saw_metadata = true;
|
|
|
|
|
}
|
|
|
|
|
"quantization" => {
|
|
|
|
|
// Encode as JSON in a SEG_QUANT segment. Use the original
|
|
|
|
|
// object verbatim — `quant_type` may be absent so we
|
|
|
|
|
// synthesise one for readability.
|
|
|
|
|
let mut quant = obj.clone();
|
|
|
|
|
quant
|
|
|
|
|
.entry("quant_type".to_string())
|
|
|
|
|
.or_insert(serde_json::Value::String("rvf-jsonl".to_string()));
|
|
|
|
|
builder
|
|
|
|
|
.add_raw_segment(SEG_QUANT, &serde_json::to_vec(&quant).unwrap_or_default());
|
|
|
|
|
}
|
|
|
|
|
// Everything else (encoder, lora, ewc, wiflow, ...) goes into a
|
|
|
|
|
// SEG_META segment so it round-trips through `metadata()` and
|
|
|
|
|
// downstream introspection tooling.
|
|
|
|
|
_ => {
|
|
|
|
|
extra_meta.push(value.clone());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if !saw_metadata && extra_meta.is_empty() {
|
|
|
|
|
return Err(
|
|
|
|
|
"JSONL RVF: file contained no JSON objects with a `type` field".to_string(),
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Bundle every non-quantization line into a single SEG_META segment so
|
|
|
|
|
// callers can recover the full picture via metadata() without us needing
|
|
|
|
|
// to invent a richer segment vocabulary just for the JSONL adapter.
|
|
|
|
|
let meta_payload = serde_json::json!({
|
|
|
|
|
"source_format": "rvf-jsonl",
|
|
|
|
|
"lines": extra_meta,
|
|
|
|
|
});
|
|
|
|
|
builder.add_metadata(&meta_payload);
|
|
|
|
|
|
|
|
|
|
// Round-trip via the binary path so we share validation + CRC checks.
|
|
|
|
|
let bytes = builder.build();
|
|
|
|
|
Self::from_binary_bytes(&bytes)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Find the first segment with the given type and return its payload.
|
|
|
|
|
pub fn find_segment(&self, seg_type: u8) -> Option<&[u8]> {
|
|
|
|
|
self.segments
|
|
|
|
|
@ -1074,6 +1221,140 @@ mod tests {
|
|
|
|
|
assert!(reader.lora_profile("nonexistent").is_none());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ── JSONL RVF container tests (HuggingFace bundle compatibility) ────────
|
|
|
|
|
|
|
|
|
|
/// Test fixture mirroring `model.rvf.jsonl` from the
/// `ruvnet/wifi-densepose-pretrained` HuggingFace bundle (v1.0.0): five
/// newline-terminated records (`metadata`, `encoder`, `lora`, `ewc`,
/// `quantization`). Raw string literals keep the JSON readable; the explicit
/// `"\n"` pieces are the record terminators.
const SAMPLE_HF_JSONL: &str = concat!(
    // Record 1: metadata (split across several literals for line length).
    r#"{"type":"metadata","name":"wifi-densepose-csi-embedding","#,
    r#""version":"1.0.0","architecture":"csi-encoder-8-64-128","#,
    r#""training":{"steps":12212300,"loss":0.065,"learningRate":0.001},"#,
    r#""custom":{"inputDim":8,"hiddenDim":64,"embeddingDim":128}}"#,
    "\n",
    // Records 2-5: one literal per record.
    r#"{"type":"encoder","w1_shape":[8,64],"w2_shape":[64,128]}"#,
    "\n",
    r#"{"type":"lora","config":{"rank":8,"alpha":16}}"#,
    "\n",
    r#"{"type":"ewc","stats":{"tasksLearned":4}}"#,
    "\n",
    r#"{"type":"quantization","default_bits":4,"variants":[2,4,8]}"#,
    "\n",
);
|
|
|
|
|
|
|
|
|
|
#[test]
fn from_bytes_dispatches_to_jsonl_on_brace() {
    // The fixture starts with `{`, so the sniff must select the JSONL parser.
    let parsed = RvfReader::from_bytes(SAMPLE_HF_JSONL.as_bytes());
    let reader = parsed.expect("HF JSONL bundle should load via sniff");

    // The `metadata` line must be visible through the manifest accessor.
    let manifest = reader.manifest().expect("manifest should be synthesised");
    let expected_fields = [
        ("model_id", "wifi-densepose-csi-embedding"),
        ("version", "1.0.0"),
        ("description", "csi-encoder-8-64-128"),
    ];
    for (field, want) in expected_fields {
        assert_eq!(manifest[field], want);
    }
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn jsonl_sniff_tolerates_leading_whitespace() {
    // Newline, space and tab ahead of the first `{` must not defeat the sniff.
    let mut padded = String::from("\n \t");
    padded.push_str(SAMPLE_HF_JSONL);

    let reader = RvfReader::from_bytes(padded.as_bytes()).expect("whitespace prefix ok");
    assert!(reader.manifest().is_some());
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn jsonl_quantization_becomes_quant_segment() {
    let reader = RvfReader::from_bytes(SAMPLE_HF_JSONL.as_bytes()).unwrap();

    // The `quantization` line must be readable back through quant_info().
    let quant = reader.quant_info();
    let quant = quant.expect("quantization line should map to SEG_QUANT");

    assert_eq!(quant["default_bits"], 4);
    let expected_variants = serde_json::json!([2, 4, 8]);
    assert_eq!(quant["variants"], expected_variants);
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn jsonl_preserves_other_lines_in_metadata() {
    let reader = RvfReader::from_bytes(SAMPLE_HF_JSONL.as_bytes()).unwrap();
    let meta = reader.metadata().expect("aggregated metadata present");
    assert_eq!(meta["source_format"], "rvf-jsonl");

    let lines = meta["lines"]
        .as_array()
        .expect("lines must be an array");

    // metadata + encoder + lora + ewc -> exactly 4 entries, in file order.
    // The quantization line goes to SEG_QUANT and must not appear here.
    assert_eq!(lines.len(), 4, "got {} lines", lines.len());
    let types: Vec<&str> = lines
        .iter()
        .filter_map(|v| v["type"].as_str())
        .collect();
    assert_eq!(types, ["metadata", "encoder", "lora", "ewc"]);
    assert!(!types.contains(&"quantization"));
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn jsonl_no_weights_segment_present() {
    // The JSONL bundle keeps its f32 matrices in companion safetensors / qN
    // files, so a reader built from it must report that it has no weights.
    let reader = RvfReader::from_bytes(SAMPLE_HF_JSONL.as_bytes()).unwrap();

    let weights = reader.weights();
    assert!(weights.is_none(), "JSONL must not synthesise weights");

    let info = reader.info();
    assert!(!info.has_weights);
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn jsonl_progressive_loader_layer_a_works() {
    use crate::rvf_pipeline::ProgressiveLoader;

    // Integration point that broke when the loader could not sniff JSONL:
    // Layer A must now report the model name taken from the JSONL metadata.
    let created = ProgressiveLoader::new(SAMPLE_HF_JSONL.as_bytes());
    let mut loader = created.expect("progressive loader accepts JSONL bytes");

    let layer_a = loader
        .load_layer_a()
        .expect("layer A must populate from synthesised manifest");

    assert_eq!(layer_a.model_name, "wifi-densepose-csi-embedding");
    assert_eq!(layer_a.version, "1.0.0");
    assert!(layer_a.n_segments > 0);
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn jsonl_invalid_json_line_is_explicit() {
    // Line 1 is a valid record; line 2 is plain text and must be reported
    // with the JSONL prefix and its line number.
    let input: &[u8] = b"{\"type\":\"metadata\",\"name\":\"x\"}\nthis is not json\n";
    let err = RvfReader::from_bytes(input).unwrap_err();

    for needle in ["JSONL RVF", "line 2"] {
        assert!(err.contains(needle), "got: {err}");
    }
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn jsonl_missing_type_field_is_explicit() {
    // A well-formed JSON object without `type` must be rejected by name.
    let input: &[u8] = b"{\"name\":\"no-type-here\"}\n";
    let err = RvfReader::from_bytes(input).unwrap_err();

    let mentions_type = err.contains("missing required string field `type`");
    assert!(mentions_type, "got: {err}");
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn binary_error_mentions_jsonl_hint() {
    // 128 bytes whose first four are a bogus little-endian magic. The first
    // byte is neither whitespace nor `{`/`[`, so the binary parser is used.
    let mut data = 0xDEAD_BEEFu32.to_le_bytes().to_vec();
    data.resize(128, 0u8);

    let err = RvfReader::from_bytes(&data).unwrap_err();
    assert!(err.contains("invalid magic"));
    // The hint text travels with the binary-path error so operators can grep it.
    assert!(err.contains("JSONL"), "expected JSONL hint, got: {err}");
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn jsonl_minimal_metadata_only() {
    // A single `metadata` record with just name + version is a valid file.
    let input: &[u8] = b"{\"type\":\"metadata\",\"name\":\"tiny\",\"version\":\"0.0.1\"}\n";
    let reader = RvfReader::from_bytes(input).unwrap();

    let manifest = reader.manifest().unwrap();
    assert_eq!(manifest["model_id"], "tiny");
    assert_eq!(manifest["version"], "0.0.1");
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn jsonl_blank_lines_only_rejected() {
    // The leading `{` trips the JSONL sniff, but the only non-blank line is a
    // truncated object followed by blank lines. The load must fail with a
    // line-numbered JSONL error rather than silently yield an empty reader.
    let only_blanks = b"{\n\n";
    let err = RvfReader::from_bytes(only_blanks).unwrap_err();
    assert!(err.contains("JSONL RVF"), "got: {err}");
    // Pin the specific failure so an unrelated JSONL error cannot satisfy
    // this test by accident.
    assert!(err.contains("line 1 is not valid JSON"), "got: {err}");
}
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
|
fn test_rvf_multiple_lora_profiles() {
|
|
|
|
|
let w1: Vec<f32> = vec![1.0, 2.0, 3.0];
|
|
|
|
|
|