diff --git a/.gitignore b/.gitignore index 30f4a0eb..b1c46f74 100644 --- a/.gitignore +++ b/.gitignore @@ -256,6 +256,12 @@ models/ demo_pointcloud.ply demo_splats.json +# Downloaded Hugging Face pretrained weights (not for git) +/models/wifi-densepose-pretrained/ + +# Session scratch files +/.tmp_* + # rvCSI napi-rs addon — generated by `napi build` (do not commit) v2/crates/rvcsi-node/*.node v2/crates/rvcsi-node/binding.js diff --git a/docs/user-guide.md b/docs/user-guide.md index 11e6695b..f872a870 100644 --- a/docs/user-guide.md +++ b/docs/user-guide.md @@ -1165,6 +1165,10 @@ cargo run -p wifi-densepose-sensing-server --release -- \ --source esp32 --udp-port 5005 --http-port 3000 ``` +### Verifying the downloaded bundle + +After downloading, run [`scripts/verify-hf-model.py`](../scripts/verify-hf-model.py) to confirm the bundle is intact and the weights load cleanly. The script prints the safetensors tensor inventory (names, shapes, dtypes), parses `model.rvf.jsonl` line by line, dumps `presence-head.json` / `config.json` / `training-metrics.json`, and — when `torch` is available — pushes a synthetic batch through the first encoder linear layer to confirm no NaN / Inf. It exits 0 on success, non-zero with a clear error otherwise, and accepts `--local-dir ` (default `models/wifi-densepose-pretrained/`). Heads up: the published `model.safetensors` header has three trailing NUL bytes that the strict Rust loader rejects (`trailing characters at line 1 column 1462`); the script falls back to a small pure-Python reader that strips the padding so you still get the full inventory. + See [RVF Model Containers](#rvf-model-containers) for the binary format the loader expects, and [Training a Model](#training-a-model) for using the encoder as a starting point for environment-specific fine-tuning. --- diff --git a/scripts/verify-hf-model.py b/scripts/verify-hf-model.py new file mode 100644 index 00000000..b6d0fcaf --- /dev/null +++ b/scripts/verify-hf-model.py @@ -0,0 +1,343 @@ +#!/usr/bin/env python3 +"""Verify the published ruvnet/wifi-densepose-pretrained model bundle. + +Inspects every file in the downloaded model directory: + - model.safetensors -> tensor names + shapes + dtypes + - model.rvf.jsonl -> line count, first three lines, distinct top-level keys + - presence-head.json -> shallow dump (depth <= 3) + - config.json -> full dump + - training-metrics.json -> final loss / quantization / lora numbers + +If torch is importable, builds a synthetic input matching the inferred encoder +input dim and runs encoder.w1 (the first linear layer) to confirm the weights +yield finite outputs (no NaN / Inf). + +Exits 0 on success, non-zero with a clear error on any failure. + +Usage: + python scripts/verify-hf-model.py + python scripts/verify-hf-model.py --local-dir models/wifi-densepose-pretrained/ +""" + +from __future__ import annotations + +import argparse +import json +import struct +import sys +from pathlib import Path +from typing import Any + +DEFAULT_LOCAL_DIR = Path("models/wifi-densepose-pretrained/") + +# safetensors -> torch dtype lookup. Subset is enough for this bundle. +_SAFETENSORS_DTYPE_NAMES = { + "F64", "F32", "F16", "BF16", + "I64", "I32", "I16", "I8", "U8", "BOOL", +} + + +# --------------------------------------------------------------------------- # +# safetensors loading +# --------------------------------------------------------------------------- # +def _load_safetensors(path: Path): + """Load a .safetensors file as a dict[name -> torch.Tensor]. + + Tries the upstream `safetensors.torch.load_file` first. The published HF + bundle has a non-fatal header bug (declared header length includes 3 + trailing NUL bytes after the JSON object), which the strict Rust parser + rejects with `trailing characters at line 1 column 1462`. When that + happens we fall back to a small pure-Python loader that strips the + padding and rebuilds tensors from the body. + """ + try: + from safetensors.torch import load_file # type: ignore + + return load_file(str(path)), "safetensors.torch.load_file" + except Exception as exc: # noqa: BLE001 - we want any failure here + msg = str(exc) + if "trailing characters" not in msg and "invalid JSON" not in msg: + raise + # Fall through to the manual loader below. + first_err = f"{type(exc).__name__}: {exc}" + + import torch # local import so the fallback message is precise + + dtype_map = { + "F64": torch.float64, + "F32": torch.float32, + "F16": torch.float16, + "BF16": torch.bfloat16, + "I64": torch.int64, + "I32": torch.int32, + "I16": torch.int16, + "I8": torch.int8, + "U8": torch.uint8, + "BOOL": torch.bool, + } + raw = path.read_bytes() + if len(raw) < 8: + raise ValueError(f"{path}: file too short to be a safetensors blob") + header_len = struct.unpack(" len(raw): + raise ValueError( + f"{path}: header length {header_len} inconsistent with file size {len(raw)}" + ) + header_bytes = raw[8 : 8 + header_len] + # Strip the published-bundle trailing padding (NULs / whitespace) before parsing. + header_text = header_bytes.rstrip(b"\x00 \t\r\n").decode("utf-8") + header = json.loads(header_text) + body = raw[8 + header_len :] + + state: dict[str, Any] = {} + for name, info in header.items(): + if name == "__metadata__": + continue + dtype_name = info["dtype"] + if dtype_name not in dtype_map: + raise ValueError(f"{name}: unsupported safetensors dtype {dtype_name!r}") + shape = list(info["shape"]) + start, end = info["data_offsets"] + if start < 0 or end > len(body) or start > end: + raise ValueError( + f"{name}: bad offsets [{start}, {end}] for body of size {len(body)}" + ) + tensor = torch.frombuffer( + bytearray(body[start:end]), dtype=dtype_map[dtype_name] + ).reshape(shape) + state[name] = tensor.clone() # detach from the bytearray buffer + return state, ( + "manual fallback (published bundle has trailing NULs in header; " + f"first error was: {first_err})" + ) + + +# --------------------------------------------------------------------------- # +# JSONL helpers +# --------------------------------------------------------------------------- # +def _truncate(s: str, n: int) -> str: + return s if len(s) <= n else s[:n] + "..." + + +def _inspect_jsonl(path: Path) -> tuple[int, list[str], list[str]]: + """Return (line_count, first_three_truncated, sorted_distinct_top_keys).""" + lines: list[str] = [] + keys: set[str] = set() + total = 0 + with path.open("r", encoding="utf-8") as fh: + for idx, raw_line in enumerate(fh): + line = raw_line.rstrip("\n") + if not line.strip(): + continue + total += 1 + if idx < 3: + lines.append(_truncate(line, 200)) + try: + obj = json.loads(line) + except json.JSONDecodeError as exc: + raise ValueError(f"{path}: line {idx + 1} is not valid JSON: {exc}") from exc + if isinstance(obj, dict): + keys.update(obj.keys()) + return total, lines, sorted(keys) + + +# --------------------------------------------------------------------------- # +# Pretty printers +# --------------------------------------------------------------------------- # +def _dump_shallow(obj: Any, depth: int = 0, max_depth: int = 3) -> str: + """Render `obj` as JSON, but collapse anything below max_depth to its type.""" + if depth >= max_depth: + if isinstance(obj, dict): + return f"" + if isinstance(obj, list): + return f"" + return repr(obj) + if isinstance(obj, dict): + body = ", ".join( + f'"{k}": {_dump_shallow(v, depth + 1, max_depth)}' for k, v in obj.items() + ) + return "{" + body + "}" + if isinstance(obj, list): + if len(obj) > 8: + sample = ", ".join(_dump_shallow(v, depth + 1, max_depth) for v in obj[:8]) + return f"[{sample}, ... (+{len(obj) - 8} more)]" + return "[" + ", ".join(_dump_shallow(v, depth + 1, max_depth) for v in obj) + "]" + return json.dumps(obj) + + +def _section(title: str) -> None: + print() + print("=" * 78) + print(title) + print("=" * 78) + + +# --------------------------------------------------------------------------- # +# Verification steps +# --------------------------------------------------------------------------- # +def _verify_safetensors(path: Path) -> tuple[dict, str, dict[str, tuple]]: + state, loader_note = _load_safetensors(path) + info: dict[str, tuple] = {} + print(f"loader: {loader_note}") + print(f"tensor count: {len(state)}") + print(f"{'name':<30} {'shape':<22} dtype") + print("-" * 78) + for name, tensor in state.items(): + shape = tuple(tensor.shape) + dtype = str(tensor.dtype) + info[name] = (shape, dtype) + print(f"{name:<30} {str(shape):<22} {dtype}") + return state, loader_note, info + + +def _verify_jsonl(path: Path) -> None: + total, sample, keys = _inspect_jsonl(path) + print(f"line count: {total}") + print(f"distinct top-level keys observed: {keys}") + print("first 3 lines (truncated to 200 chars):") + for idx, line in enumerate(sample, start=1): + print(f" [{idx}] {line}") + + +def _verify_presence_head(path: Path) -> None: + obj = json.loads(path.read_text(encoding="utf-8")) + print(_dump_shallow(obj, max_depth=3)) + + +def _verify_config(path: Path) -> dict: + obj = json.loads(path.read_text(encoding="utf-8")) + print(json.dumps(obj, indent=2, sort_keys=True)) + return obj + + +def _verify_training_metrics(path: Path) -> None: + obj = json.loads(path.read_text(encoding="utf-8")) + # Final metrics live in a few specific places. + contrastive = obj.get("contrastive", {}) + task_heads = obj.get("taskHeads", {}) + lora = obj.get("lora", {}) + quant = obj.get("quantization", {}) + print(f"timestamp: {obj.get('timestamp')}") + print(f"total duration (ms): {obj.get('totalDurationMs')}") + print(f"contrastive triplets / final loss: " + f"{contrastive.get('triplets')} / {contrastive.get('finalLoss')}") + print(f"task heads samples / final loss: " + f"{task_heads.get('samples')} / {task_heads.get('finalLoss')}") + print(f"lora adapters / total params: " + f"{lora.get('adapters')} / {lora.get('totalParameters')}") + if quant: + print("quantization:") + for variant, stats in quant.items(): + print(f" {variant}: {stats}") + + +def _verify_first_linear( + state: dict, config: dict, tensor_info: dict[str, tuple] +) -> None: + try: + import torch # noqa: F401 (used below) + except ImportError: + print("torch not importable - skipping forward-pass smoke test") + return + + import torch + + custom = (config or {}).get("custom", {}) + input_dim = int(custom.get("inputDim", 8)) + hidden_dim = int(custom.get("hiddenDim", 64)) + + w1 = state.get("encoder.w1") + b1 = state.get("encoder.b1") + if w1 is None or b1 is None: + raise RuntimeError("encoder.w1 / encoder.b1 missing from safetensors") + + # The published encoder stores the first linear weight flat (input_dim * hidden_dim). + if w1.numel() != input_dim * hidden_dim: + raise RuntimeError( + f"encoder.w1 numel={w1.numel()} does not match " + f"inputDim*hiddenDim={input_dim * hidden_dim}" + ) + if b1.numel() != hidden_dim: + raise RuntimeError( + f"encoder.b1 numel={b1.numel()} does not match hiddenDim={hidden_dim}" + ) + + weight = w1.reshape(input_dim, hidden_dim).to(torch.float32) + bias = b1.to(torch.float32) + + torch.manual_seed(42) + batch = 4 + x = torch.randn(batch, input_dim, dtype=torch.float32) + y = x @ weight + bias + + if not torch.isfinite(y).all(): + bad = (~torch.isfinite(y)).sum().item() + raise RuntimeError(f"first linear layer produced {bad} non-finite values") + print( + f"first linear OK: input={tuple(x.shape)} weight={tuple(weight.shape)} " + f"bias={tuple(bias.shape)} output={tuple(y.shape)} " + f"mean={y.mean().item():+.4f} std={y.std().item():.4f}" + ) + + +# --------------------------------------------------------------------------- # +# Entry point +# --------------------------------------------------------------------------- # +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--local-dir", + type=Path, + default=DEFAULT_LOCAL_DIR, + help="Directory containing the downloaded HF model bundle " + f"(default: {DEFAULT_LOCAL_DIR})", + ) + args = parser.parse_args(argv) + + root: Path = args.local_dir + if not root.is_dir(): + print(f"ERROR: --local-dir does not exist or is not a directory: {root}", + file=sys.stderr) + return 2 + + safetensors_path = root / "model.safetensors" + rvf_path = root / "model.rvf.jsonl" + presence_path = root / "presence-head.json" + config_path = root / "config.json" + metrics_path = root / "training-metrics.json" + + for p in (safetensors_path, rvf_path, presence_path, config_path, metrics_path): + if not p.is_file(): + print(f"ERROR: required file missing: {p}", file=sys.stderr) + return 2 + + print(f"Verifying HF bundle at: {root}") + + try: + _section("model.safetensors") + state, _loader_note, tensor_info = _verify_safetensors(safetensors_path) + + _section("model.rvf.jsonl") + _verify_jsonl(rvf_path) + + _section("presence-head.json (depth <= 3)") + _verify_presence_head(presence_path) + + _section("config.json") + config = _verify_config(config_path) + + _section("training-metrics.json (final metrics)") + _verify_training_metrics(metrics_path) + + _section("encoder.w1 forward-pass smoke test") + _verify_first_linear(state, config, tensor_info) + except Exception as exc: # noqa: BLE001 - surface anything as a clear failure + print(f"\nFAIL: {type(exc).__name__}: {exc}", file=sys.stderr) + return 1 + + _section("OK - all checks passed") + return 0 + + +if __name__ == "__main__": + sys.exit(main())