feat(scripts): add fix-safetensors-header.py to repair NUL-padded headers

The SafeTensorsWriter in vendor/ruvector/.../export.js zero-initialises its output buffer and then copies the JSON header in without overwriting the padding zone, so the bytes between the JSON's last '}' and the declared 8-byte-aligned header length are left as 0x00 instead of the spec-required 0x20 (space). Strict readers — the Rust safetensors crate, Candle, and the safetensors.torch.load_file Python helper that wraps the Rust binding — reject the file with 'trailing characters at line 1 column N+1'. This is why model.safetensors at huggingface.co/ruvnet/wifi-densepose-pretrained currently fails to load anywhere outside our hand-rolled JS / Python parsers (both of which strip trailing NULs before json.loads). The utility opens a .safetensors file, locates the header zone, detects NUL padding, and rewrites just the padding bytes with 0x20. Declared header length, JSON content, and every tensor byte are preserved — only the padding bytes flip from NUL to space, so the SHA-256 of the tensor data is unchanged. Idempotent (a clean file reports 'already clean' and exits 0 without rewriting), supports --dry-run, accepts multiple paths.
2026-05-25 17:03:28 -06:00 · 2026-05-25 17:03:28 -06:00 · 5354726d15
parent be4efecbcd
commit 5354726d15
1 changed files with 192 additions and 0 deletions
--- a/scripts/fix-safetensors-header.py
+++ b/scripts/fix-safetensors-header.py
@ -0,0 +1,192 @@
+#!/usr/bin/env python3
+"""Fix safetensors files whose header zone is padded with NUL bytes.
+
+The safetensors spec (https://github.com/huggingface/safetensors#format)
+requires the header zone — the N bytes following the 8-byte u64 length
+prefix — to be either valid JSON or JSON followed by ASCII space (0x20)
+padding. Some writers (notably the JS SafeTensorsWriter in
+vendor/ruvector/.../export.js) emit NUL (0x00) padding instead, which
+strict readers (Rust safetensors crate, Candle, safetensors.torch.load_file)
+reject with `SafetensorError: trailing characters at line 1 column N+1`.
+
+This utility opens a .safetensors file, detects NUL padding in the header
+zone, and rewrites the padding bytes in-place as ASCII spaces. The
+declared header length, the JSON content, and every tensor byte are
+preserved unchanged — only the padding bytes flip.
+
+See docs/huggingface/SAFETENSORS-HEADER-BUG.md for the full bug analysis.
+
+Usage:
+    python scripts/fix-safetensors-header.py path/to/model.safetensors
+    python scripts/fix-safetensors-header.py path/to/model.safetensors --dry-run
+    python scripts/fix-safetensors-header.py models/*.safetensors
+
+Exits:
+    0  — file already clean, OR file was patched successfully
+    1  — file is not a valid safetensors layout / could not be opened
+    2  — bad CLI arguments
+"""
+
+from __future__ import annotations
+
+import argparse
+import struct
+import sys
+from pathlib import Path
+
+
+# The two byte values this tool treats as header padding. Per the module
+# docstring, the spec allows only ASCII space (0x20) after the JSON document;
+# NUL (0x00) is the padding that strict readers reject and that we rewrite.
+PAD_GOOD = 0x20  # ASCII space — the spec-required padding byte
+PAD_BAD = 0x00   # NUL — what the buggy JS writer emits
+
+
+def inspect_header(path: Path) -> tuple[int, int, bytes]:
+    """Return (declared_header_len, json_end_offset_in_header, padding_bytes).
+
+    `json_end_offset_in_header` is the offset within the header zone (i.e.
+    relative to byte 8 of the file) where the JSON document ends. Anything
+    after that and before `declared_header_len` is padding.
+
+    Raises ValueError if the file is too short or the header is not JSON.
+    """
+    with path.open("rb") as f:
+        prefix = f.read(8)
+        if len(prefix) < 8:
+            raise ValueError(f"{path}: file shorter than 8 bytes")
+        (declared,) = struct.unpack("<Q", prefix)
+        if declared <= 0 or declared > 100 * 1024 * 1024:
+            raise ValueError(
+                f"{path}: declared header length {declared} is implausible"
+            )
+        header = f.read(declared)
+        if len(header) < declared:
+            raise ValueError(
+                f"{path}: file truncated — declared header len {declared}, "
+                f"only {len(header)} bytes available"
+            )
+
+    # Find where the JSON document ends. The spec mandates the header start
+    # with `{`, so we scan from the right for the matching `}` then check that
+    # everything after is padding-class bytes.
+    if not header or header[0] != ord("{"):
+        raise ValueError(
+            f"{path}: header does not start with '{{' (byte 0x{header[0]:02x}) — "
+            "not a safetensors file"
+        )
+
+    # Walk from the end, skipping known padding-class bytes (NUL or space).
+    json_end = len(header)
+    while json_end > 0 and header[json_end - 1] in (PAD_GOOD, PAD_BAD):
+        json_end -= 1
+    if json_end == 0 or header[json_end - 1] != ord("}"):
+        raise ValueError(
+            f"{path}: could not locate end of JSON header (last non-pad byte "
+            f"is 0x{header[json_end - 1]:02x} at offset {json_end - 1})"
+        )
+
+    padding = header[json_end:]
+    return declared, json_end, padding
+
+
+def classify(padding: bytes) -> str:
+    """Return one of 'empty', 'spaces', 'nuls', 'mixed'."""
+    if not padding:
+        return "empty"
+    has_nul = any(b == PAD_BAD for b in padding)
+    has_space = any(b == PAD_GOOD for b in padding)
+    if has_nul and not has_space:
+        return "nuls"
+    if has_space and not has_nul:
+        return "spaces"
+    return "mixed"
+
+
+def fix_file(path: Path, dry_run: bool = False) -> bool:
+    """Rewrite the header padding zone of `path` to use 0x20.
+
+    Returns True if the file was modified (or would be, in dry-run mode);
+    False if it was already clean. Raises ValueError on malformed input.
+    """
+    declared, json_end, padding = inspect_header(path)
+    cls = classify(padding)
+
+    if cls in ("empty", "spaces"):
+        print(f"  [ok]      {path}  ({len(padding)} pad bytes already clean)")
+        return False
+
+    new_padding = bytes([PAD_GOOD] * len(padding))
+    print(
+        f"  [{'would patch' if dry_run else 'patched   '}] {path}  "
+        f"({len(padding)} {cls} pad bytes -> spaces, declared header "
+        f"length {declared} unchanged)"
+    )
+
+    if dry_run:
+        return True
+
+    # Open in r+b and overwrite only the padding bytes. This preserves
+    # every other byte (declared length, JSON header, tensor payload) and
+    # the file's overall size.
+    with path.open("r+b") as f:
+        f.seek(8 + json_end)
+        f.write(new_padding)
+        f.flush()
+
+    # Re-inspect to confirm the rewrite landed.
+    _, _, after = inspect_header(path)
+    if classify(after) not in ("empty", "spaces"):
+        raise RuntimeError(
+            f"{path}: post-write inspection shows padding is still '{classify(after)}'"
+        )
+    return True
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description=__doc__.split("\n\n")[0],
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="See docs/huggingface/SAFETENSORS-HEADER-BUG.md for full context.",
+    )
+    parser.add_argument(
+        "paths",
+        nargs="+",
+        type=Path,
+        help="One or more .safetensors files to inspect / fix",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Report what would change without rewriting any bytes",
+    )
+    args = parser.parse_args()
+
+    any_error = False
+    any_changed = False
+    print(f"Inspecting {len(args.paths)} file(s){' (dry-run)' if args.dry_run else ''}...")
+    for path in args.paths:
+        if not path.exists():
+            print(f"  [error]   {path}  (does not exist)", file=sys.stderr)
+            any_error = True
+            continue
+        try:
+            changed = fix_file(path, dry_run=args.dry_run)
+        except ValueError as exc:
+            print(f"  [error]   {exc}", file=sys.stderr)
+            any_error = True
+            continue
+        except OSError as exc:
+            print(f"  [error]   {path}  ({exc})", file=sys.stderr)
+            any_error = True
+            continue
+        any_changed = any_changed or changed
+
+    if any_error:
+        return 1
+    if args.dry_run and any_changed:
+        print("\nDry-run finished. Re-run without --dry-run to apply the fix.")
+    return 0
+
+
+# Script entry point: run main() and use its return value as the exit status.
+if __name__ == "__main__":
+    sys.exit(main())