From 5354726d156fbb99b25f9cc837a01a3a6e35fc08 Mon Sep 17 00:00:00 2001 From: lockewerks <59770696+lockewerks@users.noreply.github.com> Date: Mon, 25 May 2026 17:03:28 -0600 Subject: [PATCH] feat(scripts): add fix-safetensors-header.py to repair NUL-padded headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SafeTensorsWriter in vendor/ruvector/.../export.js zero-initialises its output buffer and then copies the JSON header in without overwriting the padding zone, so the bytes between the JSON's last '}' and the declared 8-byte-aligned header length are left as 0x00 instead of the spec-required 0x20 (space). Strict readers — the Rust safetensors crate, Candle, and the safetensors.torch.load_file Python helper that wraps the Rust binding — reject the file with 'trailing characters at line 1 column N+1'. This is why model.safetensors at huggingface.co/ruvnet/wifi-densepose-pretrained currently fails to load anywhere outside our hand-rolled JS / Python parsers (both of which strip trailing NULs before json.loads). The utility opens a .safetensors file, locates the header zone, detects NUL padding, and rewrites just the padding bytes with 0x20. Declared header length, JSON content, and every tensor byte are preserved — only the padding bytes flip from NUL to space, so the SHA-256 of the tensor data is unchanged. Idempotent (a clean file reports 'already clean' and exits 0 without rewriting), supports --dry-run, accepts multiple paths. --- scripts/fix-safetensors-header.py | 192 ++++++++++++++++++++++++++++++ 1 file changed, 192 insertions(+) create mode 100644 scripts/fix-safetensors-header.py diff --git a/scripts/fix-safetensors-header.py b/scripts/fix-safetensors-header.py new file mode 100644 index 00000000..9ded0e4a --- /dev/null +++ b/scripts/fix-safetensors-header.py @@ -0,0 +1,192 @@ +#!/usr/bin/env python3 +"""Fix safetensors files whose header zone is padded with NUL bytes. + +The safetensors spec (https://github.com/huggingface/safetensors#format) +requires the header zone — the N bytes following the 8-byte u64 length +prefix — to be either valid JSON or JSON followed by ASCII space (0x20) +padding. Some writers (notably the JS SafeTensorsWriter in +vendor/ruvector/.../export.js) emit NUL (0x00) padding instead, which +strict readers (Rust safetensors crate, Candle, safetensors.torch.load_file) +reject with `SafetensorError: trailing characters at line 1 column N+1`. + +This utility opens a .safetensors file, detects NUL padding in the header +zone, and rewrites the padding bytes in-place as ASCII spaces. The +declared header length, the JSON content, and every tensor byte are +preserved unchanged — only the padding bytes flip. + +See docs/huggingface/SAFETENSORS-HEADER-BUG.md for the full bug analysis. + +Usage: + python scripts/fix-safetensors-header.py path/to/model.safetensors + python scripts/fix-safetensors-header.py path/to/model.safetensors --dry-run + python scripts/fix-safetensors-header.py models/*.safetensors + +Exits: + 0 — file already clean, OR file was patched successfully + 1 — file is not a valid safetensors layout / could not be opened + 2 — bad CLI arguments +""" + +from __future__ import annotations + +import argparse +import struct +import sys +from pathlib import Path + + +# Bytes that the spec accepts as valid header content. JSON is permitted +# to contain any printable ASCII; pad bytes are restricted to 0x20. +PAD_GOOD = 0x20 # ASCII space — the spec-required padding byte +PAD_BAD = 0x00 # NUL — what the buggy JS writer emits + + +def inspect_header(path: Path) -> tuple[int, int, bytes]: + """Return (declared_header_len, json_end_offset_in_header, padding_bytes). + + `json_end_offset_in_header` is the offset within the header zone (i.e. + relative to byte 8 of the file) where the JSON document ends. Anything + after that and before `declared_header_len` is padding. + + Raises ValueError if the file is too short or the header is not JSON. + """ + with path.open("rb") as f: + prefix = f.read(8) + if len(prefix) < 8: + raise ValueError(f"{path}: file shorter than 8 bytes") + (declared,) = struct.unpack(" 100 * 1024 * 1024: + raise ValueError( + f"{path}: declared header length {declared} is implausible" + ) + header = f.read(declared) + if len(header) < declared: + raise ValueError( + f"{path}: file truncated — declared header len {declared}, " + f"only {len(header)} bytes available" + ) + + # Find where the JSON document ends. The spec mandates the header start + # with `{`, so we scan from the right for the matching `}` then check that + # everything after is padding-class bytes. + if not header or header[0] != ord("{"): + raise ValueError( + f"{path}: header does not start with '{{' (byte 0x{header[0]:02x}) — " + "not a safetensors file" + ) + + # Walk from the end, skipping known padding-class bytes (NUL or space). + json_end = len(header) + while json_end > 0 and header[json_end - 1] in (PAD_GOOD, PAD_BAD): + json_end -= 1 + if json_end == 0 or header[json_end - 1] != ord("}"): + raise ValueError( + f"{path}: could not locate end of JSON header (last non-pad byte " + f"is 0x{header[json_end - 1]:02x} at offset {json_end - 1})" + ) + + padding = header[json_end:] + return declared, json_end, padding + + +def classify(padding: bytes) -> str: + """Return one of 'empty', 'spaces', 'nuls', 'mixed'.""" + if not padding: + return "empty" + has_nul = any(b == PAD_BAD for b in padding) + has_space = any(b == PAD_GOOD for b in padding) + if has_nul and not has_space: + return "nuls" + if has_space and not has_nul: + return "spaces" + return "mixed" + + +def fix_file(path: Path, dry_run: bool = False) -> bool: + """Rewrite the header padding zone of `path` to use 0x20. + + Returns True if the file was modified (or would be, in dry-run mode); + False if it was already clean. Raises ValueError on malformed input. + """ + declared, json_end, padding = inspect_header(path) + cls = classify(padding) + + if cls in ("empty", "spaces"): + print(f" [ok] {path} ({len(padding)} pad bytes already clean)") + return False + + new_padding = bytes([PAD_GOOD] * len(padding)) + print( + f" [{'would patch' if dry_run else 'patched '}] {path} " + f"({len(padding)} {cls} pad bytes -> spaces, declared header " + f"length {declared} unchanged)" + ) + + if dry_run: + return True + + # Open in r+b and overwrite only the padding bytes. This preserves + # every other byte (declared length, JSON header, tensor payload) and + # the file's overall size. + with path.open("r+b") as f: + f.seek(8 + json_end) + f.write(new_padding) + f.flush() + + # Re-inspect to confirm the rewrite landed. + _, _, after = inspect_header(path) + if classify(after) not in ("empty", "spaces"): + raise RuntimeError( + f"{path}: post-write inspection shows padding is still '{classify(after)}'" + ) + return True + + +def main() -> int: + parser = argparse.ArgumentParser( + description=__doc__.split("\n\n")[0], + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog="See docs/huggingface/SAFETENSORS-HEADER-BUG.md for full context.", + ) + parser.add_argument( + "paths", + nargs="+", + type=Path, + help="One or more .safetensors files to inspect / fix", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Report what would change without rewriting any bytes", + ) + args = parser.parse_args() + + any_error = False + any_changed = False + print(f"Inspecting {len(args.paths)} file(s){' (dry-run)' if args.dry_run else ''}...") + for path in args.paths: + if not path.exists(): + print(f" [error] {path} (does not exist)", file=sys.stderr) + any_error = True + continue + try: + changed = fix_file(path, dry_run=args.dry_run) + except ValueError as exc: + print(f" [error] {exc}", file=sys.stderr) + any_error = True + continue + except OSError as exc: + print(f" [error] {path} ({exc})", file=sys.stderr) + any_error = True + continue + any_changed = any_changed or changed + + if any_error: + return 1 + if args.dry_run and any_changed: + print("\nDry-run finished. Re-run without --dry-run to apply the fix.") + return 0 + + +if __name__ == "__main__": + sys.exit(main())