test(calibration): self-contained end-to-end regression test

The committed calibration service (model.py/calibrate.py/infer.py) had no automated test — only ad-hoc verification. Adds a CPU-only, no-real-checkpoint test that exercises the CLI end-to-end on synthetic data: build base -> calibrate.py fits adapter -> infer.py runs base+adapter, asserting adapter size (<200KB), keypoint shape [N,17,2], finiteness, [0,1] range, and that the adapter actually changes the output. Passes on Windows CPU (torch 2.11). Co-Authored-By: claude-flow <ruv@ruv.net>
2026-05-31 05:02:24 -04:00 · 2026-05-31 05:02:24 -04:00 · 76cc57294d
parent 1b48b6f5c8
commit 76cc57294d
1 changed files with 103 additions and 0 deletions
--- a/aether-arena/calibration/test_calibration.py
+++ b/aether-arena/calibration/test_calibration.py
@ -0,0 +1,103 @@
+"""Self-contained regression test for the RuView calibration service.
+
+Exercises the committed CLI end-to-end on synthetic data (CPU, no GPU, no real checkpoint):
+  build a base -> calibrate.py fits an adapter -> infer.py runs base+adapter -> assert the
+  adapter is small, inference is shape-correct and finite, and the adapter actually changes output.
+
+Run:  python test_calibration.py    (or via pytest)
+"""
+import json
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+
+import numpy as np
+import torch
+
+HERE = Path(__file__).parent
+sys.path.insert(0, str(HERE))
+from model import PoseNet, standardize  # noqa: E402
+
+
+def _make_base(path: Path):
+    torch.manual_seed(0)
+    net = PoseNet()
+    # Save without the deterministic gr.A buffer (mirrors the published checkpoint;
+    # calibrate.py/infer.py load with strict=False).
+    sd = {k: v for k, v in net.state_dict().items() if k != "gr.A"}
+    torch.save(sd, path)
+
+
+def _make_data(path: Path, n: int, seed: int):
+    rng = np.random.default_rng(seed)
+    X = rng.standard_normal((n, 3, 114, 10)).astype(np.float32)
+    Y = rng.random((n, 17, 2)).astype(np.float32)  # keypoints in [0,1]
+    np.savez(path, X=X, Y=Y)
+
+
+def _run(*args):
+    r = subprocess.run(
+        [sys.executable, str(HERE / args[0]), *map(str, args[1:])],
+        capture_output=True, text=True,
+    )
+    assert r.returncode == 0, f"{args[0]} failed:\n{r.stdout}\n{r.stderr}"
+    return r.stdout
+
+
+def test_calibration_end_to_end():
+    with tempfile.TemporaryDirectory() as d:
+        d = Path(d)
+        base = d / "base.pt"
+        calib = d / "calib.npz"
+        frames = d / "frames.npz"
+        adapter = d / "room.adapter.npz"
+        kp = d / "kp.npy"
+
+        _make_base(base)
+        _make_data(calib, n=40, seed=1)     # ≥20 → no underfit warning
+        _make_data(frames, n=16, seed=2)
+
+        # 1) calibrate -> adapter
+        out = _run("calibrate.py", "--base", base, "--data", calib, "--out", adapter,
+                   "--iters", "50", "--device", "cpu")
+        assert adapter.exists(), "adapter not written"
+        assert "saved" in out.lower()
+        sz = adapter.stat().st_size
+        assert sz < 200_000, f"adapter unexpectedly large ({sz} bytes)"
+
+        # adapter contains the expected LoRA tensors (materialize + close so the
+        # Windows tempdir can be cleaned up — np.load keeps a lazy file handle).
+        with np.load(adapter) as z:
+            keys = [k for k in z.files if k.endswith(".A") or k.endswith(".B")]
+            assert keys, f"adapter has no LoRA tensors: {z.files}"
+            lora = {k: z[k].astype(np.float32) for k in keys}
+
+        # 2) infer with adapter -> keypoints
+        _run("infer.py", "--base", base, "--adapter", adapter, "--data", frames,
+             "--out", kp, "--device", "cpu")
+        out_kp = np.load(kp)
+        assert out_kp.shape == (16, 17, 2), f"bad keypoint shape {out_kp.shape}"
+        assert np.isfinite(out_kp).all(), "non-finite keypoints"
+        assert (out_kp >= 0).all() and (out_kp <= 1).all(), "keypoints out of [0,1]"
+
+        # 3) adapter must actually change the output vs the zero-shot base
+        with np.load(frames) as fz:
+            frames_x = fz["X"][:]
+        net = PoseNet()
+        net.load_state_dict(torch.load(base, map_location="cpu"), strict=False)
+        net.eval()
+        x = standardize(torch.tensor(frames_x))
+        with torch.no_grad():
+            base_kp = net(x).reshape(16, 17, 2).numpy()
+        net.add_lora()
+        net.load_lora(lora)
+        net.eval()
+        with torch.no_grad():
+            cal_kp = net(x).reshape(16, 17, 2).numpy()
+        assert np.abs(base_kp - cal_kp).sum() > 1e-4, "adapter did not change output"
+
+
+if __name__ == "__main__":
+    test_calibration_end_to_end()
+    print("PASS: calibration service end-to-end (calibrate -> adapter -> infer)")