diff --git a/aether-arena/calibration/test_calibration.py b/aether-arena/calibration/test_calibration.py new file mode 100644 index 00000000..804307ca --- /dev/null +++ b/aether-arena/calibration/test_calibration.py @@ -0,0 +1,103 @@ +"""Self-contained regression test for the RuView calibration service. + +Exercises the committed CLI end-to-end on synthetic data (CPU, no GPU, no real checkpoint): + build a base -> calibrate.py fits an adapter -> infer.py runs base+adapter -> assert the + adapter is small, inference is shape-correct and finite, and the adapter actually changes output. + +Run: python test_calibration.py (or via pytest) +""" +import json +import subprocess +import sys +import tempfile +from pathlib import Path + +import numpy as np +import torch + +HERE = Path(__file__).parent +sys.path.insert(0, str(HERE)) +from model import PoseNet, standardize # noqa: E402 + + +def _make_base(path: Path): + torch.manual_seed(0) + net = PoseNet() + # Save without the deterministic gr.A buffer (mirrors the published checkpoint; + # calibrate.py/infer.py load with strict=False). + sd = {k: v for k, v in net.state_dict().items() if k != "gr.A"} + torch.save(sd, path) + + +def _make_data(path: Path, n: int, seed: int): + rng = np.random.default_rng(seed) + X = rng.standard_normal((n, 3, 114, 10)).astype(np.float32) + Y = rng.random((n, 17, 2)).astype(np.float32) # keypoints in [0,1] + np.savez(path, X=X, Y=Y) + + +def _run(*args): + r = subprocess.run( + [sys.executable, str(HERE / args[0]), *map(str, args[1:])], + capture_output=True, text=True, + ) + assert r.returncode == 0, f"{args[0]} failed:\n{r.stdout}\n{r.stderr}" + return r.stdout + + +def test_calibration_end_to_end(): + with tempfile.TemporaryDirectory() as d: + d = Path(d) + base = d / "base.pt" + calib = d / "calib.npz" + frames = d / "frames.npz" + adapter = d / "room.adapter.npz" + kp = d / "kp.npy" + + _make_base(base) + _make_data(calib, n=40, seed=1) # ≥20 → no underfit warning + _make_data(frames, n=16, seed=2) + + # 1) calibrate -> adapter + out = _run("calibrate.py", "--base", base, "--data", calib, "--out", adapter, + "--iters", "50", "--device", "cpu") + assert adapter.exists(), "adapter not written" + assert "saved" in out.lower() + sz = adapter.stat().st_size + assert sz < 200_000, f"adapter unexpectedly large ({sz} bytes)" + + # adapter contains the expected LoRA tensors (materialize + close so the + # Windows tempdir can be cleaned up — np.load keeps a lazy file handle). + with np.load(adapter) as z: + keys = [k for k in z.files if k.endswith(".A") or k.endswith(".B")] + assert keys, f"adapter has no LoRA tensors: {z.files}" + lora = {k: z[k].astype(np.float32) for k in keys} + + # 2) infer with adapter -> keypoints + _run("infer.py", "--base", base, "--adapter", adapter, "--data", frames, + "--out", kp, "--device", "cpu") + out_kp = np.load(kp) + assert out_kp.shape == (16, 17, 2), f"bad keypoint shape {out_kp.shape}" + assert np.isfinite(out_kp).all(), "non-finite keypoints" + assert (out_kp >= 0).all() and (out_kp <= 1).all(), "keypoints out of [0,1]" + + # 3) adapter must actually change the output vs the zero-shot base + with np.load(frames) as fz: + frames_x = fz["X"][:] + net = PoseNet() + net.load_state_dict(torch.load(base, map_location="cpu"), strict=False) + net.eval() + x = standardize(torch.tensor(frames_x)) + with torch.no_grad(): + base_kp = net(x).reshape(16, 17, 2).numpy() + net.add_lora() + net.load_lora(lora) + net.eval() + with torch.no_grad(): + cal_kp = net(x).reshape(16, 17, 2).numpy() + assert np.abs(base_kp - cal_kp).sum() > 1e-4, "adapter did not change output" + + +if __name__ == "__main__": + test_calibration_end_to_end() + print("PASS: calibration service end-to-end (calibrate -> adapter -> infer)")