wifi-densepose/harness/ruview/test/tools.test.mjs

257 lines
12 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// SPDX-License-Identifier: MIT
// RuView harness tests — Node's built-in test runner (no devDeps to install).
// Run: `node --test test/*.test.mjs` (or `npm test`).
import { test } from 'node:test';
import assert from 'node:assert/strict';
import { readdirSync, readFileSync, mkdtempSync, writeFileSync, rmSync } from 'node:fs';
import { join, dirname, delimiter } from 'node:path';
import { tmpdir } from 'node:os';
import { fileURLToPath } from 'node:url';
import { claimCheck, summarize } from '../src/guardrails.js';
import { TOOLS, TOOL_ALIASES, runTool, listTools, findRepoRoot, run, which } from '../src/tools.js';
import { run as cliRun } from '../bin/cli.js';
const PKG_ROOT = dirname(dirname(fileURLToPath(import.meta.url)));
test('guardrail flags the retracted 100% framing as high severity', () => {
const r = claimCheck('Our model reaches 100% accuracy on every pose.');
assert.equal(r.ok, false);
assert.ok(r.findings.some((f) => f.severity === 'high'));
});
test('guardrail flags an untagged percentage accuracy claim', () => {
// "hit", not "measured" — "measured" would (correctly) route to the no-reproducer branch.
const r = claimCheck('We hit 92.9% PCK on the test set.');
assert.equal(r.ok, false);
assert.ok(r.findings.some((f) => /not tagged/i.test(f.reason)));
});
test('guardrail passes a MEASURED claim that cites a reproducer', () => {
const r = claimCheck('Held-out PCK@20 59.5% vs 50% mean-pose baseline = +9.4pp (MEASURED, verify.py).');
assert.equal(r.ok, true, JSON.stringify(r.findings));
});
test('guardrail flags MEASURED with no reproducer', () => {
const r = claimCheck('Presence detection 97% (MEASURED).');
assert.equal(r.ok, false);
assert.ok(r.findings.some((f) => /no reproducer/i.test(f.reason)));
});
test('guardrail ignores non-metric prose', () => {
assert.equal(claimCheck('The ESP32 streams CSI over UDP to the sensing-server.').ok, true);
assert.equal(claimCheck('').ok, true);
});
// ADR-263 F11/O9: precision pins — short metric tokens must not fire on prose.
test('guardrail does not false-positive on "map"/"F1" prose (ADR-263 F11)', () => {
assert.equal(claimCheck('F-numbers map to findings.').ok, true);
assert.equal(claimCheck('### F1 (HIGH, broken export): `require` points at a missing file').ok, true);
assert.equal(claimCheck('The 0.1.0 tarball ships 44 `.map` files = 62,698 B of dead weight.').ok, true);
assert.equal(claimCheck('the source maps can never resolve').ok, true);
assert.equal(claimCheck('- **O1 (F1):** fix `exports` (see F2 for the 33% map weight — MEASURED, tarball listing)').ok, true);
assert.equal(claimCheck('ADR-264: exports fix, map-free tarball, session-per-transport').ok, true);
});
test('guardrail still catches real short-token metric claims', () => {
assert.equal(claimCheck('We reach mAP 62.3 on COCO.').ok, false);
assert.equal(claimCheck('F1 score of 0.91 on the held set.').ok, false, 'f1 with a real score must still fire');
assert.equal(claimCheck('IoU 0.75 across rooms.').ok, false);
});
// Digits hidden in a code span still make a claim — scrubbing must not blind the
// number gate to `0.95` (regression: code-span number bypassed the gate).
test('guardrail flags an accuracy number stated inside a code span', () => {
const r = claimCheck('Count accuracy reached `0.95` in our tests.');
assert.equal(r.ok, false, JSON.stringify(r.findings));
assert.ok(r.findings.some((f) => /not tagged/i.test(f.reason)));
});
// A MEASURED claim whose only number hides in a code span must still reach the
// missing-reproducer check (regression: the scrubbed gate short-circuited it).
// Bare metric prose with no number at all (e.g. the README rule text) stays a pass.
test('guardrail flags a MEASURED code-span number with no reproducer', () => {
const r = claimCheck('Detection accuracy `0.97` on the set (MEASURED).');
assert.equal(r.ok, false, JSON.stringify(r.findings));
assert.ok(r.findings.some((f) => /no reproducer/i.test(f.reason)));
assert.equal(claimCheck('Every accuracy number must be MEASURED against a baseline.').ok, true);
});
// F1-score phrasings ("F1: 0.91", "F1 reaches 0.91") were scrubbed as option
// labels and slipped through; option refs alone must still not false-positive.
test('guardrail catches F1-score claims but not bare option refs (ADR-263 F11)', () => {
assert.equal(claimCheck('F1: 0.91 on the held-out set.').ok, false, 'F1: value is a metric claim');
assert.equal(claimCheck('F1 reaches 0.91 on the held-out set.').ok, false, 'F1 with a nearby number is a claim');
assert.equal(claimCheck('Options O1O9 are tracked in ADR-263 O2.').ok, true, 'option labels are not metrics');
assert.equal(claimCheck('ADR-263 O2 lands the exports fix.').ok, true);
});
test('summarize gives PASS/finding text', () => {
assert.match(summarize(claimCheck('nothing here')), /PASS/);
assert.match(summarize(claimCheck('100% accuracy')), /finding/);
});
test('registry exposes the documented tools with schemas (underscore-canonical)', () => {
const names = Object.keys(TOOLS);
for (const n of ['ruview_onboard', 'ruview_claim_check', 'ruview_verify', 'ruview_node_monitor', 'ruview_calibrate', 'ruview_node_flash']) {
assert.ok(names.includes(n), `missing ${n}`);
assert.equal(TOOLS[n].inputSchema.type, 'object');
assert.match(n, /^[a-zA-Z0-9_-]{1,64}$/, 'canonical names must satisfy host tool-name regexes');
}
assert.equal(listTools().length, names.length);
});
test('dotted legacy names resolve via aliases (ADR-263 O8)', async () => {
assert.equal(TOOL_ALIASES['ruview.claim_check'], 'ruview_claim_check');
assert.equal(TOOL_ALIASES['ruview.node_monitor'], 'ruview_node_monitor');
const r = await runTool('ruview.onboard', {});
assert.equal(r.ok, true);
});
test('ruview_onboard returns paths and a recommendation', async () => {
const r = await runTool('ruview_onboard', {});
assert.equal(r.ok, true);
assert.ok(r.paths['live-esp32']);
assert.ok(['repo-build', 'docker-demo'].includes(r.recommend));
});
test('ruview_claim_check tool wraps the guardrail', async () => {
const r = await runTool('ruview_claim_check', { text: '100% accuracy' });
assert.equal(r.ok, false);
assert.match(r.summary, /honesty|tag|MEASURED|finding/i);
});
// ADR-263 F1/O1: the honesty gate must fail closed on empty input.
test('ruview_claim_check fails closed on empty/missing text', async () => {
const empty = await runTool('ruview_claim_check', { text: '' });
assert.equal(empty.ok, false);
assert.equal(empty.reason, 'empty_text');
const missing = await runTool('ruview_claim_check', {});
assert.equal(missing.ok, false);
assert.equal(missing.reason, 'empty_text');
});
test('unknown tool fails closed', async () => {
const r = await runTool('ruview_does_not_exist', {});
assert.equal(r.ok, false);
assert.equal(r.reason, 'unknown_tool');
});
test('node_monitor fails closed without a port', async () => {
const r = await runTool('ruview_node_monitor', {});
assert.equal(r.ok, false);
assert.equal(r.reason, 'no_port');
});
test('node_flash refuses without confirm (mutating guard)', async () => {
const r = await runTool('ruview_node_flash', { port: 'COM8', variant: 's3-8mb' });
assert.equal(r.ok, false);
// either not-confirmed (win32) or unsupported_platform (posix) — both fail-closed
assert.ok(['not_confirmed', 'unsupported_platform'].includes(r.reason));
});
test('verify fails closed when not in a RuView repo', async () => {
// point at a tmp dir with no repo markers
const r = await runTool('ruview_verify', { repo: process.platform === 'win32' ? 'C:/Windows/Temp' : '/tmp' });
assert.equal(r.ok, false);
assert.ok(['proof_missing', 'python_missing'].includes(r.reason), r.reason);
});
// ADR-263 F2/O2: registry-level concurrency — a slow child must not block
// other tool calls (run() is promise-based, never spawnSync).
test('run() is non-blocking: a fast tool completes while a slow child runs', async () => {
const slow = run('node', ['-e', 'setTimeout(() => {}, 2000)'], { timeout: 5000 });
const t0 = Date.now();
const fast = await runTool('ruview_onboard', {});
const elapsed = Date.now() - t0;
assert.equal(fast.ok, true);
assert.ok(elapsed < 1000, `onboard took ${elapsed} ms while a 2 s child was running`);
const r = await slow;
assert.equal(r.ok, true);
});
test('run() reports a timeout as a failure, not a hang', async () => {
const r = await run('node', ['-e', 'setTimeout(() => {}, 10000)'], { timeout: 300 });
assert.equal(r.ok, false);
assert.match(String(r.error), /timed out/);
});
test('run() bounds captured output instead of dying on big streams (ADR-263 O4)', async () => {
// 4 MiB of stdout would have hit spawnSync's 1 MiB default maxBuffer (ENOBUFS).
const r = await run('node', ['-e', "process.stdout.write('x'.repeat(4 * 1024 * 1024)); console.log('TAIL_MARKER')"], { timeout: 30000 });
assert.equal(r.ok, true);
assert.ok(r.stdout.length <= 65536, `tail not bounded: ${r.stdout.length}`);
assert.ok(r.stdout.includes('TAIL_MARKER'), 'tail must keep the end of the stream');
});
test('which() finds node and re-probes misses (hits are cached)', () => {
assert.ok(which('node'), 'node must be on PATH in the test env');
assert.equal(which('definitely-not-a-binary-xyz'), null);
assert.equal(which('definitely-not-a-binary-xyz'), null); // re-probed, still absent
});
// ADR-263 O8: a miss must not be cached — an operator who installs a tool
// mid-session (e.g. python after a python_missing failure) must be found next call.
test('which() re-probes after a miss so a newly-installed tool is found', () => {
const dir = mkdtempSync(join(tmpdir(), 'ruview-which-'));
const name = 'ruview-probe-xyz';
const isWin = process.platform === 'win32';
const bin = join(dir, isWin ? `${name}.cmd` : name);
const prevPath = process.env.PATH;
try {
assert.equal(which(name), null, 'not on PATH yet → miss');
writeFileSync(bin, isWin ? '@echo off\n' : '#!/bin/sh\n', { mode: 0o755 });
process.env.PATH = dir + delimiter + prevPath;
assert.ok(which(name), 'installed mid-session → the miss must not have been cached');
} finally {
process.env.PATH = prevPath;
rmSync(dir, { recursive: true, force: true });
}
});
test('CLI run(): claim-check exits non-zero on a bad claim', async () => {
const code = await cliRun(['claim-check', '--text', '100% accuracy']);
assert.notEqual(code, 0);
});
// ADR-263 F1/O1: the CLI must not PASS silently with no input.
test('CLI run(): claim-check with no input exits 2 (fail-closed)', async () => {
assert.equal(await cliRun(['claim-check']), 2);
assert.equal(await cliRun(['claim-check', '--text', ' ']), 2);
});
test('CLI run(): doctor exits 0 (tools-only path)', async () => {
const code = await cliRun(['doctor']);
assert.equal(code, 0);
});
test('CLI run(): unknown command exits non-zero', async () => {
assert.notEqual(await cliRun(['definitely-not-a-command']), 0);
});
test('findRepoRoot locates this monorepo from cwd', () => {
// when run from within wifi-densepose, it should find a root; elsewhere null is fine
const root = findRepoRoot();
assert.ok(root === null || typeof root === 'string');
});
// ADR-263 F7/O7: skills ship from one source; the projected copies must match.
test('.claude/skills/*/SKILL.md are byte-identical to skills/*.md', () => {
const srcDir = join(PKG_ROOT, 'skills');
for (const f of readdirSync(srcDir).filter((f) => f.endsWith('.md'))) {
const name = f.replace(/\.md$/, '');
const src = readFileSync(join(srcDir, f), 'utf8');
const projected = readFileSync(join(PKG_ROOT, '.claude', 'skills', name, 'SKILL.md'), 'utf8');
assert.equal(projected, src, `skill drift: ${name} — run \`npm run sync-skills\``);
}
});
// ADR-263 F6/O6 + F3/O3: package hygiene pins.
test('package.json has no optionalDependencies and no hardcoded server version drift', () => {
const pkg = JSON.parse(readFileSync(join(PKG_ROOT, 'package.json'), 'utf8'));
assert.equal(pkg.optionalDependencies, undefined, 'ADR-263 O3: optional deps tripled the cold npx install');
assert.equal(pkg.dependencies, undefined, 'the harness is dependency-free by design');
const mcpSrc = readFileSync(join(PKG_ROOT, 'src', 'mcp-server.js'), 'utf8');
assert.ok(!/version:\s*'\d+\.\d+\.\d+'/.test(mcpSrc), 'ADR-263 O6: server version must come from package.json');
});