257 lines
12 KiB
JavaScript
257 lines
12 KiB
JavaScript
// SPDX-License-Identifier: MIT
|
||
// RuView harness tests — Node's built-in test runner (no devDeps to install).
|
||
// Run: `node --test test/*.test.mjs` (or `npm test`).
|
||
|
||
import { test } from 'node:test';
|
||
import assert from 'node:assert/strict';
|
||
import { readdirSync, readFileSync, mkdtempSync, writeFileSync, rmSync } from 'node:fs';
|
||
import { join, dirname, delimiter } from 'node:path';
|
||
import { tmpdir } from 'node:os';
|
||
import { fileURLToPath } from 'node:url';
|
||
import { claimCheck, summarize } from '../src/guardrails.js';
|
||
import { TOOLS, TOOL_ALIASES, runTool, listTools, findRepoRoot, run, which } from '../src/tools.js';
|
||
import { run as cliRun } from '../bin/cli.js';
|
||
|
||
const PKG_ROOT = dirname(dirname(fileURLToPath(import.meta.url)));
|
||
|
||
test('guardrail flags the retracted 100% framing as high severity', () => {
|
||
const r = claimCheck('Our model reaches 100% accuracy on every pose.');
|
||
assert.equal(r.ok, false);
|
||
assert.ok(r.findings.some((f) => f.severity === 'high'));
|
||
});
|
||
|
||
test('guardrail flags an untagged percentage accuracy claim', () => {
|
||
// "hit", not "measured" — "measured" would (correctly) route to the no-reproducer branch.
|
||
const r = claimCheck('We hit 92.9% PCK on the test set.');
|
||
assert.equal(r.ok, false);
|
||
assert.ok(r.findings.some((f) => /not tagged/i.test(f.reason)));
|
||
});
|
||
|
||
test('guardrail passes a MEASURED claim that cites a reproducer', () => {
|
||
const r = claimCheck('Held-out PCK@20 59.5% vs 50% mean-pose baseline = +9.4pp (MEASURED, verify.py).');
|
||
assert.equal(r.ok, true, JSON.stringify(r.findings));
|
||
});
|
||
|
||
test('guardrail flags MEASURED with no reproducer', () => {
|
||
const r = claimCheck('Presence detection 97% (MEASURED).');
|
||
assert.equal(r.ok, false);
|
||
assert.ok(r.findings.some((f) => /no reproducer/i.test(f.reason)));
|
||
});
|
||
|
||
test('guardrail ignores non-metric prose', () => {
|
||
assert.equal(claimCheck('The ESP32 streams CSI over UDP to the sensing-server.').ok, true);
|
||
assert.equal(claimCheck('').ok, true);
|
||
});
|
||
|
||
// ADR-263 F11/O9: precision pins — short metric tokens must not fire on prose.
|
||
test('guardrail does not false-positive on "map"/"F1" prose (ADR-263 F11)', () => {
|
||
assert.equal(claimCheck('F-numbers map to findings.').ok, true);
|
||
assert.equal(claimCheck('### F1 (HIGH, broken export): `require` points at a missing file').ok, true);
|
||
assert.equal(claimCheck('The 0.1.0 tarball ships 44 `.map` files = 62,698 B of dead weight.').ok, true);
|
||
assert.equal(claimCheck('the source maps can never resolve').ok, true);
|
||
assert.equal(claimCheck('- **O1 (F1):** fix `exports` (see F2 for the 33% map weight — MEASURED, tarball listing)').ok, true);
|
||
assert.equal(claimCheck('ADR-264: exports fix, map-free tarball, session-per-transport').ok, true);
|
||
});
|
||
|
||
test('guardrail still catches real short-token metric claims', () => {
|
||
assert.equal(claimCheck('We reach mAP 62.3 on COCO.').ok, false);
|
||
assert.equal(claimCheck('F1 score of 0.91 on the held set.').ok, false, 'f1 with a real score must still fire');
|
||
assert.equal(claimCheck('IoU 0.75 across rooms.').ok, false);
|
||
});
|
||
|
||
// Digits hidden in a code span still make a claim — scrubbing must not blind the
|
||
// number gate to `0.95` (regression: code-span number bypassed the gate).
|
||
test('guardrail flags an accuracy number stated inside a code span', () => {
|
||
const r = claimCheck('Count accuracy reached `0.95` in our tests.');
|
||
assert.equal(r.ok, false, JSON.stringify(r.findings));
|
||
assert.ok(r.findings.some((f) => /not tagged/i.test(f.reason)));
|
||
});
|
||
|
||
// A MEASURED claim whose only number hides in a code span must still reach the
|
||
// missing-reproducer check (regression: the scrubbed gate short-circuited it).
|
||
// Bare metric prose with no number at all (e.g. the README rule text) stays a pass.
|
||
test('guardrail flags a MEASURED code-span number with no reproducer', () => {
|
||
const r = claimCheck('Detection accuracy `0.97` on the set (MEASURED).');
|
||
assert.equal(r.ok, false, JSON.stringify(r.findings));
|
||
assert.ok(r.findings.some((f) => /no reproducer/i.test(f.reason)));
|
||
assert.equal(claimCheck('Every accuracy number must be MEASURED against a baseline.').ok, true);
|
||
});
|
||
|
||
// F1-score phrasings ("F1: 0.91", "F1 reaches 0.91") were scrubbed as option
|
||
// labels and slipped through; option refs alone must still not false-positive.
|
||
test('guardrail catches F1-score claims but not bare option refs (ADR-263 F11)', () => {
|
||
assert.equal(claimCheck('F1: 0.91 on the held-out set.').ok, false, 'F1: value is a metric claim');
|
||
assert.equal(claimCheck('F1 reaches 0.91 on the held-out set.').ok, false, 'F1 with a nearby number is a claim');
|
||
assert.equal(claimCheck('Options O1–O9 are tracked in ADR-263 O2.').ok, true, 'option labels are not metrics');
|
||
assert.equal(claimCheck('ADR-263 O2 lands the exports fix.').ok, true);
|
||
});
|
||
|
||
test('summarize gives PASS/finding text', () => {
|
||
assert.match(summarize(claimCheck('nothing here')), /PASS/);
|
||
assert.match(summarize(claimCheck('100% accuracy')), /finding/);
|
||
});
|
||
|
||
test('registry exposes the documented tools with schemas (underscore-canonical)', () => {
|
||
const names = Object.keys(TOOLS);
|
||
for (const n of ['ruview_onboard', 'ruview_claim_check', 'ruview_verify', 'ruview_node_monitor', 'ruview_calibrate', 'ruview_node_flash']) {
|
||
assert.ok(names.includes(n), `missing ${n}`);
|
||
assert.equal(TOOLS[n].inputSchema.type, 'object');
|
||
assert.match(n, /^[a-zA-Z0-9_-]{1,64}$/, 'canonical names must satisfy host tool-name regexes');
|
||
}
|
||
assert.equal(listTools().length, names.length);
|
||
});
|
||
|
||
test('dotted legacy names resolve via aliases (ADR-263 O8)', async () => {
|
||
assert.equal(TOOL_ALIASES['ruview.claim_check'], 'ruview_claim_check');
|
||
assert.equal(TOOL_ALIASES['ruview.node_monitor'], 'ruview_node_monitor');
|
||
const r = await runTool('ruview.onboard', {});
|
||
assert.equal(r.ok, true);
|
||
});
|
||
|
||
test('ruview_onboard returns paths and a recommendation', async () => {
|
||
const r = await runTool('ruview_onboard', {});
|
||
assert.equal(r.ok, true);
|
||
assert.ok(r.paths['live-esp32']);
|
||
assert.ok(['repo-build', 'docker-demo'].includes(r.recommend));
|
||
});
|
||
|
||
test('ruview_claim_check tool wraps the guardrail', async () => {
|
||
const r = await runTool('ruview_claim_check', { text: '100% accuracy' });
|
||
assert.equal(r.ok, false);
|
||
assert.match(r.summary, /honesty|tag|MEASURED|finding/i);
|
||
});
|
||
|
||
// ADR-263 F1/O1: the honesty gate must fail closed on empty input.
|
||
test('ruview_claim_check fails closed on empty/missing text', async () => {
|
||
const empty = await runTool('ruview_claim_check', { text: '' });
|
||
assert.equal(empty.ok, false);
|
||
assert.equal(empty.reason, 'empty_text');
|
||
const missing = await runTool('ruview_claim_check', {});
|
||
assert.equal(missing.ok, false);
|
||
assert.equal(missing.reason, 'empty_text');
|
||
});
|
||
|
||
test('unknown tool fails closed', async () => {
|
||
const r = await runTool('ruview_does_not_exist', {});
|
||
assert.equal(r.ok, false);
|
||
assert.equal(r.reason, 'unknown_tool');
|
||
});
|
||
|
||
test('node_monitor fails closed without a port', async () => {
|
||
const r = await runTool('ruview_node_monitor', {});
|
||
assert.equal(r.ok, false);
|
||
assert.equal(r.reason, 'no_port');
|
||
});
|
||
|
||
test('node_flash refuses without confirm (mutating guard)', async () => {
|
||
const r = await runTool('ruview_node_flash', { port: 'COM8', variant: 's3-8mb' });
|
||
assert.equal(r.ok, false);
|
||
// either not-confirmed (win32) or unsupported_platform (posix) — both fail-closed
|
||
assert.ok(['not_confirmed', 'unsupported_platform'].includes(r.reason));
|
||
});
|
||
|
||
test('verify fails closed when not in a RuView repo', async () => {
|
||
// point at a tmp dir with no repo markers
|
||
const r = await runTool('ruview_verify', { repo: process.platform === 'win32' ? 'C:/Windows/Temp' : '/tmp' });
|
||
assert.equal(r.ok, false);
|
||
assert.ok(['proof_missing', 'python_missing'].includes(r.reason), r.reason);
|
||
});
|
||
|
||
// ADR-263 F2/O2: registry-level concurrency — a slow child must not block
|
||
// other tool calls (run() is promise-based, never spawnSync).
|
||
test('run() is non-blocking: a fast tool completes while a slow child runs', async () => {
|
||
const slow = run('node', ['-e', 'setTimeout(() => {}, 2000)'], { timeout: 5000 });
|
||
const t0 = Date.now();
|
||
const fast = await runTool('ruview_onboard', {});
|
||
const elapsed = Date.now() - t0;
|
||
assert.equal(fast.ok, true);
|
||
assert.ok(elapsed < 1000, `onboard took ${elapsed} ms while a 2 s child was running`);
|
||
const r = await slow;
|
||
assert.equal(r.ok, true);
|
||
});
|
||
|
||
test('run() reports a timeout as a failure, not a hang', async () => {
|
||
const r = await run('node', ['-e', 'setTimeout(() => {}, 10000)'], { timeout: 300 });
|
||
assert.equal(r.ok, false);
|
||
assert.match(String(r.error), /timed out/);
|
||
});
|
||
|
||
test('run() bounds captured output instead of dying on big streams (ADR-263 O4)', async () => {
|
||
// 4 MiB of stdout would have hit spawnSync's 1 MiB default maxBuffer (ENOBUFS).
|
||
const r = await run('node', ['-e', "process.stdout.write('x'.repeat(4 * 1024 * 1024)); console.log('TAIL_MARKER')"], { timeout: 30000 });
|
||
assert.equal(r.ok, true);
|
||
assert.ok(r.stdout.length <= 65536, `tail not bounded: ${r.stdout.length}`);
|
||
assert.ok(r.stdout.includes('TAIL_MARKER'), 'tail must keep the end of the stream');
|
||
});
|
||
|
||
test('which() finds node and re-probes misses (hits are cached)', () => {
|
||
assert.ok(which('node'), 'node must be on PATH in the test env');
|
||
assert.equal(which('definitely-not-a-binary-xyz'), null);
|
||
assert.equal(which('definitely-not-a-binary-xyz'), null); // re-probed, still absent
|
||
});
|
||
|
||
// ADR-263 O8: a miss must not be cached — an operator who installs a tool
|
||
// mid-session (e.g. python after a python_missing failure) must be found next call.
|
||
test('which() re-probes after a miss so a newly-installed tool is found', () => {
|
||
const dir = mkdtempSync(join(tmpdir(), 'ruview-which-'));
|
||
const name = 'ruview-probe-xyz';
|
||
const isWin = process.platform === 'win32';
|
||
const bin = join(dir, isWin ? `${name}.cmd` : name);
|
||
const prevPath = process.env.PATH;
|
||
try {
|
||
assert.equal(which(name), null, 'not on PATH yet → miss');
|
||
writeFileSync(bin, isWin ? '@echo off\n' : '#!/bin/sh\n', { mode: 0o755 });
|
||
process.env.PATH = dir + delimiter + prevPath;
|
||
assert.ok(which(name), 'installed mid-session → the miss must not have been cached');
|
||
} finally {
|
||
process.env.PATH = prevPath;
|
||
rmSync(dir, { recursive: true, force: true });
|
||
}
|
||
});
|
||
|
||
test('CLI run(): claim-check exits non-zero on a bad claim', async () => {
|
||
const code = await cliRun(['claim-check', '--text', '100% accuracy']);
|
||
assert.notEqual(code, 0);
|
||
});
|
||
|
||
// ADR-263 F1/O1: the CLI must not PASS silently with no input.
|
||
test('CLI run(): claim-check with no input exits 2 (fail-closed)', async () => {
|
||
assert.equal(await cliRun(['claim-check']), 2);
|
||
assert.equal(await cliRun(['claim-check', '--text', ' ']), 2);
|
||
});
|
||
|
||
test('CLI run(): doctor exits 0 (tools-only path)', async () => {
|
||
const code = await cliRun(['doctor']);
|
||
assert.equal(code, 0);
|
||
});
|
||
|
||
test('CLI run(): unknown command exits non-zero', async () => {
|
||
assert.notEqual(await cliRun(['definitely-not-a-command']), 0);
|
||
});
|
||
|
||
test('findRepoRoot locates this monorepo from cwd', () => {
|
||
// when run from within wifi-densepose, it should find a root; elsewhere null is fine
|
||
const root = findRepoRoot();
|
||
assert.ok(root === null || typeof root === 'string');
|
||
});
|
||
|
||
// ADR-263 F7/O7: skills ship from one source; the projected copies must match.
|
||
test('.claude/skills/*/SKILL.md are byte-identical to skills/*.md', () => {
|
||
const srcDir = join(PKG_ROOT, 'skills');
|
||
for (const f of readdirSync(srcDir).filter((f) => f.endsWith('.md'))) {
|
||
const name = f.replace(/\.md$/, '');
|
||
const src = readFileSync(join(srcDir, f), 'utf8');
|
||
const projected = readFileSync(join(PKG_ROOT, '.claude', 'skills', name, 'SKILL.md'), 'utf8');
|
||
assert.equal(projected, src, `skill drift: ${name} — run \`npm run sync-skills\``);
|
||
}
|
||
});
|
||
|
||
// ADR-263 F6/O6 + F3/O3: package hygiene pins.
|
||
test('package.json has no optionalDependencies and no hardcoded server version drift', () => {
|
||
const pkg = JSON.parse(readFileSync(join(PKG_ROOT, 'package.json'), 'utf8'));
|
||
assert.equal(pkg.optionalDependencies, undefined, 'ADR-263 O3: optional deps tripled the cold npx install');
|
||
assert.equal(pkg.dependencies, undefined, 'the harness is dependency-free by design');
|
||
const mcpSrc = readFileSync(join(PKG_ROOT, 'src', 'mcp-server.js'), 'utf8');
|
||
assert.ok(!/version:\s*'\d+\.\d+\.\d+'/.test(mcpSrc), 'ADR-263 O6: server version must come from package.json');
|
||
});
|