97 lines
3.6 KiB
TypeScript
97 lines
3.6 KiB
TypeScript
/**
 * ADR-264 O6 — post-restart job reconciliation.
 *
 * When the MCP server restarts mid-run, the persisted job record stays frozen
 * at 'running' (the child.on('close') that flips it lived in the dead process).
 * ruview_job_status must reconcile such a record against the recorded pid and
 * the log's "# exit code: N" marker.
 *
 * We fabricate a persisted record pointing at a KNOWN-DEAD pid (a synchronous
 * child that has already exited) and assert the reconciled status.
 */
|
|
|
|
import { mkdtempSync, writeFileSync } from "node:fs";
|
|
import { spawnSync } from "node:child_process";
|
|
import os from "node:os";
|
|
import path from "node:path";
|
|
import { randomUUID } from "node:crypto";
|
|
import { jobStatus } from "../src/tools/train-count.js";
|
|
import type { RuviewConfig } from "../src/types.js";
|
|
|
|
/**
 * Returns a pid that has certainly exited.
 *
 * `spawnSync` waits for the child to finish, so once it returns the pid no
 * longer refers to a running process.
 *
 * @returns The pid of the already-finished probe child (a positive integer).
 * @throws Error if the probe child could not be spawned.
 */
function deadPid(): number {
  // `node -e ""` evaluates an empty script and exits right away.
  const probe = spawnSync(process.execPath, ["-e", ""]);
  // spawnSync reports a failed spawn through `error` instead of throwing, so
  // the pid alone is not a reliable success signal. A non-positive pid must
  // never reach the fake job record: pid 0 addresses the caller's own process
  // group in kill-style liveness checks and would therefore look "alive".
  if (probe.error !== undefined || !Number.isInteger(probe.pid) || probe.pid <= 0) {
    const detail = probe.error !== undefined ? `: ${probe.error.message}` : "";
    throw new Error(`could not spawn probe child${detail}`);
  }
  return probe.pid;
}
|
|
|
|
/**
 * Builds the config handed to `jobStatus` in these tests.
 *
 * Only `jobsDir` varies per test; every other field is a fixed placeholder
 * value.
 *
 * @param jobsDir Directory holding the seeded job record and its log.
 * @returns A config object whose `jobsDir` is the given directory.
 */
function makeConfig(jobsDir: string): RuviewConfig {
  const config: RuviewConfig = {
    sensingServerUrl: "http://127.0.0.1:19999",
    apiToken: undefined,
    poseCogBinary: "nonexistent",
    countCogBinary: "nonexistent",
    jobsDir: jobsDir,
  };
  return config;
}
|
|
|
|
/**
 * Writes a fake persisted 'running' job record plus its log file into a fresh
 * temporary jobs directory.
 *
 * The record's `pid` comes from `deadPid()`, so it points at a process that
 * has already exited.
 *
 * @param logBody Full contents to write to the job's log file.
 * @returns The generated job id and a config whose `jobsDir` is the new
 *          temporary directory.
 */
function seedRunningJob(logBody: string): { jobId: string; config: RuviewConfig } {
  const jobsDir = mkdtempSync(path.join(os.tmpdir(), "rvagent-jobs-"));
  const jobId = randomUUID();
  // The log and the record sit side by side, named after the job id.
  const fileFor = (extension: string): string =>
    path.join(jobsDir, `${jobId}.${extension}`);

  const logPath = fileFor("log");
  writeFileSync(logPath, logBody);

  const persisted = {
    job_id: jobId,
    status: "running",
    log_path: logPath,
    // Stored in seconds rather than the milliseconds Date.now() returns.
    queued_at: Date.now() / 1000,
    epochs_total: 5,
    pid: deadPid(),
  };
  const serialized = JSON.stringify(persisted, null, 2);
  writeFileSync(fileFor("json"), serialized);

  const config = makeConfig(jobsDir);
  return { jobId, config };
}
|
|
|
|
/**
 * Reconciliation cases for `jobStatus`: each test seeds a record persisted as
 * 'running' whose pid has already exited, varies only the log contents, and
 * checks the status that comes back.
 */
describe("ruview_job_status reconciliation (ADR-264 O6)", () => {
  type Payload = Record<string, unknown>;

  /**
   * Seeds a stale 'running' job with the given log and queries its status.
   * Returns the whole response (`out`) and its `result` member (`res`).
   */
  const queryStaleJob = async (
    logBody: string
  ): Promise<{ out: Payload; res: Payload }> => {
    const { jobId, config } = seedRunningJob(logBody);
    const out = (await jobStatus({ job_id: jobId }, config)) as Payload;
    return { out, res: out["result"] as Payload };
  };

  it("reconciles a dead 'running' job with exit 0 to 'done'", async () => {
    const { out, res } = await queryStaleJob(
      "# training...\nepoch 5/5\n# exit code: 0\n"
    );
    expect(out["ok"]).toBe(true);
    expect(res["status"]).toBe("done");
  });

  it("reconciles a dead 'running' job with non-zero exit to 'failed'", async () => {
    const { res } = await queryStaleJob(
      "# training...\npanic: cuda oom\n# exit code: 101\n"
    );
    expect(res["status"]).toBe("failed");
  });

  it("marks a dead 'running' job with no exit marker as 'unknown' with a reason", async () => {
    // No "# exit code" line in the log at all.
    const { res } = await queryStaleJob("# training...\nepoch 2/5\n");
    expect(res["status"]).toBe("unknown");
    expect(typeof res["reason"]).toBe("string");
    expect(res["reason"]).toMatch(/restarted/i);
  });

  it("treats a signal-killed marker (null) as 'failed'", async () => {
    const { res } = await queryStaleJob("# training...\n# exit code: null\n");
    expect(res["status"]).toBe("failed");
  });
});
|