Skip to content

Commit 279102c

Browse files
authored
fix(cli): reject execute() immediately when child process is dead (#2978)
## Summary

- When a child process crashes and a retry (`RETRY_IMMEDIATELY`) is attempted on the same `TaskRunProcess`, `execute()` hangs forever because the IPC send is silently skipped and the attempt promise can never resolve
- This caused runner pods to stay up indefinitely with no heartbeats or polls
- Fix: reject the attempt promise immediately when the child is not connected, so the controller can proceed to warm start or exit

## Test plan

- [x] Added `taskRunProcess.test.ts` — verifies `execute()` rejects promptly instead of hanging when the child process is dead
- [x] Deploy and verify no more stuck runner pods accumulate over time
1 parent b221719 commit 279102c

File tree

3 files changed

+139
-0
lines changed

3 files changed

+139
-0
lines changed
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"trigger.dev": patch
3+
---
4+
5+
Fix runner getting stuck indefinitely when `execute()` is called on a dead child process.
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
import { TaskRunProcess, type TaskRunProcessOptions } from "./taskRunProcess.js";
2+
import { describe, it, expect, vi } from "vitest";
3+
import { UnexpectedExitError } from "@trigger.dev/core/v3/errors";
4+
import type {
5+
TaskRunExecution,
6+
TaskRunExecutionPayload,
7+
WorkerManifest,
8+
ServerBackgroundWorker,
9+
MachinePresetResources,
10+
} from "@trigger.dev/core/v3";
11+
12+
/**
 * Builds a `TaskRunProcessOptions` fixture for constructing a `TaskRunProcess` in tests.
 *
 * The defaults are stubs: a "node" runtime manifest whose entry points are
 * "/dev/null", an empty server worker, an empty env, and machine resources of
 * `{ cpu: 1, memory: 1 }`. The manifest and server worker are cast through
 * `unknown`, so they are not checked against the full project types.
 *
 * @param overrides Fields that replace the defaults; spread last, so they win.
 * @returns The default options merged with `overrides`.
 */
function createTaskRunProcessOptions(
13+
overrides: Partial<TaskRunProcessOptions> = {}
14+
): TaskRunProcessOptions {
15+
return {
16+
workerManifest: {
17+
runtime: "node",
18+
workerEntryPoint: "/dev/null",
19+
configEntryPoint: "/dev/null",
20+
otelImportHook: {},
21+
} as unknown as WorkerManifest,
22+
serverWorker: {} as unknown as ServerBackgroundWorker,
23+
env: {},
24+
machineResources: { cpu: 1, memory: 1 } as MachinePresetResources,
25+
...overrides,
26+
};
27+
}
28+
29+
/**
 * Builds a `TaskRunExecution` fixture for the given run and attempt.
 *
 * Only `run.id` and `attempt.number` come from the arguments; every other field
 * is a fixed placeholder value. The object literal is cast through `unknown`,
 * so it is not type-checked against the full `TaskRunExecution` shape.
 *
 * @param runId Value stored in `run.id`.
 * @param attemptNumber Value stored in `attempt.number`.
 * @returns The fixture object typed as `TaskRunExecution`.
 */
function createExecution(runId: string, attemptNumber: number): TaskRunExecution {
30+
return {
31+
run: {
32+
id: runId,
33+
payload: "{}",
34+
payloadType: "application/json",
35+
tags: [],
36+
isTest: false,
37+
createdAt: new Date(),
38+
startedAt: new Date(),
39+
maxAttempts: 3,
40+
version: "1",
41+
durationMs: 0,
42+
costInCents: 0,
43+
baseCostInCents: 0,
44+
},
45+
attempt: {
46+
number: attemptNumber,
47+
startedAt: new Date(),
48+
id: "deprecated",
49+
backgroundWorkerId: "deprecated",
50+
backgroundWorkerTaskId: "deprecated",
51+
status: "deprecated" as any,
52+
},
53+
task: { id: "test-task", filePath: "test.ts" },
54+
queue: { id: "queue-1", name: "test-queue" },
55+
environment: { id: "env-1", slug: "test", type: "DEVELOPMENT" },
56+
organization: { id: "org-1", slug: "test-org", name: "Test Org" },
57+
project: { id: "proj-1", ref: "proj_test", slug: "test", name: "Test" },
58+
machine: { name: "small-1x", cpu: 0.5, memory: 0.5, centsPerMs: 0 },
59+
} as unknown as TaskRunExecution;
60+
}
61+
62+
// Regression suite: execute() must settle promptly when the child process is
// no longer connected, instead of awaiting a promise that can never settle.
describe("TaskRunProcess", () => {
63+
describe("execute() on a dead child process", () => {
64+
it("should reject when child process has already exited and IPC send is skipped", async () => {
65+
const proc = new TaskRunProcess(createTaskRunProcessOptions());
66+
67+
// Simulate a child process that has exited: _child exists but is not connected
68+
const fakeChild = {
69+
connected: false,
70+
killed: false,
71+
pid: 12345,
72+
kill: vi.fn(),
73+
on: vi.fn(),
74+
stdout: { on: vi.fn() },
75+
stderr: { on: vi.fn() },
76+
};
77+
78+
// Set internal state to mimic a process whose child has crashed
79+
(proc as any)._child = fakeChild;
80+
(proc as any)._childPid = 12345;
81+
(proc as any)._isBeingKilled = false;
82+
83+
const execution = createExecution("run-1", 2);
84+
85+
// This should NOT hang forever - it should reject promptly.
86+
//
87+
// Regression guard: before the fix, execute() created a promise, skipped the IPC
88+
// send because _child.connected was false, then awaited a promise that could never
89+
// settle because the child was dead and #handleExit had already run.
90+
//
91+
// The Promise.race with a timeout detects the hang.
92+
const result = await Promise.race([
93+
proc
94+
.execute(
95+
{
96+
payload: { execution, traceContext: {}, metrics: [] },
97+
messageId: "run_run-1",
98+
env: {},
99+
},
100+
true
101+
)
102+
// Tag both outcomes so a rejection becomes an inspectable value instead of throwing.
.then(
103+
(v) => ({ type: "resolved" as const, value: v }),
104+
(e) => ({ type: "rejected" as const, error: e })
105+
),
106+
new Promise<{ type: "hung" }>((resolve) =>
107+
setTimeout(() => resolve({ type: "hung" as const }), 2000)
108+
),
109+
]);
110+
111+
// A "hung" result means execute() did not settle before the 2000 ms timeout fired
112+
expect(result.type).not.toBe("hung");
113+
expect(result.type).toBe("rejected");
114+
115+
// Narrow the tagged union so `error` is accessible for the assertions below.
if (result.type === "rejected") {
116+
expect(result.error).toBeInstanceOf(UnexpectedExitError);
117+
expect(result.error.stderr).toContain("not connected");
118+
}
119+
});
120+
});
121+
});

packages/cli-v3/src/executions/taskRunProcess.ts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,19 @@ export class TaskRunProcess {
297297
env: params.env,
298298
isWarmStart: isWarmStart ?? this.options.isWarmStart,
299299
});
300+
} else {
301+
// Child process is dead or disconnected — the IPC send was skipped so the attempt
302+
// promise would hang forever. Reject it immediately to let the caller handle it.
303+
this._attemptStatuses.set(key, "REJECTED");
304+
305+
// @ts-expect-error - rejecter is assigned in the promise constructor above
306+
rejecter(
307+
new UnexpectedExitError(
308+
-1,
309+
null,
310+
"Child process is not connected, cannot execute task run"
311+
)
312+
);
300313
}
301314

302315
const result = await promise;

0 commit comments

Comments
 (0)