Skip to content

Commit 46dad7d

Browse files
authored
fix: prevent processKeepAlive OOM error process reuse (#2261)
* Improve TaskRunProcess health detection so we don't try and reuse an unhealthy process This was happening after the process was killed internally, like by an OOM error * Add changeset
1 parent 4fdf23b commit 46dad7d

File tree

5 files changed

+63
-11
lines changed

5 files changed

+63
-11
lines changed

.changeset/fifty-beers-bake.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"trigger.dev": patch
3+
---
4+
5+
Fixes a bug that would allow processes that had OOM errors to be incorrectly reused when experimental_processKeepAlive was enabled

packages/cli-v3/src/dev/taskRunProcessPool.ts

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,10 +208,18 @@ export class TaskRunProcessPool {
208208

209209
private isProcessHealthy(process: TaskRunProcess): boolean {
210210
// Basic health checks - we can expand this later
211-
return !process.isBeingKilled && process.pid !== undefined;
211+
return process.isHealthy;
212212
}
213213

214214
private async killProcess(process: TaskRunProcess): Promise<void> {
215+
if (!process.isHealthy) {
216+
logger.debug("[TaskRunProcessPool] Process is not healthy, skipping cleanup", {
217+
processId: process.pid,
218+
});
219+
220+
return;
221+
}
222+
215223
try {
216224
await process.cleanup(true);
217225
} catch (error) {

packages/cli-v3/src/entryPoints/managed/taskRunProcessProvider.ts

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ export class TaskRunProcessProvider {
135135
this.sendDebugLog("Not keeping TaskRunProcess alive, cleaning up", {
136136
executionCount: this.executionCount,
137137
maxExecutionCount: this.processKeepAliveMaxExecutionCount,
138-
isHealthy: this.isProcessHealthy(process),
138+
isHealthy: process.isHealthy,
139139
});
140140

141141
// Cleanup the process completely
@@ -284,19 +284,12 @@ export class TaskRunProcessProvider {
284284
return (
285285
!!this.persistentProcess &&
286286
this.executionCount < this.processKeepAliveMaxExecutionCount &&
287-
this.isProcessHealthy(this.persistentProcess)
287+
this.persistentProcess.isHealthy
288288
);
289289
}
290290

291291
private shouldKeepProcessAlive(process: TaskRunProcess): boolean {
292-
return (
293-
this.executionCount < this.processKeepAliveMaxExecutionCount && this.isProcessHealthy(process)
294-
);
295-
}
296-
297-
private isProcessHealthy(process: TaskRunProcess): boolean {
298-
// Basic health check - TaskRunProcess will handle more detailed internal health checks
299-
return !process.isBeingKilled && process.pid !== undefined;
292+
return this.executionCount < this.processKeepAliveMaxExecutionCount && process.isHealthy;
300293
}
301294

302295
private async cleanupProcess(taskRunProcess: TaskRunProcess): Promise<void> {

packages/cli-v3/src/executions/taskRunProcess.ts

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -442,10 +442,26 @@ export class TaskRunProcess {
442442
return this._isBeingKilled || this._child?.killed;
443443
}
444444

445+
get isBeingSuspended() {
446+
return this._isBeingSuspended;
447+
}
448+
445449
get pid() {
446450
return this._childPid;
447451
}
448452

453+
get isHealthy() {
454+
if (!this._child) {
455+
return false;
456+
}
457+
458+
if (this.isBeingKilled || this.isBeingSuspended) {
459+
return false;
460+
}
461+
462+
return this._child.connected;
463+
}
464+
449465
static parseExecuteError(error: unknown, dockerMode = true): TaskRunInternalError {
450466
if (error instanceof CancelledProcessError) {
451467
return {

references/hello-world/src/trigger/oom.ts

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,3 +55,33 @@ export const oomTask = task({
5555
}
5656
},
5757
});
58+
59+
export const oomTask2 = task({
60+
id: "oom-task-2",
61+
machine: "micro",
62+
run: async (payload: any, { ctx }) => {
63+
await runMemoryLeakScenario();
64+
},
65+
});
66+
67+
async function runMemoryLeakScenario() {
68+
console.log("🧠 Starting memory leak simulation");
69+
const memoryHogs = [];
70+
let iteration = 0;
71+
72+
while (iteration < 1000) {
73+
// Allocate large chunks of memory
74+
const bigArray = new Array(10000000).fill(`memory-leak-data-${iteration}`);
75+
memoryHogs.push(bigArray);
76+
77+
await setTimeout(200);
78+
iteration++;
79+
80+
const memUsage = process.memoryUsage();
81+
console.log(
82+
`🧠 Memory leak iteration ${iteration}, RSS: ${Math.round(memUsage.rss / 1024 / 1024)} MB`
83+
);
84+
}
85+
86+
console.log("🧠 Memory leak scenario completed");
87+
}

0 commit comments

Comments
 (0)