Skip to content

Commit abbef79

Browse files
jorgeepditommasoclaude
authored
Fix SIGTERM forwarding in AWS Batch jobs (#6414)
Signed-off-by: jorgee <[email protected]> Signed-off-by: Paolo Di Tommaso <[email protected]> Co-authored-by: Paolo Di Tommaso <[email protected]> Co-authored-by: Claude <[email protected]>
1 parent 1449fdf commit abbef79

File tree

3 files changed

+63
-8
lines changed

3 files changed

+63
-8
lines changed

plugins/nf-amazon/src/main/nextflow/cloud/aws/batch/AwsBatchExecutor.groovy

Lines changed: 57 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -366,7 +366,39 @@ class AwsBatchExecutor extends Executor implements ExtensionPoint, TaskArrayExec
366366
final kms = opts.storageKmsKeyId ? " --sse-kms-key-id $opts.storageKmsKeyId" : ''
367367
final requesterPays = opts.requesterPays ? ' --request-payer requester' : ''
368368
final aws = "$cli s3 cp --only-show-errors${sse}${kms}${debug}${requesterPays}"
369-
final cmd = "trap \"{ ret=\$?; $aws ${TaskRun.CMD_LOG} ${workDir}/${TaskRun.CMD_LOG}||true; exit \$ret; }\" EXIT; $aws ${workDir}/${TaskRun.CMD_RUN} - | bash 2>&1 | tee ${TaskRun.CMD_LOG}"
369+
370+
/*
371+
* Enhanced signal handling for AWS Batch tasks to fix nested Nextflow execution issues.
372+
* This implementation addresses the problem of proper signal forwarding when Nextflow
373+
* processes are executed within AWS Batch containers.
374+
*
375+
* References: https://github.com/nextflow-io/nextflow/pull/6414
376+
*
377+
* Trap command breakdown:
378+
*
379+
* 1. TERM signal trap: `trap \"[[ -n \\\$pid ]] && kill -TERM \\\$pid\" TERM`
380+
* - Captures SIGTERM signals sent to the parent shell process
381+
* - Conditionally forwards the TERM signal to the background bash process (stored in $pid)
382+
* - The `[[ -n \\\$pid ]]` test ensures we only attempt to kill if $pid is set and non-empty
383+
* - This prevents attempts to kill process ID 0 or empty values, which could cause unintended behavior
384+
* - Essential for proper cleanup when AWS Batch terminates jobs or when users cancel workflows
385+
*
386+
* 2. EXIT signal trap: `trap \"{ ret=\$?; $aws ${TaskRun.CMD_LOG} ${workDir}/${TaskRun.CMD_LOG}||true; exit \$ret; }\" EXIT`
387+
* - Executes cleanup actions when the shell process exits (normal or abnormal termination)
388+
* - Captures the exit status ($?) of the last executed command before cleanup
389+
* - Uploads the command log file to S3 for debugging and monitoring purposes
390+
* - Uses `||true` to prevent the trap from failing if S3 upload fails (ensures exit code preservation)
391+
* - Preserves and returns the original exit status to maintain proper error propagation
392+
*
393+
* 3. Background execution pattern: `bash > >(tee ${TaskRun.CMD_LOG}) 2>&1 & pid=\$!; wait \$pid`
394+
* - Runs the actual task command in background (&) to allow signal handling
395+
* - Redirects both stdout and stderr (2>&1) to process substitution for real-time logging
396+
* - Uses `tee` to simultaneously write logs to file and display to console
397+
* - Stores the background process ID in $pid for signal forwarding
398+
* - `wait $pid` ensures the parent shell waits for task completion and returns proper exit code
399+
* - This pattern allows the parent shell to remain responsive to signals while task executes
400+
*/
401+
final cmd = "trap \"[[ -n \\\$pid ]] && kill -TERM \\\$pid\" TERM; trap \"{ ret=\$?; $aws ${TaskRun.CMD_LOG} ${workDir}/${TaskRun.CMD_LOG}||true; exit \$ret; }\" EXIT; $aws ${workDir}/${TaskRun.CMD_RUN} - | bash > >(tee ${TaskRun.CMD_LOG}) 2>&1 & pid=\$!; wait \$pid"
370402
return cmd
371403
}
372404

@@ -375,7 +407,30 @@ class AwsBatchExecutor extends Executor implements ExtensionPoint, TaskArrayExec
375407
final sse = opts.storageEncryption ? " --sse $opts.storageEncryption" : ''
376408
final kms = opts.storageKmsKeyId ? " --sse-kms-key-id $opts.storageKmsKeyId" : ''
377409
final requesterPays = opts.requesterPays ? ' --request-payer requester' : ''
378-
final cmd = "trap \"{ ret=\$?; $cli cp${sse}${kms}${requesterPays} ${TaskRun.CMD_LOG} ${workDir}/${TaskRun.CMD_LOG}||true; exit \$ret; }\" EXIT; $cli cat ${workDir}/${TaskRun.CMD_RUN} | bash 2>&1 | tee ${TaskRun.CMD_LOG}"
410+
411+
/*
412+
* Enhanced signal handling for AWS Batch tasks using s5cmd (high-performance S3 client).
413+
* This implementation mirrors the s3Cmd method but uses s5cmd instead of aws-cli for
414+
* improved S3 transfer performance.
415+
*
416+
* References: https://github.com/nextflow-io/nextflow/pull/6414
417+
*
418+
* The trap commands follow the same pattern as s3Cmd method:
419+
*
420+
* 1. TERM signal trap: `trap \"[[ -n \\\$pid ]] && kill -TERM \\\$pid\" TERM`
421+
* - Ensures proper signal forwarding to background processes when SIGTERM is received
422+
* - Critical for handling AWS Batch job termination and user-initiated cancellations
423+
*
424+
* 2. EXIT signal trap: `trap \"{ ret=\$?; $cli cp${sse}${kms}${requesterPays} ${TaskRun.CMD_LOG} ${workDir}/${TaskRun.CMD_LOG}||true; exit \$ret; }\" EXIT`
425+
* - Performs cleanup by uploading task logs using s5cmd instead of aws-cli
426+
* - Maintains exit status preservation for proper error reporting
427+
*
428+
* 3. Background execution with s5cmd: `$cli cat ${workDir}/${TaskRun.CMD_RUN} | bash > >(tee ${TaskRun.CMD_LOG}) 2>&1 & pid=\$!; wait \$pid`
429+
* - Uses s5cmd to stream the task script directly into bash execution
430+
* - Maintains the same signal-responsive background execution pattern
431+
* - Provides real-time logging while allowing proper signal handling
432+
*/
433+
final cmd = "trap \"[[ -n \\\$pid ]] && kill -TERM \\\$pid\" TERM; trap \"{ ret=\$?; $cli cp${sse}${kms}${requesterPays} ${TaskRun.CMD_LOG} ${workDir}/${TaskRun.CMD_LOG}||true; exit \$ret; }\" EXIT; $cli cat ${workDir}/${TaskRun.CMD_RUN} | bash > >(tee ${TaskRun.CMD_LOG}) 2>&1 & pid=\$!; wait \$pid"
379434
return cmd
380435
}
381436

plugins/nf-amazon/src/test/nextflow/cloud/aws/batch/AwsBatchTaskHandlerTest.groovy

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -930,7 +930,7 @@ class AwsBatchTaskHandlerTest extends Specification {
930930
then:
931931
executor.getAwsOptions()>> Mock(AwsOptions) { getAwsCli() >> 'aws' }
932932
then:
933-
result.join(' ') == 'bash -o pipefail -c trap "{ ret=$?; aws s3 cp --only-show-errors .command.log s3://work/.command.log||true; exit $ret; }" EXIT; aws s3 cp --only-show-errors s3://work/.command.run - | bash 2>&1 | tee .command.log'
933+
result.join(' ') == 'bash -o pipefail -c trap "[[ -n \\$pid ]] && kill -TERM \\$pid" TERM; trap "{ ret=$?; aws s3 cp --only-show-errors .command.log s3://work/.command.log||true; exit $ret; }" EXIT; aws s3 cp --only-show-errors s3://work/.command.run - | bash > >(tee .command.log) 2>&1 & pid=$!; wait $pid'
934934

935935
when:
936936
result = handler.getSubmitCommand()
@@ -942,7 +942,7 @@ class AwsBatchTaskHandlerTest extends Specification {
942942
getStorageKmsKeyId() >> 'kms-key-123'
943943
}
944944
then:
945-
result.join(' ') == 'bash -o pipefail -c trap "{ ret=$?; aws s3 cp --only-show-errors --sse aws:kms --sse-kms-key-id kms-key-123 --debug .command.log s3://work/.command.log||true; exit $ret; }" EXIT; aws s3 cp --only-show-errors --sse aws:kms --sse-kms-key-id kms-key-123 --debug s3://work/.command.run - | bash 2>&1 | tee .command.log'
945+
result.join(' ') == 'bash -o pipefail -c trap "[[ -n \\$pid ]] && kill -TERM \\$pid" TERM; trap "{ ret=$?; aws s3 cp --only-show-errors --sse aws:kms --sse-kms-key-id kms-key-123 --debug .command.log s3://work/.command.log||true; exit $ret; }" EXIT; aws s3 cp --only-show-errors --sse aws:kms --sse-kms-key-id kms-key-123 --debug s3://work/.command.run - | bash > >(tee .command.log) 2>&1 & pid=$!; wait $pid'
946946
}
947947

948948
def 'should render submit command with s5cmd' () {
@@ -959,7 +959,7 @@ class AwsBatchTaskHandlerTest extends Specification {
959959
then:
960960
executor.getAwsOptions() >> Mock(AwsOptions) { getS5cmdPath() >> 's5cmd' }
961961
then:
962-
result.join(' ') == 'bash -o pipefail -c trap "{ ret=$?; s5cmd cp .command.log s3://work/.command.log||true; exit $ret; }" EXIT; s5cmd cat s3://work/.command.run | bash 2>&1 | tee .command.log'
962+
result.join(' ') == 'bash -o pipefail -c trap "[[ -n \\$pid ]] && kill -TERM \\$pid" TERM; trap "{ ret=$?; s5cmd cp .command.log s3://work/.command.log||true; exit $ret; }" EXIT; s5cmd cat s3://work/.command.run | bash > >(tee .command.log) 2>&1 & pid=$!; wait $pid'
963963

964964
when:
965965
result = handler.getSubmitCommand()
@@ -970,7 +970,7 @@ class AwsBatchTaskHandlerTest extends Specification {
970970
getStorageKmsKeyId() >> 'kms-key-123'
971971
}
972972
then:
973-
result.join(' ') == 'bash -o pipefail -c trap "{ ret=$?; s5cmd --debug cp --sse aws:kms --sse-kms-key-id kms-key-123 .command.log s3://work/.command.log||true; exit $ret; }" EXIT; s5cmd --debug cat s3://work/.command.run | bash 2>&1 | tee .command.log'
973+
result.join(' ') == 'bash -o pipefail -c trap "[[ -n \\$pid ]] && kill -TERM \\$pid" TERM; trap "{ ret=$?; s5cmd --debug cp --sse aws:kms --sse-kms-key-id kms-key-123 .command.log s3://work/.command.log||true; exit $ret; }" EXIT; s5cmd --debug cat s3://work/.command.run | bash > >(tee .command.log) 2>&1 & pid=$!; wait $pid'
974974

975975
}
976976

plugins/nf-amazon/src/test/nextflow/executor/AwsBatchExecutorTest.groovy

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,8 +127,8 @@ class AwsBatchExecutorTest extends Specification {
127127

128128
where:
129129
FUSION | DEFAULT_FS | S5CMD | TASK_DIR | EXPECTED
130-
false | false | false | 's3://foo/work/dir' | 'bash -o pipefail -c \'trap "{ ret=$?; aws s3 cp --only-show-errors .command.log s3://foo/work/dir/.command.log||true; exit $ret; }" EXIT; aws s3 cp --only-show-errors s3://foo/work/dir/.command.run - | bash 2>&1 | tee .command.log\''
131-
false | false | true | 's3://foo/work/dir' | 'bash -o pipefail -c \'trap "{ ret=$?; s5cmd cp .command.log s3://foo/work/dir/.command.log||true; exit $ret; }" EXIT; s5cmd cat s3://foo/work/dir/.command.run | bash 2>&1 | tee .command.log\''
130+
false | false | false | 's3://foo/work/dir' | 'bash -o pipefail -c \'trap "[[ -n \\$pid ]] && kill -TERM \\$pid" TERM; trap "{ ret=$?; aws s3 cp --only-show-errors .command.log s3://foo/work/dir/.command.log||true; exit $ret; }" EXIT; aws s3 cp --only-show-errors s3://foo/work/dir/.command.run - | bash > >(tee .command.log) 2>&1 & pid=$!; wait $pid\''
131+
false | false | true | 's3://foo/work/dir' | 'bash -o pipefail -c \'trap "[[ -n \\$pid ]] && kill -TERM \\$pid" TERM; trap "{ ret=$?; s5cmd cp .command.log s3://foo/work/dir/.command.log||true; exit $ret; }" EXIT; s5cmd cat s3://foo/work/dir/.command.run | bash > >(tee .command.log) 2>&1 & pid=$!; wait $pid\''
132132
and:
133133
true | false | false | '/fusion/work/dir' | 'bash /fusion/work/dir/.command.run'
134134
false | true | false | '/nfs/work/dir' | 'bash /nfs/work/dir/.command.run 2>&1 > /nfs/work/dir/.command.log'

0 commit comments

Comments
 (0)