-
Notifications
You must be signed in to change notification settings - Fork 375
AN-735 WDL 1.1: gpu requirement attribute #7821
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
e29f98a
f2d5310
97f5865
0cd11ef
313c6ad
c7f05a4
88f2dcd
8ac8582
e607ff6
52d9ad8
ddd3e87
c7cb6d7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,15 @@ | ||
| package cromwell.backend.validation | ||
|
|
||
| import wom.RuntimeAttributesKeys | ||
| import wom.values.WomBoolean | ||
|
|
||
| /** | ||
| * This runtime attribute indicates whether GPU resources are required for the job. If set to true, the backend | ||
| * must ensure that the execution environment has access to GPU resources; if not, the task should fail. | ||
| * Supported starting in WDL 1.1 | ||
| * https://github.com/openwdl/wdl/blob/wdl-1.1/SPEC.md#gpu | ||
| */ | ||
|
|
||
| object GpuRequiredValidation extends BooleanRuntimeAttributesValidation(RuntimeAttributesKeys.GpuRequiredKey) { | ||
| // Default for the attribute: a GPU is not required unless it is explicitly set to true. | ||
| val DefaultValue: WomBoolean = WomBoolean(false) | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,7 @@ | ||
| name: gpu_required_and_requested | ||
| testFormat: workflowsuccess | ||
| backends: [GCPBATCH, AWSBATCH] | ||
|
|
||
| files { | ||
| workflow: wdl/gpu_required_and_requested.wdl | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,6 @@ | ||
| name: gpu_required_not_requested | ||
| testFormat: workflowfailure | ||
|
|
||
| files { | ||
| workflow: wdl/gpu_required_not_requested.wdl | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,22 @@ | ||
| version development-1.1 | ||
|
|
||
| # Task that marks a GPU as required (`gpu: true`) and also requests one (`gpuCount: 1`). | ||
| task gpuTask { | ||
| command { | ||
| echo "I should have a GPU :)" | ||
| } | ||
| output { | ||
| String msg = read_string(stdout()) | ||
| } | ||
| runtime { | ||
| docker: "ubuntu:latest" | ||
| # `gpu: true` only declares the requirement; `gpuCount` is the attribute that requests a GPU. | ||
| gpu: true | ||
| gpuCount: 1 | ||
| } | ||
| } | ||
|
|
||
| # Runs the single GPU task and exposes its stdout message as the workflow output. | ||
| workflow wf_gpu_required_and_requested { | ||
| call gpuTask | ||
| output { | ||
| String out = gpuTask.msg | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,21 @@ | ||
| version development-1.1 | ||
|
|
||
| # Task that marks a GPU as required (`gpu: true`) but sets no other GPU-related runtime attribute. | ||
| task gpuTask { | ||
| command { | ||
| echo "No GPU :(" | ||
| } | ||
| output { | ||
| String msg = read_string(stdout()) | ||
| } | ||
| runtime { | ||
| docker: "ubuntu:latest" | ||
| # No gpuCount / gpuType / GPU machine type accompanies this flag, so no GPU is actually requested. | ||
| gpu: true | ||
| } | ||
| } | ||
|
|
||
| # Runs the single task that requires a GPU without requesting one, and exposes its stdout message. | ||
| workflow wf_gpu_required_not_requested { | ||
| call gpuTask | ||
| output { | ||
| String out = gpuTask.msg | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -40,6 +40,7 @@ Cromwell recognizes certain runtime attributes and has the ability to format the | |
| | [`maxRetries`](#maxretries) | ✅ | ✅ | | ✅ | ℹ️ Note 3 | | ||
| | [`continueOnReturnCode`](#continueonreturncode) | ✅ | ✅ | | ✅ | ℹ️ Note 3 | | ||
| | [`failOnStderr`](#failonstderr) | ✅ | ✅ | | ✅ | ℹ️ Note 3 | | ||
| | [`gpu`](#gpu) | ✅ | ✅ | ✅ | ✅ | ℹ️ Note 4 | | ||
|
|
||
|
|
||
| > **Note 1** | ||
|
|
@@ -53,6 +54,10 @@ Cromwell recognizes certain runtime attributes and has the ability to format the | |
| > **Note 3** | ||
| > | ||
| > The HPC [Shared Filesystem backend](/backends/HPC#shared-filesystem) (SFS) is fully configurable and any number of attributes can be exposed. Cromwell recognizes some of these attributes (`cpu`, `memory` and `docker`) and parses them into the attribute listed in the table which can be used within the HPC backend configuration. | ||
| > | ||
| > **Note 4** | ||
| > | ||
| > Supported starting in WDL 1.1 | ||
|
|
||
|
|
||
| ### Google Cloud Specific Attributes | ||
|
|
@@ -300,7 +305,23 @@ runtime { | |
| } | ||
| ``` | ||
|
|
||
| ### `gpu` | ||
| *Default: "false"* | ||
|
|
||
| If `true`, Cromwell will attempt to ensure that the task can run in an environment with GPU support. The task will | ||
| fail if Cromwell can't confirm that a GPU is available. This attribute is NOT required to be `true` to run a task with GPUs; it | ||
| merely adds a way to fast-fail tasks that are expected to run with GPUs but are not properly configured to do so. | ||
|
|
||
| - Google Cloud: Cromwell will attempt to examine other runtime attributes such as `gpuCount`, `gpuType`, `predefinedMachineType` to determine whether the task is configured to use a GPU, and fail the task if it is not. | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be good to include detail that answers the following user concern:
|
||
| - AWS Batch: Cromwell will attempt to examine other runtime attributes such as `gpuCount` to determine whether the task is configured to use a GPU, and fail the task if it is not. | ||
| - SFS: Cromwell is unable to confirm GPU availability, so tasks with `gpu: true` will always fail. | ||
| - TES: Cromwell is unable to confirm GPU availability, so tasks with `gpu: true` will always fail. | ||
|
|
||
| ``` | ||
| runtime { | ||
| gpu: true | ||
| } | ||
| ``` | ||
|
|
||
| ### `zones` | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -36,6 +36,13 @@ final case class GpuResource(gpuType: GpuType, gpuCount: Int Refined Positive) | |||||
|
|
||||||
| final case class MachineType(machineType: String) { | ||||||
| override def toString: String = machineType | ||||||
|
|
||||||
| // Name-based heuristic for whether this machine type comes with GPUs. | ||||||
| // This check is valid as of October 2025 | ||||||
| // https://docs.cloud.google.com/compute/docs/gpus | ||||||
| val supportsGpu: Boolean = { | ||||||
| // Lower-case once, with Locale.ROOT so the result does not depend on the JVM default locale | ||||||
| // (e.g. a Turkish locale lower-cases "I" to a dotless "ı", which would break the "nvidia" check). | ||||||
| val normalized = machineType.toLowerCase(java.util.Locale.ROOT) | ||||||
| // G-series names look like "g2-standard-4". "g1-small" has the same shape but is a legacy | ||||||
| // shared-core type without a GPU, so it must not count as GPU-capable. | ||||||
| val isGSeries = normalized.matches("^g[0-9]*-.*") && normalized != "g1-small" | ||||||
| normalized.contains("nvidia") || normalized.contains("gpu") || isGSeries | ||||||
| } | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. e.g.
Suggested change
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That machine
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh I see, I didn't think of that. Disregard if you don't feel it's necessary. |
||||||
| } | ||||||
|
|
||||||
| final case class GcpBatchRuntimeAttributes(cpu: Int Refined Positive, | ||||||
|
|
@@ -105,6 +112,12 @@ object GcpBatchRuntimeAttributes { | |||||
| runtimeConfig: Option[Config] | ||||||
| ): OptionalRuntimeAttributesValidation[Int Refined Positive] = GpuValidation.optional | ||||||
|
|
||||||
| /** | ||||||
| * Builds the validation for the `gpu` (GPU required) runtime attribute. | ||||||
| * The default comes from the backend runtime config when it provides one, | ||||||
| * otherwise from GpuRequiredValidation.DefaultValue. | ||||||
| */ | ||||||
| private def gpuRequiredValidation(runtimeConfig: Option[Config]): RuntimeAttributesValidation[Boolean] = { | ||||||
| val fallback = | ||||||
| GpuRequiredValidation.configDefaultWomValue(runtimeConfig).getOrElse(GpuRequiredValidation.DefaultValue) | ||||||
| GpuRequiredValidation.withDefault(fallback) | ||||||
| } | ||||||
|
|
||||||
| // As of WDL 1.1 these two are aliases of each other | ||||||
| private val dockerValidation: OptionalRuntimeAttributesValidation[Containers] = DockerValidation.instance | ||||||
| private val containerValidation: OptionalRuntimeAttributesValidation[Containers] = ContainerValidation.instance | ||||||
|
|
@@ -152,6 +165,7 @@ object GcpBatchRuntimeAttributes { | |||||
| .withValidation( | ||||||
| gpuCountValidation(runtimeConfig), | ||||||
| gpuTypeValidation(runtimeConfig), | ||||||
| gpuRequiredValidation(runtimeConfig), | ||||||
| cpuValidation(runtimeConfig), | ||||||
| cpuPlatformValidation(runtimeConfig), | ||||||
| machineTypeValidation(runtimeConfig), | ||||||
|
|
@@ -184,6 +198,8 @@ object GcpBatchRuntimeAttributes { | |||||
| RuntimeAttributesValidation.extractOption(checkpointFileValidationInstance.key, validatedRuntimeAttributes) | ||||||
|
|
||||||
| // GPU | ||||||
| lazy val gpuRequired: Boolean = RuntimeAttributesValidation | ||||||
| .extract(gpuRequiredValidation(runtimeAttrsConfig), validatedRuntimeAttributes) | ||||||
| lazy val gpuType: Option[GpuType] = RuntimeAttributesValidation | ||||||
| .extractOption(gpuTypeValidation(runtimeAttrsConfig).key, validatedRuntimeAttributes) | ||||||
| lazy val gpuCount: Option[Int Refined Positive] = RuntimeAttributesValidation | ||||||
|
|
@@ -200,6 +216,14 @@ object GcpBatchRuntimeAttributes { | |||||
| None | ||||||
| } | ||||||
|
|
||||||
| lazy val gpuRequested: Boolean = gpuResource.isDefined || machineType.exists(_.supportsGpu) | ||||||
|
|
||||||
| if (gpuRequired && !gpuRequested) { | ||||||
| throw new RuntimeException( | ||||||
| s"GPU is required for this task ('gpu' runtime attr is true) but no GPU resource was configured." | ||||||
| ) | ||||||
| } | ||||||
|
|
||||||
| val docker: String = Containers.extractContainer(validatedRuntimeAttributes) | ||||||
| val failOnStderr: Boolean = | ||||||
| RuntimeAttributesValidation.extract(failOnStderrValidation(runtimeAttrsConfig), validatedRuntimeAttributes) | ||||||
|
|
||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for adding that warning, this flag is confusing