broadinstitute · jgainerdewar · Nov 7, 2025 · Oct 29, 2025 · Oct 29, 2025 · Oct 29, 2025
@@ -5,6 +5,7 @@
 ### Progress toward WDL 1.1 Support
 * WDL 1.1 support is in progress. Users that would like to try out the current partial support can do so by using WDL version `development-1.1`. In Cromwell 92, `development-1.1` has been enhanced to include:
     * Support for passthrough syntax for call inputs, e.g. `{ input: foo }` rather than `{ input: foo = foo }`.
+    * Support for the new boolean runtime attribute `gpu`, which tells the engine to require a GPU to be available in order to run the task. See [the attribute's docs](https://cromwell.readthedocs.io/en/develop/RuntimeAttributes/#gpu) for details.
 
 ### GPU changes on Google Cloud backend
 

@@ -170,6 +170,16 @@ trait BackendLifecycleActorFactory extends PlatformSpecific {
    */
   val dockerMirroring: Option[DockerMirroring] = None
 
+  /**
+   * Indicates whether this backend may be able to run jobs on GPU-enabled hardware. This supports the check needed
+   * for the WDL 1.1+ `gpu` runtime attribute, which requires failing a task if we can't confirm that a GPU is
+   * available in its execution environment.
+   *
+   * Defaults to `false`. Backends that are able to run jobs on GPU-enabled hardware should override this value to
+   * `true` and do their own checking that a given task is actually configured to use a GPU.
+   */
+  val gpuMayBeAvailable: Boolean = false
+
   /**
     * Allows Cromwell to self-identify which cloud it's running on for runtime attribute purposes
     */

@@ -0,0 +1,15 @@
+package cromwell.backend.validation
+
+import wom.RuntimeAttributesKeys
+import wom.values.WomBoolean
+
+/**
+ * Validation for the boolean `gpu` runtime attribute, which indicates whether GPU resources are required for the
+ * job. If set to true, the backend must ensure that the execution environment has access to GPU resources; if not,
+ * the task should fail.
+ *
+ * Supported starting in WDL 1.1:
+ * https://github.com/openwdl/wdl/blob/wdl-1.1/SPEC.md#gpu
+ */
+
+object GpuRequiredValidation extends BooleanRuntimeAttributesValidation(RuntimeAttributesKeys.GpuRequiredKey) {
+  // Fallback value: a GPU is not required unless the attribute is explicitly set to true.
+  val DefaultValue: WomBoolean = WomBoolean(false)
+}
@@ -0,0 +1,7 @@
+name: gpu_required_and_requested
+testFormat: workflowsuccess
+backends: [GCPBATCH, AWSBATCH]
+
+files {
+  workflow: wdl/gpu_required_and_requested.wdl
+}
@@ -0,0 +1,6 @@
+name: gpu_required_not_requested
+testFormat: workflowfailure
+
+files {
+  workflow: wdl/gpu_required_not_requested.wdl
+}
@@ -0,0 +1,22 @@
+version development-1.1
+
+task gpuTask {
+  command {
+    echo "I should have a GPU :)"
+  }
+  output {
+    String msg = read_string(stdout())
+  }
+  runtime {
+    docker: "ubuntu:latest"
+    gpu: true
+    gpuCount: 1
+  }
+}
+
+workflow wf_gpu_required_and_requested {
+  call gpuTask
+  output {
+     String out = gpuTask.msg
+  }
+}
@@ -0,0 +1,21 @@
+version development-1.1
+
+task gpuTask {
+  command {
+    echo "No GPU :("
+  }
+  output {
+    String msg = read_string(stdout())
+  }
+  runtime {
+    docker: "ubuntu:latest"
+    gpu: true
+  }
+}
+
+workflow wf_gpu_required_not_requested {
+  call gpuTask
+  output {
+     String out = gpuTask.msg
+  }
+}
@@ -40,6 +40,7 @@ Cromwell recognizes certain runtime attributes and has the ability to format the
 | [`maxRetries`](#maxretries)                     |   ✅   |      ✅       |           |     ✅     |         ℹ️ Note 3         |
 | [`continueOnReturnCode`](#continueonreturncode) |   ✅   |      ✅       |           |     ✅     |         ℹ️ Note 3         |
 | [`failOnStderr`](#failonstderr)                 |   ✅   |      ✅       |           |     ✅     |         ℹ️ Note 3         |
+| [`gpu`](#gpu)                                   |   ✅   |      ✅       |  ✅        |     ✅     |         ℹ️ Note 4         |
 
 
 > **Note 1**
@@ -53,6 +54,10 @@ Cromwell recognizes certain runtime attributes and has the ability to format the
 > **Note 3**
 > 
 > The HPC [Shared Filesystem backend](/backends/HPC#shared-filesystem) (SFS) is fully configurable and any number of attributes can be exposed. Cromwell recognizes some of these attributes (`cpu`, `memory` and `docker`) and parses them into the attribute listed in the table which can be used within the HPC backend configuration.
+> 
+> **Note 4**
+> 
+> Supported starting in WDL 1.1
 
 
 ### Google Cloud Specific Attributes
@@ -300,7 +305,23 @@ runtime {
 }
 ```
 
+### `gpu`
+*Default: "false"*
+
+If `true`, Cromwell will attempt to ensure that the task can run in an environment with GPU support. The task will be
+failed if Cromwell can't confirm a GPU is available. This attribute is NOT required to be `true` to run a task with GPUs;
+it merely adds a way to fast-fail tasks that are expected to run with GPUs but are not properly configured to do so.
+
+- Google Cloud: Cromwell will attempt to examine other runtime attributes such as `gpuCount`, `gpuType`, `predefinedMachineType` to determine whether the task is configured to use a GPU, and fail the task if it is not.
+- AWS Batch: Cromwell will attempt to examine other runtime attributes such as `gpuCount` to determine whether the task is configured to use a GPU, and fail the task if it is not.
+- SFS: Cromwell is unable to confirm GPU availability, so tasks with `gpu: true` will always fail.
+- TES: Cromwell is unable to confirm GPU availability, so tasks with `gpu: true` will always fail.
 
+```
+runtime {
+  gpu: true
+}
+```
 
 ### `zones`
 

@@ -3,6 +3,7 @@ package cromwell.engine.workflow.lifecycle.execution.job.preparation
 import _root_.wdl.draft2.model._
 import akka.actor.{ActorRef, FSM, Props}
 import cats.data.Validated.{Invalid, Valid}
+import cats.implicits.catsSyntaxValidatedId
 import common.exception.MessageAggregation
 import common.validation.ErrorOr
 import common.validation.ErrorOr.ErrorOr
@@ -153,9 +154,20 @@ class JobPreparationActor(workflowDescriptor: EngineWorkflowDescriptor,
     for {
       evaluatedInputs <- ErrorOr(resolveAndEvaluateInputs(jobKey, expressionLanguageFunctions, valueStore)).flatten
       runtimeAttributes <- prepareRuntimeAttributes(evaluatedInputs)
+      _ <- checkGpuRequirement(runtimeAttributes)
     } yield (evaluatedInputs, runtimeAttributes)
   }
 
+  /**
+   * Basic backend capability gate for the `gpu` runtime attribute.
+   *
+   * The check passes when the task does not ask for a GPU via `gpu`, OR when the backend reports that it may be able
+   * to provide GPUs. A backend that does support GPUs is expected to do its own, more specific checks that the task
+   * is configured appropriately to provision them.
+   *
+   * @param runtimeAttributes the evaluated runtime attributes for this job
+   * @return a valid unit when the job may proceed, otherwise a single-error invalid result naming the job
+   */
+  private def checkGpuRequirement(runtimeAttributes: Map[LocallyQualifiedName, WomValue]): ErrorOr[Unit] = {
+    // Only an explicit boolean `true` counts as a GPU requirement; anything else (absent, false, other type) does not.
+    val gpuRequired = runtimeAttributes.get(RuntimeAttributesKeys.GpuRequiredKey) match {
+      case Some(WomBoolean(true)) => true
+      case _ => false
+    }
+
+    if (gpuRequired && !factory.gpuMayBeAvailable)
+      s"GPU required for job ${jobKey.call.localName} via runtime attribute 'gpu', but GPU availability cannot be guaranteed by the backend.".invalidNel
+    else
+      Valid(())
+  }
+
   private def fetchDockerHashesIfNecessary(inputs: WomEvaluatedCallInputs,
                                            attributes: Map[LocallyQualifiedName, WomValue]
   ) = {

@@ -23,6 +23,7 @@ export AWS_SECRET_KEY=$(vault read -field=secret_key secret/dsde/cromwell/common
 # TODO (AN-710) Add back some of these tests (space, scatter, docker_hash_dockerhub, awswdlresultscopying etc.)
 # TODO (AN-710) tests that depend on continueOnReturnCode tests are failing:
 # (exit, valid_return_codes_and_continue_on_return_code, return_codes, globbingBehavior, failures.terminal_status)
+# TODO (AN-794) support GPU tests in AWS job queue (enables test gpu_required_and_requested)
 cromwell::build::run_centaur \
     -p 100 \
     -e localdockertest \
@@ -48,6 +49,7 @@ cromwell::build::run_centaur \
     -e cachewithinwf \
     -e failures.terminal_status \
     -e bad_file_string \
+    -e gpu_required_and_requested
 
 
 cromwell::build::generate_code_coverage

@@ -25,6 +25,7 @@ export AWS_SECRET_KEY=$(vault read -field=secret_key secret/dsde/cromwell/common
 # TODO (AN-710) Add back some of these tests (space, scatter, docker_hash_dockerhub, awswdlresultscopying etc.)
 # TODO (AN-710) tests that depend on continueOnReturnCode tests are failing:
 # (exit, valid_return_codes_and_continue_on_return_code, return_codes, globbingBehavior, failures.terminal_status)
+# TODO (AN-794) support GPU tests in AWS job queue (enables test gpu_required_and_requested)
 cromwell::build::run_centaur \
     -p 500 \
     -e localdockertest \
@@ -49,7 +50,8 @@ cromwell::build::run_centaur \
     -e globbingbehavior \
     -e cachewithinwf \
     -e failures.terminal_status \
-    -e bad_file_string
+    -e bad_file_string \
+    -e gpu_required_and_requested
 
 cromwell::build::generate_code_coverage
 

@@ -79,6 +79,8 @@ case class AwsBatchBackendLifecycleActorFactory(name: String, configurationDescr
 
   override val dockerMirroring: Option[DockerMirroring] = configuration.dockerMirroringOpt
 
+  override val gpuMayBeAvailable: Boolean = true
+
   override def workflowInitializationActorParams(workflowDescriptor: BackendWorkflowDescriptor,
                                                  ioActor: ActorRef,
                                                  calls: Set[CommandCallNode],

@@ -33,26 +33,24 @@ package cromwell.backend.impl.aws
 
 import cats.syntax.apply._
 import cats.syntax.validated._
-import com.typesafe.config.Config
+import com.typesafe.config.{Config, ConfigException, ConfigValueFactory}
 import common.validation.ErrorOr.ErrorOr
 import cromwell.backend.impl.aws.io.{AwsBatchVolume, AwsBatchWorkingDisk}
 import cromwell.backend.standard.StandardValidatedRuntimeAttributesBuilder
 import cromwell.backend.validation._
 import eu.timepit.refined.api.Refined
 import eu.timepit.refined.numeric.Positive
+import org.slf4j.{Logger, LoggerFactory}
+import wdl4s.parser.MemoryUnit
 import wom.RuntimeAttributesKeys
+import wom.RuntimeAttributesKeys.GpuKey
 import wom.format.MemorySize
-import wdl4s.parser.MemoryUnit
 import wom.types._
 import wom.values._
-import com.typesafe.config.{ConfigException, ConfigValueFactory}
 
+import scala.jdk.CollectionConverters._
 import scala.util.matching.Regex
-import org.slf4j.{Logger, LoggerFactory}
-import wom.RuntimeAttributesKeys.GpuKey
-
 import scala.util.{Failure, Success, Try}
-import scala.jdk.CollectionConverters._
 
 /**
  * Attributes that are provided to the job at runtime
@@ -154,6 +152,12 @@ object AwsBatchRuntimeAttributes {
     CpuValidation.instance
       .withDefault(CpuValidation.configDefaultWomValue(runtimeConfig) getOrElse CpuValidation.defaultMin)
 
+  /**
+   * Builds the validation for the boolean `gpu` runtime attribute.
+   *
+   * The default comes from the backend's runtime config when it supplies one, and otherwise falls back to
+   * `GpuRequiredValidation.DefaultValue`.
+   */
+  private def gpuRequiredValidation(runtimeConfig: Option[Config]): RuntimeAttributesValidation[Boolean] = {
+    val validation = GpuRequiredValidation
+    val fallback = validation.configDefaultWomValue(runtimeConfig).getOrElse(validation.DefaultValue)
+    validation.withDefault(fallback)
+  }
+
   private def gpuCountValidation(runtimeConfig: Option[Config]): RuntimeAttributesValidation[Int] =
     PosIntValidation(GpuKey).withDefault(
       PosIntValidation(GpuKey).configDefaultWomValue(runtimeConfig).getOrElse(WomInteger(0))
@@ -298,6 +302,7 @@ object AwsBatchRuntimeAttributes {
       .withValidation(
         cpuValidation(runtimeConfig),
         gpuCountValidation(runtimeConfig),
+        gpuRequiredValidation(runtimeConfig),
         disksValidation(runtimeConfig),
         zonesValidation(runtimeConfig),
         memoryValidation(runtimeConfig),
@@ -322,6 +327,7 @@ object AwsBatchRuntimeAttributes {
       .withValidation(
         cpuValidation(runtimeConfig),
         gpuCountValidation(runtimeConfig),
+        gpuRequiredValidation(runtimeConfig),
         disksValidation(runtimeConfig),
         zonesValidation(runtimeConfig),
         memoryValidation(runtimeConfig),
@@ -354,8 +360,17 @@ object AwsBatchRuntimeAttributes {
   ): AwsBatchRuntimeAttributes = {
     val cpu: Int Refined Positive =
       RuntimeAttributesValidation.extract(cpuValidation(runtimeAttrsConfig), validatedRuntimeAttributes)
+    lazy val gpuRequired: Boolean =
+      RuntimeAttributesValidation.extract(gpuRequiredValidation(runtimeAttrsConfig), validatedRuntimeAttributes)
     val gpuCount: Int =
       RuntimeAttributesValidation.extract(gpuCountValidation(runtimeAttrsConfig), validatedRuntimeAttributes)
+
+    if (gpuRequired && gpuCount == 0) {
+      throw new RuntimeException(
+        s"GPU is required for this task ('gpu' runtime attr is true) but no GPU resource was configured ('gpuCount' is 0)."
+      )
+    }
+
     val zones: Vector[String] = RuntimeAttributesValidation.extract(ZonesValidation, validatedRuntimeAttributes)
     val memory: MemorySize =
       RuntimeAttributesValidation.extract(memoryValidation(runtimeAttrsConfig), validatedRuntimeAttributes)

@@ -610,6 +610,32 @@ class AwsBatchRuntimeAttributesSpec extends AnyWordSpecLike with CromwellTimeout
       )
     }
 
+    "require GPU provisioning when GPU is required" in {
+      val runtimeAttributes = Map(
+        "docker" -> WomString("ubuntu:latest"),
+        "gpu" -> WomBoolean(true)
+      )
+      assertAwsBatchRuntimeAttributesFailedCreation(
+        runtimeAttributes,
+        "GPU is required for this task ('gpu' runtime attr is true) but no GPU resource was configured ('gpuCount' is 0)."
+      )
+    }
+
+    "accept a GPU count when GPU is required" in {
+      val runtimeAttributes = Map(
+        "docker" -> WomString("ubuntu:latest"),
+        "scriptBucketName" -> WomString("my-stuff"),
+        "gpu" -> WomBoolean(true),
+        "gpuCount" -> WomInteger(2)
+      )
+      val expectedRuntimeAttributes =
+        expectedDefaults.copy(gpuCount = 2)
+      assertAwsBatchRuntimeAttributesSuccessfulCreation(
+        runtimeAttributes,
+        expectedRuntimeAttributes
+      )
+    }
+
     // add tests for jobTimeout
 
     "missing or invalid action key result in an invalid awsBatchEvaluateOnExit" in {

@@ -140,6 +140,8 @@ class GcpBatchBackendLifecycleActorFactory(override val name: String,
     }
 
   override val dockerMirroring: Option[DockerMirroring] = batchAttributes.dockerMirroringOpt
+
+  override val gpuMayBeAvailable: Boolean = true
 }
 
 object GcpBatchBackendLifecycleActorFactory extends StrictLogging {

@@ -36,6 +36,13 @@ final case class GpuResource(gpuType: GpuType, gpuCount: Int Refined Positive)
 
 final case class MachineType(machineType: String) {
   override def toString: String = machineType
+
+  /**
+   * Best-effort, name-based guess at whether this machine type provides GPUs.
+   *
+   * Returns true when the name contains "nvidia" or "gpu", or starts with `g` or `a` followed by optional digits and
+   * a hyphen (for example `g2-standard-4` or `a2-highgpu-1g`). The comparison is case-insensitive and lowercases
+   * with `Locale.ROOT`, so the result does not depend on the JVM's default locale.
+   *
+   * This check is valid as of October 2025
+   * https://docs.cloud.google.com/compute/docs/gpus
+   */
+  val supportsGpu: Boolean = {
+    // Lowercase once, locale-independently (a Turkish default locale would turn "NVIDIA" into "nvıdıa").
+    val normalized = machineType.toLowerCase(java.util.Locale.ROOT)
+    normalized.contains("nvidia") ||
+    normalized.contains("gpu") ||
+    normalized.matches("^[ga][0-9]*-.*")
+  }
 }
 
 final case class GcpBatchRuntimeAttributes(cpu: Int Refined Positive,
@@ -105,6 +112,12 @@ object GcpBatchRuntimeAttributes {
     runtimeConfig: Option[Config]
   ): OptionalRuntimeAttributesValidation[Int Refined Positive] = GpuValidation.optional
 
+  /**
+   * Builds the validation for the boolean `gpu` runtime attribute.
+   *
+   * A default found in the backend's runtime config takes precedence; without one, the attribute defaults to
+   * `GpuRequiredValidation.DefaultValue`.
+   */
+  private def gpuRequiredValidation(runtimeConfig: Option[Config]): RuntimeAttributesValidation[Boolean] = {
+    val defaultValue =
+      GpuRequiredValidation.configDefaultWomValue(runtimeConfig).getOrElse(GpuRequiredValidation.DefaultValue)
+    GpuRequiredValidation.withDefault(defaultValue)
+  }
+
   // As of WDL 1.1 these two are aliases of each other
   private val dockerValidation: OptionalRuntimeAttributesValidation[Containers] = DockerValidation.instance
   private val containerValidation: OptionalRuntimeAttributesValidation[Containers] = ContainerValidation.instance
@@ -152,6 +165,7 @@ object GcpBatchRuntimeAttributes {
       .withValidation(
         gpuCountValidation(runtimeConfig),
         gpuTypeValidation(runtimeConfig),
+        gpuRequiredValidation(runtimeConfig),
         cpuValidation(runtimeConfig),
         cpuPlatformValidation(runtimeConfig),
         machineTypeValidation(runtimeConfig),
@@ -184,6 +198,8 @@ object GcpBatchRuntimeAttributes {
       RuntimeAttributesValidation.extractOption(checkpointFileValidationInstance.key, validatedRuntimeAttributes)
 
     // GPU
+    lazy val gpuRequired: Boolean = RuntimeAttributesValidation
+      .extract(gpuRequiredValidation(runtimeAttrsConfig), validatedRuntimeAttributes)
     lazy val gpuType: Option[GpuType] = RuntimeAttributesValidation
       .extractOption(gpuTypeValidation(runtimeAttrsConfig).key, validatedRuntimeAttributes)
     lazy val gpuCount: Option[Int Refined Positive] = RuntimeAttributesValidation
@@ -200,6 +216,14 @@ object GcpBatchRuntimeAttributes {
       None
     }
 
+    lazy val gpuRequested: Boolean = gpuResource.isDefined || machineType.exists(_.supportsGpu)
+
+    if (gpuRequired && !gpuRequested) {
+      throw new RuntimeException(
+        s"GPU is required for this task ('gpu' runtime attr is true) but no GPU resource was configured."
+      )
+    }
+
     val docker: String = Containers.extractContainer(validatedRuntimeAttributes)
     val failOnStderr: Boolean =
       RuntimeAttributesValidation.extract(failOnStderrValidation(runtimeAttrsConfig), validatedRuntimeAttributes)
-Original file line number
+Diff line change
@@ Expand Up @@
         }
       override val dockerMirroring: Option[DockerMirroring] = batchAttributes.dockerMirroringOpt
+      override val gpuMayBeAvailable: Boolean = true
     }
     object GcpBatchBackendLifecycleActorFactory extends StrictLogging {
@@ Expand Down @@