NVIDIA · gagank1 · Oct 7, 2025 · Oct 7, 2025 · Oct 9, 2025 · Oct 9, 2025
@@ -18,3 +18,7 @@
 - name: ciflow:skip
   description: Skip all CI tests for this PR
   color: B60205  # Red
+
+- name: ciflow:multi-gpu
+  description: (Reserved for future use) Run all multi GPU tests (unit tests, slow tests) for bionemo2
+  color: 12F5AE  # Teal
@@ -30,7 +30,7 @@ Configure CI behavior by applying the relevant labels. By default, only basic un
 - [ciflow:all](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:all) - Run all tests (unit tests, slow tests, and notebooks) for bionemo2. This label can be used to enforce running tests for all bionemo2.
 - [ciflow:all-recipes](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:all-recipes) - Run tests for all recipes (under bionemo-recipes). This label can be used to enforce running tests for all recipes.
 
-Unit tests marked as `@pytest.mark.multi_gpu` or `@pytest.mark.distributed` are not run in the PR pipeline.
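+Multi-GPU tests (marked with `@pytest.mark.multi_gpu`) are not run in PR CI by default; they run on a PR only when it carries the `ciflow:multi-gpu` label (or `ciflow:all` for bionemo2 / `ciflow:all-recipes` for recipes). They run automatically in nightly builds after single-GPU tests pass.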
 
 For more details, see [CONTRIBUTING](CONTRIBUTING.md)
 

@@ -19,16 +19,29 @@
 # 2. pre-commit: Runs static code checks and linting
 # 3. get-pr-labels: Retrieves PR labels for conditional job execution
 # 4. build-bionemo-image: Builds Docker image (conditional on triggers/labels)
-# 5. run-tests: Runs unit tests (when image build succeeds)
-# 6. run-tests-slow: Runs slow tests (when image succeeds + ciflow:slow label OR schedule/merge_group/ciflow:all)
-# 7. run-tests-notebooks: Runs notebook tests (when image succeeds + ciflow:notebooks label OR schedule/merge_group/ciflow:all)
-# 8. verify-tests-status: Verifies all test jobs completed successfully
+# 5. run-tests-single-gpu: Runs fast single-GPU unit tests (when image build succeeds)
+# 6. run-tests-multi-gpu: Runs fast multi-GPU tests (conditional - see below)
+# 7. run-tests-slow-single-gpu: Runs slow single-GPU tests (when image succeeds + ciflow:slow/ciflow:all label OR schedule/merge_group)
+# 8. run-tests-slow-multi-gpu: Runs slow multi-GPU tests (conditional - see below)
+# 9. run-tests-notebooks: Runs notebook tests (when image succeeds + ciflow:notebooks/ciflow:all label OR schedule/merge_group)
+# 10. verify-tests-status: Verifies all test jobs completed successfully
 #
 # CONDITIONAL EXECUTION:
 # - build-bionemo-image runs on: schedule, ciflow:all label, (no ciflow:skip + modified files), (merge_group + modified files)
-# - run-tests runs when: build-bionemo-image succeeds
-# - run-tests-slow runs when: build-bionemo-image succeeds AND (schedule OR merge_group OR ciflow:all OR ciflow:slow)
+# - run-tests-single-gpu runs when: build-bionemo-image succeeds
+# - run-tests-multi-gpu runs when: build succeeds AND ((schedule AND single-gpu passes) OR (push AND (ciflow:all OR ciflow:multi-gpu)))
+# - run-tests-slow-single-gpu runs when: build-bionemo-image succeeds AND (schedule OR merge_group OR ciflow:all OR ciflow:slow)
+# - run-tests-slow-multi-gpu runs when: build succeeds AND ((schedule AND slow-single-gpu passes) OR (push AND (ciflow:all OR (ciflow:multi-gpu AND ciflow:slow))))
 # - run-tests-notebooks runs when: build-bionemo-image succeeds AND (schedule OR merge_group OR ciflow:all OR ciflow:notebooks)
+#
+# MULTI-GPU TEST EXECUTION:
+# Multi-GPU tests run in these situations:
+# - On schedule (nightly): if build succeeds AND corresponding single-GPU tests pass
+# - On PRs (push events): if build succeeds AND labels match:
+#   * Fast multi-GPU: ciflow:all OR ciflow:multi-gpu
+#   * Slow multi-GPU: ciflow:all OR (ciflow:multi-gpu AND ciflow:slow)
+# - NOT on merge_group or any other events
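+# Note: each multi-GPU job lists its single-GPU counterpart in `needs` and its `if` has no status-check
+# function, so on every event (push included) it starts only after that single-GPU job has succeeded.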
 
 name: "BioNeMo Framework CI"
 
@@ -206,7 +219,7 @@
           cache-to: ${{ steps.cache.outputs.cache-to }}
 
 
-  run-tests:
+  run-tests-single-gpu:
     needs:
       - build-bionemo-image
       - get-pr-labels
@@ -221,7 +234,7 @@
       - name: Checkout repository
         uses: actions/checkout@v4
 
-      - name: Run tests
+      - name: Run single-GPU tests
         # Tests in this stage generate code coverage metrics for the repository
         # Coverage data is uploaded to Codecov in subsequent stages
         env:
@@ -246,7 +259,45 @@
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
 
-  run-tests-slow:
+  run-tests-multi-gpu:
+    needs:
+      - build-bionemo-image
+      - run-tests-single-gpu
+      - get-pr-labels
+    runs-on: linux-amd64-gpu-rtxa6000-latest-2-nemo
+    container:
+      image: svcbionemo023/bionemo-framework:${{ github.run_id }}
+      credentials:
+        username: ${{ vars.DOCKER_USERNAME }}
+        password: ${{ secrets.DOCKER_PASSWORD }}
+    # Run multi-GPU tests ONLY when:
+    # 1. On schedule: if build succeeds AND single-GPU tests pass
+    # 2. On push: if build succeeds AND (ciflow:all OR ciflow:multi-gpu label)
+    # Do NOT run on merge_group or any other events
+    if: |
+        (needs.build-bionemo-image.result == 'success') &&
+        (
+          (
+            github.event_name == 'schedule' &&
+            needs.run-tests-single-gpu.result == 'success'
+          ) ||
+          (
+            contains(fromJSON(needs.get-pr-labels.outputs.labels || '[]'), 'ciflow:all') ||
+            contains(fromJSON(needs.get-pr-labels.outputs.labels || '[]'), 'ciflow:multi-gpu')
+          )
+        )
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Run multi-GPU tests
+        env:
+          BIONEMO_DATA_SOURCE: ngc
+        run: |
+          chmod +x ./ci/scripts/run_pytest_multigpu.sh
+          ./ci/scripts/run_pytest_multigpu.sh
+
+  run-tests-slow-single-gpu:
     needs:
       - build-bionemo-image
       - get-pr-labels
@@ -268,18 +319,58 @@
       - name: Checkout repository
         uses: actions/checkout@v4
 
-      - name: Run slow tests
+      - name: Run slow single-GPU tests
+        env:
+          BIONEMO_DATA_SOURCE: ngc
+        run: |
+          chmod +x ./ci/scripts/pytest_runner.sh
+          ./ci/scripts/pytest_runner.sh --no-nbval --only-slow --skip-multi-gpu --allow-no-tests
+
+  run-tests-slow-multi-gpu:
+    needs:
+      - build-bionemo-image
+      - run-tests-slow-single-gpu
+      - get-pr-labels
+    runs-on: linux-amd64-gpu-rtxa6000-latest-2-nemo
+    container:
+      image: svcbionemo023/bionemo-framework:${{ github.run_id }}
+      credentials:
+        username: ${{ vars.DOCKER_USERNAME }}
+        password: ${{ secrets.DOCKER_PASSWORD }}
+    # Run slow multi-GPU tests ONLY when:
+    # 1. On schedule: if build succeeds AND slow single-GPU tests pass
+    # 2. On push: if build succeeds AND (ciflow:all OR (ciflow:multi-gpu AND ciflow:slow))
+    # Do NOT run on merge_group or any other events
+    if: |
+        (needs.build-bionemo-image.result == 'success') &&
+        (
+          (
+            github.event_name == 'schedule' &&
+            needs.run-tests-slow-single-gpu.result == 'success'
+          ) ||
+          (
+            contains(fromJSON(needs.get-pr-labels.outputs.labels || '[]'), 'ciflow:all') ||
+            (
+              contains(fromJSON(needs.get-pr-labels.outputs.labels || '[]'), 'ciflow:multi-gpu') &&
+              contains(fromJSON(needs.get-pr-labels.outputs.labels || '[]'), 'ciflow:slow')
+            )
+          )
+        )
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
 
+      - name: Run slow multi-GPU tests
         env:
           BIONEMO_DATA_SOURCE: ngc
         # Not every sub-package has slow tests, and since some sub-packages have tests under the same name we need
         #  to run package by package like we do with the fast tests.
         run: |
-          chmod +x ./ci/scripts/run_pytest_slow.sh
-          ./ci/scripts/run_pytest_slow.sh
+          chmod +x ./ci/scripts/run_pytest_slow_multigpu.sh
+          ./ci/scripts/run_pytest_slow_multigpu.sh
 
 
   run-tests-notebooks:
    needs:
      - build-bionemo-image
      - get-pr-labels
@@ -317,8 +408,10 @@
       - pre-commit
       - get-pr-labels
       - build-bionemo-image
-      - run-tests
-      - run-tests-slow
+      - run-tests-single-gpu
+      - run-tests-multi-gpu
+      - run-tests-slow-single-gpu
+      - run-tests-slow-multi-gpu
       - run-tests-notebooks
       # Add all other run-*-test jobs
     runs-on: ubuntu-latest

@@ -1,3 +1,26 @@
+# BioNeMo Recipes CI Workflow
+#
+# This workflow runs tests for BioNeMo recipes and models on various triggers:
+#
+# TRIGGERS:
+# - Push to pull-request branches or dependabot branches
+# - Merge group events (when PRs are merged via merge queue)
+# - Scheduled runs (daily at 9 AM UTC)
+#
+# WORKFLOW OVERVIEW:
+# 1. changed-dirs: Detects which recipe/model directories have changed
+# 2. get-pr-labels: Retrieves PR labels for conditional job execution
+# 3. unit-tests-single-gpu: Runs single-GPU tests for changed directories
+# 4. unit-tests-multi-gpu: Runs multi-GPU tests (conditional - see below)
+# 5. verify-recipe-tests: Verifies all test jobs completed successfully
+#
+# MULTI-GPU TEST EXECUTION:
+# Multi-GPU tests run in these situations:
+# - On schedule (nightly): if changed dirs exist AND single-GPU tests pass
+# - On PRs (push events): if changed dirs exist AND (ciflow:all-recipes OR ciflow:multi-gpu)
+# - NOT on merge_group or any other events
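+# Note: unit-tests-multi-gpu lists unit-tests-single-gpu in `needs` and its `if` has no status-check
+# function, so on every event (push included) it starts only after the single-GPU tests have succeeded.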
+
 name: "BioNeMo Recipes CI"
 
 on:
@@ -127,11 +150,11 @@
           echo '${{ toJSON(steps.set-dirs.outputs) }}'
         shell: bash
 
-  unit-tests:
+  unit-tests-single-gpu:
     needs: changed-dirs
     runs-on: linux-amd64-gpu-l4-latest-1
     if: ${{ needs.changed-dirs.outputs.dirs != '[]' }}
-    name: "unit-tests (${{ matrix.recipe.name }})"
+    name: "unit-tests-single-gpu (${{ matrix.recipe.name }})"
     container:
       image: ${{ matrix.recipe.image }}
       options: --shm-size=16G
@@ -169,23 +192,120 @@
             exit 1
           fi
 
-      - name: Run tests
+      - name: Run single-GPU tests
+        working-directory: ${{ matrix.recipe.dir }}
+        run: pytest -v -m "not multi_gpu" .
+
+  # With copy-pr-bot, we need to get the PR labels from the PR API rather than from the event metadata.
+  # Exposes the labels as a JSON array in the `labels` output; refs outside `pull-request/` yield `[]`.
+  get-pr-labels:
+    runs-on: ubuntu-latest
+    outputs:
+      labels: ${{ steps.get-labels.outputs.labels || steps.get-labels-empty.outputs.labels }}
+    steps:
+      - name: Get PR number from branch
+        if: startsWith(github.ref, 'refs/heads/pull-request/')
+        id: get-pr-num
+        env:
+          # Hand the branch name to the script through the environment rather than expanding it inline:
+          # an inline expansion is pasted into the script text, so shell metacharacters in a ref name would run.
+          REF_NAME: ${{ github.ref_name }}
+        run: |
+          # The PR number is the trailing run of digits in the branch name.
+          PR_NUM=$(echo "$REF_NAME" | grep -oE '[0-9]+$')
+          echo "pr_num=$PR_NUM" >> "$GITHUB_OUTPUT"
+
+      - name: Get PR labels
+        id: get-labels
+        if: startsWith(github.ref, 'refs/heads/pull-request/')
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          # Fall back to an empty JSON list when the API call fails.
+          LABELS=$(gh api repos/${{ github.repository }}/pulls/${{ steps.get-pr-num.outputs.pr_num }} --jq '[.labels[].name]' || echo "[]")
+          echo "labels=$LABELS" >> "$GITHUB_OUTPUT"
+          echo "Retrieved labels: $LABELS"
+
+      - name: Set empty labels for non-PR branches
+        if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
+        id: get-labels-empty
+        run: |
+          echo "labels=[]" >> "$GITHUB_OUTPUT"
+          echo "Set empty labels for non-PR branch"
+
+  unit-tests-multi-gpu:
@@ -22,6 +22,9 @@
 # Note: On push, multi-GPU tests run in parallel with single-GPU tests (no dependency)
 name: "BioNeMo Recipes CI"
+permissions:
+  contents: read
+  pull-requests: read
 on:
  push:
@@ -22,6 +22,9 @@
 # Note: On push, multi-GPU tests run in parallel with single-GPU tests (no dependency)

 name: "BioNeMo Recipes CI"
+permissions:
+  contents: read
+  pull-requests: read

 on:
  push:
+    needs:
+      - changed-dirs
+      - unit-tests-single-gpu
+      - get-pr-labels
+    runs-on: linux-amd64-gpu-rtxa6000-latest-2-nemo
+    # Run multi-GPU tests ONLY when:
+    # 1. On schedule: if changed dirs exist AND single-GPU tests pass
+    # 2. On push: if changed dirs exist AND (ciflow:all-recipes OR ciflow:multi-gpu label)
+    # Do NOT run on merge_group or any other events
+    if: |
+      (needs.changed-dirs.outputs.dirs != '[]') &&
+      (
+        (
+          github.event_name == 'schedule' &&
+          needs.unit-tests-single-gpu.result == 'success'
+        ) ||
+        (
+          contains(fromJSON(needs.get-pr-labels.outputs.labels || '[]'), 'ciflow:all-recipes') ||
+          contains(fromJSON(needs.get-pr-labels.outputs.labels || '[]'), 'ciflow:multi-gpu')
+        )
+      )
+    name: "unit-tests-multi-gpu (${{ matrix.recipe.name }})"
+    container:
+      image: ${{ matrix.recipe.image }}
+      options: --shm-size=16G
+    strategy:
+      matrix:
+        recipe: ${{ fromJson(needs.changed-dirs.outputs.dirs) }}
+      fail-fast: false
+
+    steps:
+      - name: Show GPU info
+        run: nvidia-smi
+      - name: Setup proxy cache
+        uses: nv-gha-runners/setup-proxy-cache@main
+
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          sparse-checkout: "${{ matrix.recipe.dir }}"
+          sparse-checkout-cone-mode: false
+
+      - name: Install dependencies
+        working-directory: ${{ matrix.recipe.dir }}
+        run: |
+          if [ -f pyproject.toml ] || [ -f setup.py ]; then
+            PIP_CONSTRAINT= pip install -e .
+            echo "Installed ${{ matrix.recipe.dir }} as editable package"
+          elif [ -f requirements.txt ]; then
+            PIP_CONSTRAINT= pip install -r requirements.txt
+            echo "Installed ${{ matrix.recipe.dir }} from requirements.txt"
+          else
+            echo "No pyproject.toml, setup.py, or requirements.txt found in ${{ matrix.recipe.dir }}"
+            exit 1
+          fi
+
+      - name: Run multi-GPU tests
         working-directory: ${{ matrix.recipe.dir }}
-        run: pytest -v .
+        run: |
+          # Run multi-GPU tests, but allow exit code 5 (no tests found) to pass
+          pytest -v -m "multi_gpu" . || [ $? -eq 5 ]
 
   verify-recipe-tests:
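     # This job checks the result of every job listed in `needs` (changed-dirs, get-pr-labels, and the single- and
     # multi-GPU unit-test matrices) and fails if any of them failed or was cancelled; skipped jobs do not fail it.
     # Use this job as the required check for PRs.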
-    needs: unit-tests
+    needs:
+      - changed-dirs
+      - get-pr-labels
+      - unit-tests-single-gpu
+      - unit-tests-multi-gpu
     runs-on: ubuntu-latest
     if: always()
     steps:
       - name: Check unit-tests matrix status
         run: |
-          if [[ "${{ needs.unit-tests.result }}" == "failure" || "${{ needs.unit-tests.result }}" == "cancelled" ]]; then
+          if [[ "${{ contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') }}" == "true" ]]; then
             echo "Some unit-tests matrix jobs have failed or been cancelled!"
             exit 1
           else
            echo "All unit-tests matrix jobs have completed successfully or were skipped!"
            exit 0
          fi
@@ -27,4 +27,8 @@ package-dir = { "" = "src" }
 
 [tool.pytest.ini_options]
 addopts = ["--color=yes", "--strict-markers", "--tb=short", "-v"]
-markers = ["fp8: marks tests as requiring FP8 support"]
+markers = [
+    "fp8: marks tests as requiring FP8 support",
+    "slow: medium-complexity tests, like integration tests, on a single GPU",
+    "multi_gpu: tests that require multiple GPUs",
+]
@@ -27,3 +27,7 @@ package-dir = { "" = "src" }
 
 [tool.pytest.ini_options]
 addopts = ["--color=yes", "--strict-markers", "--tb=short", "-v"]
+markers = [
+    "slow: medium-complexity tests, like integration tests, on a single GPU",
+    "multi_gpu: tests that require multiple GPUs",
+]
@@ -61,6 +61,7 @@ def test_single_process_attaches_correct_fp8_recipe(strategy):
         pytest.fail(f"Command failed with exit code {result.returncode}")
 
 
+@pytest.mark.multi_gpu
 @pytest.mark.parametrize(
     "strategy", ["ddp", "fsdp2", pytest.param("mfsdp", marks=pytest.mark.xfail(reason="BIONEMO-2999"))]
 )

@@ -66,6 +66,7 @@ def test_ddp_vs_fsdp_single_gpu(strategy, backend):
         pytest.fail(f"Command failed with exit code {result.returncode}")
 
 
+@pytest.mark.multi_gpu
 @requires_multi_gpu
 @pytest.mark.parametrize("strategy", ["fsdp2", pytest.param("mfsdp", marks=pytest.mark.xfail(reason="BIONEMO-2726"))])
 @pytest.mark.parametrize("backend", ["te", "eager"])

@@ -83,6 +83,7 @@ python_classes = ["Test*"]
 python_functions = ["test_*"]
 markers = [
     "slow: marks tests as slow (deselect with '-m \"not slow\"')",
+    "multi_gpu: marks tests that require multiple GPUs (deselect with '-m \"not multi_gpu\"')",
     "integration: marks tests as integration tests",
     "unit: marks tests as unit tests",
 ]