Skip to content
Draft
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
f82f30e
Update the xla commit to experiment potential fix for concatenate on …
felixwqp Aug 15, 2025
94619f5
Use larger grid to trigger failure condition
giordano Aug 15, 2025
944c508
[CI] Use branch name instead of commit
giordano Aug 15, 2025
ae8e45b
[CI] Compare with default XLA commit
giordano Aug 15, 2025
f887a3d
Remove the grid override to bypass OOM
felixwqp Aug 16, 2025
931882d
Update test-gb-25.yml
wsmoses Aug 19, 2025
028e97f
big grid
wsmoses Aug 19, 2025
bb4be32
Fix setting larger grid
giordano Aug 19, 2025
c66d2fa
[CI] Go back to using default GB-25 and XLA commits
giordano Aug 19, 2025
4424832
Checkout GB-25 branch with reduced code
giordano Aug 23, 2025
a03626f
Skip correctness step
giordano Aug 23, 2025
0bd93ae
Add 'wsmoses-patch-7' to reactant_commit branch
wsmoses Sep 3, 2025
2f01821
Merge branch 'main' into wfelix_xla_dev
giordano Sep 3, 2025
62cc059
Merge branch 'main' into wfelix_xla_dev
giordano Sep 10, 2025
79bfbc1
Run with an even smaller kernel
giordano Sep 12, 2025
223057e
Change test type from fill_north_south to fill_east_west
giordano Sep 13, 2025
822c7e3
Comment out reactant_commit branch entry
giordano Sep 13, 2025
b7cf38e
Update test-gb-25.yml
giordano Sep 13, 2025
ea4169a
Update artifact path to include xla_dump wildcard
giordano Sep 13, 2025
7b28516
Merge branch 'main' into wfelix_xla_dev
wsmoses Sep 30, 2025
6caac71
Update reactant_commit branch in workflow file
wsmoses Sep 30, 2025
5fe17db
Add conditional step to check for GPUs
wsmoses Sep 30, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 22 additions & 8 deletions .github/workflows/test-gb-25.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ jobs:
# - '0123456789abcdef0123456789abcdef01234567'
reactant_commit:
- 'main'
- 'wsmoses-patch-8'


steps:
- name: Check GPUs
Expand Down Expand Up @@ -136,7 +138,7 @@ jobs:
uses: actions/checkout@v5
with:
repository: 'PRONTOLab/GB-25'
ref: ${{ matrix.gb25_commit }}
ref: 'mg/oom-reproducer'
path: 'GB-25'
- name: Set GB25_DIR
# We have to use `${GITHUB_WORKSPACE}` instead of `github.workspace` because GitHub
Expand Down Expand Up @@ -183,17 +185,29 @@ jobs:
julia --project --color=yes -e "using MPI; MPI.install_mpiexecjl(; destdir=\"${MPIEXECJL_DIR}\")"
echo "${MPIEXECJL_DIR}" >> "${GITHUB_PATH}"
working-directory: ${{ env.GB25_DIR }}
- name: Run GB-25 simulation
- name: Run GB-25 simulation (east-west)
timeout-minutes: 60
run: |
export XLA_FLAGS='--xla_dump_to=${{ env.GB25_DIR }}/xla_dump'
timeout --signal=TERM --verbose 59m mpiexecjl -np 1 julia --color=yes --project -O0 --startup-file=no --threads=16 --compiled-modules=strict sharding/sharded_baroclinic_instability_simulation_run.jl
export XLA_FLAGS='--xla_dump_to=${{ env.GB25_DIR }}/xla_dump_east_west'
timeout --signal=TERM --verbose 59m mpiexecjl -np 1 julia --color=yes --project -O0 --startup-file=no --threads=16 --compiled-modules=strict sharding/sharded_baroclinic_instability_simulation_run.jl --grid-x=6144 --grid-y=1536 --grid-z=4 --test-type=fill_east_west
working-directory: ${{ env.GB25_DIR }}
- name: Test correctness in GB-25 code
timeout-minutes: 20
- name: Run GB-25 simulation (north-south)
timeout-minutes: 60
run: |
export XLA_FLAGS='--xla_dump_to=${{ env.GB25_DIR }}/xla_dump_north_south'
timeout --signal=TERM --verbose 59m mpiexecjl -np 1 julia --color=yes --project -O0 --startup-file=no --threads=16 --compiled-modules=strict sharding/sharded_baroclinic_instability_simulation_run.jl --grid-x=6144 --grid-y=1536 --grid-z=4 --test-type=fill_north_south
working-directory: ${{ env.GB25_DIR }}
- name: Run GB-25 simulation (all regions)
timeout-minutes: 60
run: |
timeout --signal=TERM --verbose 19m mpiexecjl -np 1 julia --color=yes --project -O0 --startup-file=no --threads=16 --compiled-modules=strict correctness/correctness_sharded_baroclinic_instability_simulation_run.jl
export XLA_FLAGS='--xla_dump_to=${{ env.GB25_DIR }}/xla_dump_all'
timeout --signal=TERM --verbose 59m mpiexecjl -np 1 julia --color=yes --project -O0 --startup-file=no --threads=16 --compiled-modules=strict sharding/sharded_baroclinic_instability_simulation_run.jl --grid-x=6144 --grid-y=1536 --grid-z=4
working-directory: ${{ env.GB25_DIR }}
# - name: Test correctness in GB-25 code
# timeout-minutes: 20
# run: |
# timeout --signal=TERM --verbose 19m mpiexecjl -np 1 julia --color=yes --project -O0 --startup-file=no --threads=16 --compiled-modules=strict correctness/correctness_sharded_baroclinic_instability_simulation_run.jl
# working-directory: ${{ env.GB25_DIR }}
- name: Upload MLIR and XLA modules
uses: actions/upload-artifact@v4
timeout-minutes: 10
Expand All @@ -211,7 +225,7 @@ jobs:
if: ${{ !cancelled() }}
with:
name: 'simulation-xla-dump-${{ env.ARTIFACT_INDEX }}'
path: '${{ env.GB25_DIR }}/**/xla_dump'
path: '${{ env.GB25_DIR }}/**/xla_dump*'
retention-days: 90
overwrite: false
- name: Upload XLA profiler traces
Expand Down
Loading