diff --git a/.github/workflows/test-gb-25.yml b/.github/workflows/test-gb-25.yml
index e537d005f4..92894b2915 100644
--- a/.github/workflows/test-gb-25.yml
+++ b/.github/workflows/test-gb-25.yml
@@ -54,6 +54,7 @@ jobs:
           # - '0123456789abcdef0123456789abcdef01234567'
         reactant_commit:
           - 'main'
+          - 'wsmoses-patch-8'
 
     steps:
       - name: Check GPUs
@@ -136,7 +137,9 @@ jobs:
         uses: actions/checkout@v5
         with:
           repository: 'PRONTOLab/GB-25'
-          ref: ${{ matrix.gb25_commit }}
+          # NOTE(review): checkout is pinned to this branch instead of `matrix.gb25_commit`;
+          # restore the matrix reference before merging.
+          ref: 'mg/oom-reproducer'
           path: 'GB-25'
       - name: Set GB25_DIR
         # We have to use `${GITHUB_WORKSPACE}` instead of `github.workspace` because GitHub
@@ -183,17 +186,47 @@ jobs:
           julia --project --color=yes -e "using MPI; MPI.install_mpiexecjl(; destdir=\"${MPIEXECJL_DIR}\")"
           echo "${MPIEXECJL_DIR}" >> "${GITHUB_PATH}"
         working-directory: ${{ env.GB25_DIR }}
-      - name: Run GB-25 simulation
+      - name: Run GB-25 simulation (east-west)
         timeout-minutes: 60
         run: |
-          export XLA_FLAGS='--xla_dump_to=${{ env.GB25_DIR }}/xla_dump'
-          timeout --signal=TERM --verbose 59m mpiexecjl -np 1 julia --color=yes --project -O0 --startup-file=no --threads=16 --compiled-modules=strict sharding/sharded_baroclinic_instability_simulation_run.jl
+          export XLA_FLAGS='--xla_dump_to=${{ env.GB25_DIR }}/xla_dump_east_west'
+          timeout --signal=TERM --verbose 59m \
+            mpiexecjl -np 1 \
+            julia --color=yes --project -O0 --startup-file=no --threads=16 --compiled-modules=strict \
+            sharding/sharded_baroclinic_instability_simulation_run.jl \
+            --grid-x=6144 --grid-y=1536 --grid-z=4 --test-type=fill_east_west
         working-directory: ${{ env.GB25_DIR }}
-      - name: Test correctness in GB-25 code
-        timeout-minutes: 20
+      - name: Run GB-25 simulation (north-south)
+        # Run this variant (and the all-regions one below) even when an earlier variant fails,
+        # so one failing configuration does not hide the results of the others.
+        if: ${{ !cancelled() }}
+        timeout-minutes: 60
+        run: |
+          export XLA_FLAGS='--xla_dump_to=${{ env.GB25_DIR }}/xla_dump_north_south'
+          timeout --signal=TERM --verbose 59m \
+            mpiexecjl -np 1 \
+            julia --color=yes --project -O0 --startup-file=no --threads=16 --compiled-modules=strict \
+            sharding/sharded_baroclinic_instability_simulation_run.jl \
+            --grid-x=6144 --grid-y=1536 --grid-z=4 --test-type=fill_north_south
+        working-directory: ${{ env.GB25_DIR }}
+      - name: Run GB-25 simulation (all regions)
+        if: ${{ !cancelled() }}
+        timeout-minutes: 60
         run: |
-          timeout --signal=TERM --verbose 19m mpiexecjl -np 1 julia --color=yes --project -O0 --startup-file=no --threads=16 --compiled-modules=strict correctness/correctness_sharded_baroclinic_instability_simulation_run.jl
+          export XLA_FLAGS='--xla_dump_to=${{ env.GB25_DIR }}/xla_dump_all'
+          timeout --signal=TERM --verbose 59m \
+            mpiexecjl -np 1 \
+            julia --color=yes --project -O0 --startup-file=no --threads=16 --compiled-modules=strict \
+            sharding/sharded_baroclinic_instability_simulation_run.jl \
+            --grid-x=6144 --grid-y=1536 --grid-z=4
         working-directory: ${{ env.GB25_DIR }}
+      # NOTE(review): correctness step is disabled here — presumably temporary; confirm and
+      # re-enable before merging.
+      # - name: Test correctness in GB-25 code
+      #   timeout-minutes: 20
+      #   run: |
+      #     timeout --signal=TERM --verbose 19m mpiexecjl -np 1 julia --color=yes --project -O0 --startup-file=no --threads=16 --compiled-modules=strict correctness/correctness_sharded_baroclinic_instability_simulation_run.jl
+      #   working-directory: ${{ env.GB25_DIR }}
       - name: Upload MLIR and XLA modules
         uses: actions/upload-artifact@v4
         timeout-minutes: 10
@@ -211,7 +244,7 @@ jobs:
         if: ${{ !cancelled() }}
         with:
           name: 'simulation-xla-dump-${{ env.ARTIFACT_INDEX }}'
-          path: '${{ env.GB25_DIR }}/**/xla_dump'
+          path: '${{ env.GB25_DIR }}/**/xla_dump*'
           retention-days: 90
           overwrite: false
       - name: Upload XLA profiler traces