name: 8 GPU Feature Tests

on:
  push:
    branches: [ main ]
    paths-ignore:
      - 'torchtitan/experiments/**'
  pull_request:
    paths-ignore:
      - 'torchtitan/experiments/**'
  schedule:
    # Runs every 6 hours
    - cron: '0 */6 * * *'
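
# One concurrency group per ref: a new push to the same PR or branch cancels the
# in-flight run, while pushes to main key the group on run_number so every run completes.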
concurrency:
  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true
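
# -l runs bash as a login shell (so the conda setup is sourced), -e aborts on the
# first failing command, and -o pipefail surfaces failures inside pipelines.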
defaults:
  run:
    shell: bash -l -eo pipefail {0}

permissions:
  id-token: write
  contents: read
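
# id-token: write enables the OIDC authentication the reusable test-infra
# workflow relies on; contents: read suffices for checking out the repo.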
jobs:
  build-test:
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    strategy:
      fail-fast: false
      matrix:
        include:
          - name: cuda
            runner: linux.g5.48xlarge.nvidia.gpu
            gpu-arch-type: cuda
            gpu-arch-version: "12.6"
            # This image is faster to clone than the default (1m25s vs 2m37s),
            # but it lacks CC, which triton needs.
            docker-image: torchtitan-ubuntu-20.04-clang12
            index-url: https://download.pytorch.org/whl/nightly/cu126
          - name: rocm
            runner: linux.rocm.gpu.gfx942.8
            gpu-arch-type: rocm
            gpu-arch-version: "7.0"
            docker-image: torchtitan-rocm-ubuntu-22.04-clang12
            index-url: https://download.pytorch.org/whl/nightly/rocm7.0
    with:
      runner: ${{ matrix.runner }}
      gpu-arch-type: ${{ matrix.gpu-arch-type }}
      gpu-arch-version: ${{ matrix.gpu-arch-version }}
      docker-image: ${{ matrix.docker-image }}
      repository: pytorch/torchtitan
      upload-artifact: outputs
      timeout: 45
      script: |
        set -eux

        # The generic Linux job chooses to use the base env, not the one set up by the image.
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"

        # Log the CUDA driver version for debugging (nvidia-smi is absent on
        # ROCm runners, hence the '|| true').
        DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
        echo "CUDA driver version: ${DRIVER_VERSION}"
        pip config --user set global.progress_bar off
        python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }}
        USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }}
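
        # Stage outputs under $RUNNER_TEMP; the reusable workflow uploads this
        # directory as the artifact named by upload-artifact.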
        sudo mkdir -p "$RUNNER_TEMP/artifacts-to-be-uploaded"
        sudo chown -R $(id -u):$(id -g) "$RUNNER_TEMP/artifacts-to-be-uploaded"
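
        # Tell the test suite when it is running on ROCm, then run the 8-GPU
        # "features" integration suite, dumping results into the staged directory.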
        export TEST_WITH_ROCM=$([[ "${{ matrix.gpu-arch-type }}" == "rocm" ]] && echo 1 || echo 0)
        python -m tests.integration_tests.run_tests --test_suite features $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 8

        # Verify accuracy parity: FSDP on 4 GPUs vs. HSDP (replicate=2, shard=2) + TP (degree=2) on 8 GPUs.
        echo "Checking FSDP4 vs. HSDP2FSDP2TP2 accuracy parity"
        export baseline_options="--parallelism.data_parallel_replicate_degree=1"
        export test_options="--parallelism.data_parallel_replicate_degree=2 --parallelism.tensor_parallel_degree=2"
        python3 scripts/loss_compare.py . . --baseline-options="${baseline_options}" --test-options="${test_options}" --job-dump-folder="${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs" --assert-equal --baseline-ngpus=4 --test-ngpus=8 --steps=1

        # Clean up the checkpoints so that we don't waste network bandwidth and time.
        rm -rf $RUNNER_TEMP/artifacts-to-be-uploaded/*/checkpoint
        rm -rf artifacts-to-be-uploaded/*/checkpoint