Self-hosted runner (scheduled) #633
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Self-hosted runner (scheduled) | |
# Note that each job's dependencies go into a corresponding docker file. | |
# | |
# For example for `run_all_tests_torch_cuda_extensions_gpu` the docker image is | |
# `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at | |
# `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile` | |
on: | |
repository_dispatch: | |
schedule: | |
- cron: "0 2 * * *" | |
env: | |
HF_HOME: /mnt/cache | |
TRANSFORMERS_IS_CI: yes | |
OMP_NUM_THREADS: 8 | |
MKL_NUM_THREADS: 8 | |
RUN_SLOW: yes | |
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} | |
TF_FORCE_GPU_ALLOW_GROWTH: true | |
RUN_PT_TF_CROSS_TESTS: 1 | |
jobs: | |
check_runner_status: | |
name: Check Runner Status | |
runs-on: ubuntu-latest | |
steps: | |
- name: Checkout transformers | |
uses: actions/checkout@v2 | |
with: | |
fetch-depth: 2 | |
- name: Check Runner Status | |
run: python utils/check_self_hosted_runner.py --target_runners single-gpu-scheduled-ci-runner-docker,multi-gpu-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} | |
check_runners: | |
name: Check Runners | |
needs: check_runner_status | |
strategy: | |
matrix: | |
machine_type: [single-gpu, multi-gpu] | |
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} | |
container: | |
image: huggingface/transformers-all-latest-gpu | |
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | |
steps: | |
- name: NVIDIA-SMI | |
run: | | |
nvidia-smi | |
setup: | |
name: Setup | |
needs: check_runners | |
strategy: | |
matrix: | |
machine_type: [single-gpu, multi-gpu] | |
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} | |
container: | |
image: huggingface/transformers-all-latest-gpu | |
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | |
outputs: | |
matrix: ${{ steps.set-matrix.outputs.matrix }} | |
steps: | |
- name: Update clone | |
working-directory: /transformers | |
run: | | |
git fetch && git checkout ${{ github.sha }} | |
- name: Cleanup | |
working-directory: /transformers | |
run: | | |
rm -rf tests/__pycache__ | |
rm -rf tests/models/__pycache__ | |
rm -rf reports | |
- id: set-matrix | |
name: Identify models to test | |
working-directory: /transformers/tests | |
run: | | |
echo "::set-output name=matrix::$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')" | |
- name: NVIDIA-SMI | |
run: | | |
nvidia-smi | |
run_tests_single_gpu: | |
name: Model tests | |
strategy: | |
fail-fast: false | |
matrix: | |
folders: ${{ fromJson(needs.setup.outputs.matrix) }} | |
machine_type: [single-gpu] | |
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} | |
container: | |
image: huggingface/transformers-all-latest-gpu | |
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | |
needs: setup | |
steps: | |
- name: Echo folder ${{ matrix.folders }} | |
shell: bash | |
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to | |
# set the artifact folder names (because the character `/` is not allowed). | |
run: | | |
echo "${{ matrix.folders }}" | |
matrix_folders=${{ matrix.folders }} | |
matrix_folders=${matrix_folders/'models/'/'models_'} | |
echo "$matrix_folders" | |
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV | |
- name: Update clone | |
working-directory: /transformers | |
run: git fetch && git checkout ${{ github.sha }} | |
- name: NVIDIA-SMI | |
run: | | |
nvidia-smi | |
- name: Environment | |
working-directory: /transformers | |
run: | | |
python3 utils/print_env.py | |
- name: Run all tests on GPU | |
working-directory: /transformers | |
run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} | |
- name: Failure short reports | |
if: ${{ failure() }} | |
continue-on-error: true | |
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt | |
- name: Test suite reports artifacts | |
if: ${{ always() }} | |
uses: actions/upload-artifact@v2 | |
with: | |
name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports | |
path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} | |
run_tests_multi_gpu: | |
name: Model tests | |
strategy: | |
fail-fast: false | |
matrix: | |
folders: ${{ fromJson(needs.setup.outputs.matrix) }} | |
machine_type: [multi-gpu] | |
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} | |
container: | |
image: huggingface/transformers-all-latest-gpu | |
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | |
needs: setup | |
steps: | |
- name: Echo folder ${{ matrix.folders }} | |
shell: bash | |
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to | |
# set the artifact folder names (because the character `/` is not allowed). | |
run: | | |
echo "${{ matrix.folders }}" | |
matrix_folders=${{ matrix.folders }} | |
matrix_folders=${matrix_folders/'models/'/'models_'} | |
echo "$matrix_folders" | |
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV | |
- name: Update clone | |
working-directory: /transformers | |
run: git fetch && git checkout ${{ github.sha }} | |
- name: NVIDIA-SMI | |
run: | | |
nvidia-smi | |
- name: Environment | |
working-directory: /transformers | |
run: | | |
python3 utils/print_env.py | |
- name: Run all tests on GPU | |
working-directory: /transformers | |
run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} | |
- name: Failure short reports | |
if: ${{ failure() }} | |
continue-on-error: true | |
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt | |
- name: Test suite reports artifacts | |
if: ${{ always() }} | |
uses: actions/upload-artifact@v2 | |
with: | |
name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports | |
path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} | |
run_examples_gpu: | |
name: Examples directory | |
runs-on: [self-hosted, single-gpu-docker] | |
container: | |
image: huggingface/transformers-all-latest-gpu | |
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | |
needs: setup | |
steps: | |
- name: Update clone | |
working-directory: /transformers | |
run: git fetch && git checkout ${{ github.sha }} | |
- name: NVIDIA-SMI | |
run: | | |
nvidia-smi | |
- name: Environment | |
working-directory: /transformers | |
run: | | |
python3 utils/print_env.py | |
- name: Run examples tests on GPU | |
working-directory: /transformers | |
run: | | |
pip install -r examples/pytorch/_tests_requirements.txt | |
python3 -m pytest -v --make-reports=single-gpu_examples_gpu examples/pytorch | |
- name: Failure short reports | |
if: ${{ failure() }} | |
continue-on-error: true | |
run: cat /transformers/reports/single-gpu_examples_gpu/failures_short.txt | |
- name: Test suite reports artifacts | |
if: ${{ always() }} | |
uses: actions/upload-artifact@v2 | |
with: | |
name: single-gpu_run_examples_gpu | |
path: /transformers/reports/single-gpu_examples_gpu | |
run_pipelines_torch_gpu: | |
name: PyTorch pipelines | |
strategy: | |
fail-fast: false | |
matrix: | |
machine_type: [single-gpu, multi-gpu] | |
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} | |
container: | |
image: huggingface/transformers-pytorch-gpu | |
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | |
needs: setup | |
steps: | |
- name: Update clone | |
working-directory: /transformers | |
run: git fetch && git checkout ${{ github.sha }} | |
- name: NVIDIA-SMI | |
run: | | |
nvidia-smi | |
- name: Environment | |
working-directory: /transformers | |
run: | | |
python3 utils/print_env.py | |
- name: Run all pipeline tests on GPU | |
working-directory: /transformers | |
run: | | |
python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines | |
- name: Failure short reports | |
if: ${{ failure() }} | |
continue-on-error: true | |
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt | |
- name: Test suite reports artifacts | |
if: ${{ always() }} | |
uses: actions/upload-artifact@v2 | |
with: | |
name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu | |
path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu | |
run_pipelines_tf_gpu: | |
name: TensorFlow pipelines | |
strategy: | |
fail-fast: false | |
matrix: | |
machine_type: [single-gpu, multi-gpu] | |
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} | |
container: | |
image: huggingface/transformers-tensorflow-gpu | |
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | |
needs: setup | |
steps: | |
- name: Update clone | |
working-directory: /transformers | |
run: | | |
git fetch && git checkout ${{ github.sha }} | |
- name: NVIDIA-SMI | |
run: | | |
nvidia-smi | |
- name: Environment | |
working-directory: /transformers | |
run: | | |
python3 utils/print_env.py | |
- name: Run all pipeline tests on GPU | |
working-directory: /transformers | |
run: | | |
python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_tf_pipeline_gpu tests/pipelines | |
- name: Failure short reports | |
if: ${{ always() }} | |
run: | | |
cat /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu/failures_short.txt | |
- name: Test suite reports artifacts | |
if: ${{ always() }} | |
uses: actions/upload-artifact@v2 | |
with: | |
name: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu | |
path: /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu | |
run_all_tests_torch_cuda_extensions_gpu: | |
name: Torch CUDA extension tests | |
strategy: | |
fail-fast: false | |
matrix: | |
machine_type: [single-gpu, multi-gpu] | |
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} | |
needs: setup | |
container: | |
image: huggingface/transformers-pytorch-deepspeed-latest-gpu | |
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | |
steps: | |
- name: Update clone | |
working-directory: /workspace/transformers | |
run: git fetch && git checkout ${{ github.sha }} | |
- name: Remove cached torch extensions | |
run: rm -rf /github/home/.cache/torch_extensions/ | |
# To avoid unknown test failures | |
- name: Pre build DeepSpeed *again* | |
working-directory: /workspace | |
run: | | |
python3 -m pip uninstall -y deepspeed | |
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check | |
- name: NVIDIA-SMI | |
run: | | |
nvidia-smi | |
- name: Environment | |
working-directory: /workspace/transformers | |
run: | | |
python utils/print_env.py | |
- name: Run all tests on GPU | |
working-directory: /workspace/transformers | |
run: | | |
python -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended | |
- name: Failure short reports | |
if: ${{ failure() }} | |
continue-on-error: true | |
run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt | |
- name: Test suite reports artifacts | |
if: ${{ always() }} | |
uses: actions/upload-artifact@v2 | |
with: | |
name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports | |
path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu | |
send_results: | |
name: Send results to webhook | |
runs-on: ubuntu-latest | |
if: always() | |
needs: [ | |
check_runner_status, | |
check_runners, | |
setup, | |
run_tests_single_gpu, | |
run_tests_multi_gpu, | |
run_examples_gpu, | |
run_pipelines_tf_gpu, | |
run_pipelines_torch_gpu, | |
run_all_tests_torch_cuda_extensions_gpu | |
] | |
steps: | |
- name: Preliminary job status | |
shell: bash | |
# For the meaning of these environment variables, see the job `Setup` | |
run: | | |
echo "Runner availability: ${{ needs.check_runner_status.result }}" | |
echo "Runner status: ${{ needs.check_runners.result }}" | |
echo "Setup status: ${{ needs.setup.result }}" | |
- uses: actions/checkout@v2 | |
- uses: actions/download-artifact@v2 | |
- name: Send message to Slack | |
env: | |
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} | |
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} | |
CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} | |
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} | |
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} | |
CI_EVENT: scheduled | |
RUNNER_STATUS: ${{ needs.check_runner_status.result }} | |
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }} | |
SETUP_STATUS: ${{ needs.setup.result }} | |
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change | |
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. | |
run: | | |
pip install slack_sdk | |
python utils/notification_service.py "${{ needs.setup.outputs.matrix }}" |