forked from pytorch/PiPPy
-
Notifications
You must be signed in to change notification settings - Fork 0
269 lines (252 loc) · 12.8 KB
/
pippy_tests.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
name: PiPPy tests

# Run on every push to main, and on pull requests that touch the workflow
# itself, the library, tests, or examples. Docs and markdown-only changes
# are excluded; requirements.txt changes also trigger a run.
on:
  push:
    branches:
      - main
  pull_request:
    paths:
      - '.github/workflows/pippy**'
      - 'pippy/**'
      - 'test/**'
      - 'examples/**'
      - '!docs/**'
      - '!**.md'
      - 'requirements.txt'

concurrency:
  # Cancel CI on previous commit when a new commit is pushed to the same branch
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true
jobs:
  # CPU unit tests via pytest across supported Python versions.
  pytest_tests:
    runs-on: linux.4xlarge
    strategy:
      matrix:
        python-version: ["3.8", "3.9"]
    container:
      image: python:${{ matrix.python-version }}
    steps:
      # checkout@v2 runs on a deprecated Node.js runtime; v4 is the
      # currently maintained release.
      - uses: actions/checkout@v4
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install flake8 pytest pytest-cov pytest-xdist numpy
          if [ -f requirements.txt ]; then pip install -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html; fi
      - name: Install pavel's huggingface fork
        run: pip install git+https://github.com/huggingface/transformers.git@main sentencepiece six sacremoses
      - name: Install pippy
        # NOTE(review): `setup.py install` is deprecated by setuptools;
        # consider `python -m pip install .` — left unchanged here to avoid
        # altering CI behavior without a test run.
        run: python setup.py install
      - name: Test with pytest
        run: |
          pytest --cov=pippy --ignore=test/hf_test.py --ignore=test/test_fx.py --ignore=test/test_fx_experimental.py --ignore=test/fx test/

  # hf_model_tests:
  #   runs-on: linux.12xlarge
  #   strategy:
  #     matrix:
  #       python-version: ["3.9"]
  #       shard: ["0", "1", "2", "3", "4", "5", "6", "7"]
  #   container:
  #     image: python:${{ matrix.python-version }}
  #   steps:
  #     - uses: actions/checkout@v2
  #     - name: Install dependencies
  #       run: |
  #         python -m pip install --upgrade pip
  #         pip install flake8 pytest pytest-cov pytest-xdist pytest-shard numpy
  #         if [ -f requirements.txt ]; then pip install -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html; fi
  #     - name: Install pavel's huggingface fork
  #       run: pip install git+https://github.com/huggingface/transformers.git@main sentencepiece six sacremoses
  #     - name: Install pippy
  #       run: "python setup.py install"
  #     # Single thread to avoid OOM
  #     - name: Test forward only
  #       run: |
  #         pytest --shard-id=${{ matrix.shard }} --num-shards=8 -k 'not HFModelsForwardBackwardTest' -sv --cov=pippy test/hf_test.py
  #     - name: Test forward and backward
  #       run: |
  #         pytest --shard-id=${{ matrix.shard }} --num-shards=8 -k 'HFModelsForwardBackwardTest' -sv --cov=pippy test/hf_test.py

  # CPU integration tests: each local_test_* script is exercised across the
  # full replicate x schedule (x checkpoint, where supported) matrix.
  integration_test_cpu:
    runs-on: linux.4xlarge
    strategy:
      matrix:
        python-version: ["3.8", "3.9"]
        replicate: ["0", "1"]
        schedule: ["FillDrain", "1F1B"]
        checkpoint: ["0", "1"]
    env:
      OMP_NUM_THREADS: "1"
    container:
      image: python:${{ matrix.python-version }}
    steps:
      - uses: actions/checkout@v4
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install flake8 pytest pytest-cov numpy datasets evaluate scikit-learn sacrebleu
          if [ -f requirements.txt ]; then pip install -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html; fi
      - name: Install pavel's huggingface fork
        run: pip install git+https://github.com/huggingface/transformers.git@main sentencepiece six sacremoses
      - name: Install pippy
        run: python setup.py install
      - name: Run forward-only integration test
        run: python test/local_test_forward.py --replicate ${{ matrix.replicate }} -s ${{ matrix.schedule }} --checkpoint ${{ matrix.checkpoint }}
      - name: Run forward-only-auto-parallel integration test
        run: python test/local_test_forward_auto_parallel.py --replicate ${{ matrix.replicate }} -s ${{ matrix.schedule }} --checkpoint ${{ matrix.checkpoint }}
      - name: Run forward-loss-backward integration test
        run: python test/local_test_forward_backward.py --replicate ${{ matrix.replicate }} -s ${{ matrix.schedule }} --checkpoint ${{ matrix.checkpoint }}
      - name: Run null_coalesce_accumulate integration test
        run: python test/local_test_null_coalesce_accumulate.py --replicate ${{ matrix.replicate }} -s ${{ matrix.schedule }}
      - name: Run PP + DDP test
        run: python test/local_test_ddp.py --replicate ${{ matrix.replicate }} -s ${{ matrix.schedule }} --checkpoint ${{ matrix.checkpoint }}
      #- name: Run HF BERT forward-only integration test
      #  run: python test/local_test_forward_hf_bert.py --replicate ${{ matrix.replicate }} -s ${{ matrix.schedule }} --checkpoint ${{ matrix.checkpoint }}
      - name: Run HF GPT2 forward-only integration test
        run: python test/local_test_forward_hf_gpt2.py --replicate ${{ matrix.replicate }} -s ${{ matrix.schedule }} --checkpoint ${{ matrix.checkpoint }}
      - name: Run visualizer test
        run: python test/local_test_visualizer.py --replicate ${{ matrix.replicate }} -s ${{ matrix.schedule }}
      - name: Run auto-split test
        run: python test/local_test_autosplit.py --replicate ${{ matrix.replicate }} -s ${{ matrix.schedule }}
      - name: Run compile test
        run: python test/local_test_compile.py -s ${{ matrix.schedule }} --checkpoint ${{ matrix.checkpoint }}

  # hf_examples_set1:
  #   runs-on: linux.12xlarge
  #   strategy:
  #     matrix:
  #       python-version: ["3.9"]
  #       schedule: ["FillDrain", "1F1B"]
  #   env:
  #     OMP_NUM_THREADS: "1"
  #   container:
  #     image: python:${{ matrix.python-version }}
  #   steps:
  #     - uses: actions/checkout@v2
  #     - name: Install dependencies
  #       run: |
  #         python -m pip install --upgrade pip
  #         pip install flake8 pytest pytest-cov numpy datasets evaluate scikit-learn sacrebleu
  #         if [ -f requirements.txt ]; then pip install -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html; fi
  #     - name: Install pavel's huggingface fork
  #       run: pip install git+https://github.com/huggingface/transformers.git@main sentencepiece six sacremoses
  #     - name: Install pippy
  #       run: "python setup.py install"
  #     - name: Test min-GPT
  #       run: |
  #         git config --global --add safe.directory /__w/tau/tau
  #         git submodule update --init test/minGPT
  #         python test/min_gpt_tracing.py
  #     - name: Run GPT2 example
  #       run: python examples/hf/gpt2/pippy_gpt2.py -s ${{ matrix.schedule }}
  #     - name: Run BERT example
  #       run: python examples/hf/bert/pippy_bert.py -s ${{ matrix.schedule }}
  #     - name: Run T5 example
  #       run: python examples/hf/t5/pippy_t5.py -s ${{ matrix.schedule }}
  #     - name: "HF Translation: fine-tune T5 model translation English to Romanian"
  #       run: >
  #         python examples/hf/translation/run_translation.py --model_name_or_path t5-small --do_train --source_lang en --target_lang ro --source_prefix "translate English to Romanian: " --dataset_name wmt16 --dataset_config_name ro-en --output_dir /tmp/tst-translation --per_device_train_batch_size=8 --per_device_eval_batch_size=8 --overwrite_output_dir --predict_with_generate --max_steps=10 --dp_group_size=1 --pp_group_size=8
  #     - name: "HF Translation: fine-tune BART model translation English to Romanian"
  #       run: >
  #         python examples/hf/translation/run_translation.py --model_name_or_path facebook/bart-base --do_train --source_lang en --target_lang ro --source_prefix "translate English to Romanian: " --dataset_name wmt16 --dataset_config_name ro-en --output_dir /tmp/tst-translation --per_device_train_batch_size=8 --per_device_eval_batch_size=8 --overwrite_output_dir --predict_with_generate --max_steps=10 --dp_group_size=2 --pp_group_size=8

  # hf_examples_set2:
  #   runs-on: linux.12xlarge
  #   strategy:
  #     matrix:
  #       python-version: ["3.9"]
  #       schedule: ["FillDrain", "1F1B"]
  #   env:
  #     OMP_NUM_THREADS: "1"
  #   container:
  #     image: python:${{ matrix.python-version }}
  #   steps:
  #     - uses: actions/checkout@v2
  #     - name: Install dependencies
  #       run: |
  #         python -m pip install --upgrade pip
  #         pip install flake8 pytest pytest-cov numpy datasets evaluate scikit-learn sacrebleu
  #         if [ -f requirements.txt ]; then pip install -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html; fi
  #     - name: Install pavel's huggingface fork
  #       run: pip install git+https://github.com/huggingface/transformers.git@main sentencepiece six sacremoses
  #     - name: Install pippy
  #       run: "python setup.py install"
  #     - name: "HF Causal Language Modeling: fine-tune GPT-2 on WikiText-2"
  #       run: python examples/hf/language-modeling/run_clm.py --dp_group_size=2 --pp_group_size=8 --model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --per_device_train_batch_size 8 --per_device_eval_batch_size 8 --do_train --do_eval --output_dir /tmp/test-clm --max_steps=3 --overwrite_output_dir
  #     - name: "HF Masked Language Modeling: fine-tune RoBERTa on WikiText-2"
  #       run: python examples/hf/language-modeling/run_mlm.py --dp_group_size=2 --pp_group_size=8 --model_name_or_path roberta-base --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --per_device_train_batch_size 8 --per_device_eval_batch_size 8 --do_train --do_eval --output_dir /tmp/test-mlm --max_steps=3 --overwrite_output_dir
  #     - name: "HF Text classification: fine-tune BERT on the GLUE benchmark"
  #       run: python examples/hf/text-classification/run_glue.py --dp_group_size=2 --pp_group_size=8 --model_name_or_path bert-base-cased --task_name mrpc --do_train --do_eval --max_seq_length 128 --per_device_train_batch_size 32 --learning_rate 2e-5 --num_train_epochs 3 --output_dir /tmp/mrpc/ --max_steps=3 --overwrite_output_dir

  # GPU integration tests: run inside a CUDA container on a self-hosted GPU
  # runner; the matrix values are forwarded to the container via env vars.
  integration_test_gpu:
    runs-on: linux.16xlarge.nvidia.gpu
    strategy:
      matrix:
        python-version: ["3.8"]
        replicate: ["0", "1"]
        schedule: ["FillDrain", "1F1B"]
    env:
      DOCKER_IMAGE: qts8n/cuda-python:devel
      PIPPY_ROOT: /PiPPy
      OMP_NUM_THREADS: "1"
      REPLICATE: ${{ matrix.replicate }}
      SCHEDULE: ${{ matrix.schedule }}
    steps:
      - name: Clean working directory
        shell: bash
        run: |
          sudo rm -rf /home/ec2-user/actions-runner/_work/PiPPy/PiPPy/* || true
      - uses: actions/checkout@v4
      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
      - name: Pull Docker image
        run: |
          retry () {
              "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry docker pull "${DOCKER_IMAGE}"
      - name: Test docker run
        run: |
          set -x
          # shellcheck disable=SC2086,SC2090
          container_name=$(docker run \
            --gpus all \
            --shm-size=1g --ulimit memlock=-1 \
            -e OMP_NUM_THREADS \
            -e REPLICATE \
            -e SCHEDULE \
            --tty \
            --detach \
            -v "$(pwd):${PIPPY_ROOT}" \
            -w "${PIPPY_ROOT}" \
            "${DOCKER_IMAGE}"
          )
          # Run GPU tests and return error signal from docker
          docker exec -t -w "${PIPPY_ROOT}" "${container_name}" bash -c "bash .github/workflows/pippy_gpu_tests.sh; exit \$?"
      - name: Chown workspace
        if: always()
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd):${PIPPY_ROOT}" -w "${PIPPY_ROOT}" "${DOCKER_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Kill containers, clean up images
        if: always()
        run: |
          # ignore expansion of "docker ps -q" since it could be empty
          # shellcheck disable=SC2046
          docker stop $(docker ps -q) || true
          # Prune all of the docker images
          docker system prune -af

  # Smoke tests for the programming-model examples (Dynamo, GSPMD style).
  programming_model_tests:
    runs-on: linux.4xlarge
    strategy:
      matrix:
        python-version: ["3.9"]
    container:
      image: python:${{ matrix.python-version }}
    steps:
      - uses: actions/checkout@v4
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install numpy datasets evaluate scikit-learn sacrebleu
          if [ -f requirements.txt ]; then pip install --pre -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html; fi
      - name: Install pippy
        run: python setup.py install
      - name: Test PiPPy + Dynamo example
        run: python examples/TorchDynamo/pippy_dynamo.py
      - name: Run PiPPy in GSPMD style
        run: python examples/gspmd/pippy_gspmd.py