name: E2E (NVIDIA L40S x4) SDK Test

on:
  # only run on PRs that touch certain paths
  pull_request:
    branches:
      - main
    paths:
      # note this should match the merging criteria in 'mergify.yml'
      - "**.py"
      - "tox.ini"
      - "pyproject.toml"
      - "requirements.txt"
      - "requirements-dev.txt"
      - "constraints-dev.txt"
      - ".github/workflows/e2e-nvidia-l40s-x4-sdk.yaml" # This workflow
  workflow_dispatch:
    inputs:
      pr_or_branch:
        description: 'pull request number or branch name'
        required: true
        default: 'main'
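
# Cancel any in-progress run of this workflow for the same PR or branch when
# a new commit is pushed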
concurrency:
  group: ${{ github.workflow }}-${{ github.event.number || github.ref }}
  cancel-in-progress: true

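# Point TMPDIR at the instance's larger /home volume; the default /tmp on the
# CI AMI is assumed to be too small for the model and dataset downloads below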
env:
  TMPDIR: /home/tmp

jobs:
  start-large-ec2-runner:
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
      ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
      ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
    steps:
      - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          repository: instructlab/ci-actions
          # clone the "ci-actions" repo to a local directory called "ci-actions",
          # instead of overwriting the current WORKDIR contents
          path: ci-actions
          ref: release-v0.1
          sparse-checkout: |
            actions/launch-ec2-runner-with-fallback

      - name: Launch EC2 Runner with Fallback
        id: launch-ec2-instance-with-fallback
        uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
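        # Override the workflow-level TMPDIR: /home/tmp is not present on the
        # GitHub-hosted runner that launches the EC2 instance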
        env:
          TMPDIR: "/tmp"
        with:
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          regions_config: >
            [
              {
                "region": "us-east-2",
                "subnets": {
                  "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
                  "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
                  "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
                },
                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
              },
              {
                "region": "us-east-1",
                "subnets": {
                  "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
                  "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
                  "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
                  "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
                  "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
                  "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
                },
                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
              }
            ]
          try_spot_instance_first: false
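          # g6e.12xlarge provides 4x NVIDIA L40S GPUs, matching the workflow name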
          ec2_instance_type: g6e.12xlarge
          aws_resource_tags: >
            [
              {"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
            ]

  e2e-medium-test:
    needs:
      - start-large-ec2-runner
    runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}

    permissions:
      pull-requests: write

    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Install Packages
        run: |
          cat /etc/os-release
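          # create the workflow-level TMPDIR (/home/tmp) before later steps write to it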
          mkdir -p "${TMPDIR}"
          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel

      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      - name: Install dependent PRs if needed
        uses: depends-on/depends-on-action@61cb3f4a0e2c8ae4b90c9448dc57c7ba9ca24c35 # main
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Fetch and checkout PR
        if: ${{ github.event_name == 'pull_request_target' }}
        run: |
          git fetch origin pull/${{ github.event.number }}/head:pr-${{ github.event.number }}
          git checkout pr-${{ github.event.number }}

      - name: Update instructlab-training library
        run: |
          export CUDA_HOME="/usr/local/cuda"
          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
          export PATH="$PATH:$CUDA_HOME/bin"
          nvidia-smi
          python3.11 -m venv --upgrade-deps venv
          . venv/bin/activate
          pip install instructlab
          pip install instructlab[cuda]
          pip install vllm
          python3.11 -m pip install packaging wheel setuptools-scm
          pip install .
          pip install .[cuda]
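          # Rebuild flash-attn from source (no build isolation) against the CUDA
          # toolkit installed on this AMI; ninja parallelizes the compile and
          # MAX_JOBS caps the number of parallel jobs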
          python3.11 -m pip uninstall -y flash-attn
          python3.11 -m pip cache purge
          python3.11 -m pip install ninja
          MAX_JOBS=8 python3.11 -m pip install flash-attn --no-build-isolation

      - name: Check disk before tests
        run: |
          df -h

      # TODO: switch to downloading a ds rather than generating one
      # - name: Download SDG Dataset
      #   working-directory: ./training
      #   uses: actions/download-artifact@v4
      #   with:
      #     name: sdg-dataset.jsonl
      #     path: dataset

      - name: Run e2e test
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          . venv/bin/activate
          ls scripts
          ls ./
          ./scripts/test-sdk.sh

          # we know that the file will be named something like
          # f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
          # and we know that it will be written into a directory created by `mktemp -d`.
          # Given this information, we can use the following command to find the file:
          log_files=$(find /tmp/ -name "training_params_and_metrics_global0.jsonl")
          phase_num=1
          for log_file in $log_files; do
            mv "${log_file}" phase-${phase_num}-training-log.jsonl
            ((phase_num++))
          done

      - name: Check disk after tests
        run: |
          df -h

      - name: Upload training logs Phase 1
        uses: actions/upload-artifact@v4
        with:
          name: phase-1-training-log.jsonl
          path: ./phase-1-training-log.jsonl
          retention-days: 1
          overwrite: true

      - name: Upload training logs Phase 2
        uses: actions/upload-artifact@v4
        with:
          name: phase-2-training-log.jsonl
          path: ./phase-2-training-log.jsonl
          retention-days: 1
          overwrite: true

  stop-large-ec2-runner:
    needs:
      - start-large-ec2-runner
      - e2e-medium-test
    runs-on: ubuntu-latest
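    # Run even if the e2e job failed or was cancelled, so the EC2 instance is always terminated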
    if: ${{ always() }}
    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ vars.AWS_REGION }}

      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-large-ec2-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}

  loss-graphs:
    needs:
      - stop-large-ec2-runner
    runs-on: ubuntu-latest
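    # Run even if runner teardown failed, so loss graphs can still be generated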
    if: ${{ always() }}
    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ vars.AWS_REGION }}

      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements-dev.txt

      - name: Download loss data Phase 1
        id: phase-1-download-logs
        uses: actions/download-artifact@v4
        with:
          name: phase-1-training-log.jsonl
          path: downloaded-data

      - name: Download loss data Phase 2
        id: phase-2-download-logs
        uses: actions/download-artifact@v4
        with:
          name: phase-2-training-log.jsonl
          path: downloaded-data

      - name: Try to upload Phase 1 to s3
        id: phase-1-upload-s3
        continue-on-error: true
        run: |
          python ./scripts/create-loss-graph.py \
            --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
            --output-file "./phase-1-test.md" \
            --phase "1" \
            --aws-region "${{ vars.AWS_REGION }}" \
            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
            --base-branch "${GITHUB_REF##*/}" \
            --head-sha "${{ github.sha }}" \
            --pr-number "${{ github.event.number }}" \
            --origin-repository "${{ github.repository }}"

      - name: Try to upload Phase 2 to s3
        id: phase-2-upload-s3
        continue-on-error: true
        run: |
          python ./scripts/create-loss-graph.py \
            --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
            --output-file "./phase-2-test.md" \
            --phase "2" \
            --aws-region "${{ vars.AWS_REGION }}" \
            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
            --base-branch "${GITHUB_REF##*/}" \
            --head-sha "${{ github.sha }}" \
            --pr-number "${{ github.event.number }}" \
            --origin-repository "${{ github.repository }}"

      - name: Check Phase 1 S3 upload status for success
        if: steps.phase-1-upload-s3.outcome == 'success'
        run: |
          echo "Uploaded Phase 1 loss graph to S3."
          cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 2 S3 upload status for success
        if: steps.phase-2-upload-s3.outcome == 'success'
        run: |
          echo "Uploaded Phase 2 loss graph to S3."
          cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 1 S3 upload status for failure
        if: steps.phase-1-upload-s3.outcome == 'failure'
        run: |
          echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate."
          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 2 S3 upload status for failure
        if: steps.phase-2-upload-s3.outcome == 'failure'
        run: |
          echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"