Skip to content

Commit aacd5b4

Browse files
authored
Merge pull request #551 from instructlab/new-ci-job
feat: add medium e2e CI job for each PR
2 parents e94f8ab + 1d6744c commit aacd5b4

File tree

6 files changed

+1047
-3
lines changed

6 files changed

+1047
-3
lines changed

.github/workflows/e2e-nvidia-l40s-x4-sdk.yml

Lines changed: 313 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,325 @@
33
name: E2E (NVIDIA L40S x4) SDK Test

on:
  # Run automatically only for pull requests against main that touch at
  # least one of the glob patterns listed under `paths`.
  pull_request:
    branches:
      - main
    paths:
      # note this should match the merging criteria in 'mergify.yml'
      - "**.py"
      - "tox.ini"
      - "pyproject.toml"
      - "requirements.txt"
      - "requirements-dev.txt"
      - "constraints-dev.txt"
      # This workflow itself. The file is stored with a `.yml` extension, so
      # the filter must use `.yml` for edits to the workflow to trigger a run.
      - ".github/workflows/e2e-nvidia-l40s-x4-sdk.yml"
  # Manual trigger from the Actions UI / API.
  workflow_dispatch:
    inputs:
      pr_or_branch:
        description: 'pull request number or branch name'
        required: true
        default: 'main'
25+
# One active run per workflow and PR number (falling back to the git ref when
# the event carries no PR number); a newer run cancels the one in progress.
concurrency:
  group: "${{ github.workflow }}-${{ github.event.number || github.ref }}"
  cancel-in-progress: true

# Workflow-wide environment defaults; individual steps may override them.
env:
  TMPDIR: "/home/tmp"
31+
1232
jobs:
  # Launches a g6e.12xlarge EC2 instance as a self-hosted runner, trying the
  # configured regions/subnets in turn, and exposes the runner label,
  # instance id and the region that was actually used to the jobs below.
  start-large-ec2-runner:
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
      ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
      ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
    steps:
      - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
        with:
          repository: instructlab/ci-actions
          # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents
          path: ci-actions
          ref: release-v0.1
          sparse-checkout: |
            actions/launch-ec2-runner-with-fallback

      - name: Launch EC2 Runner with Fallback
        id: launch-ec2-instance-with-fallback
        uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
        env:
          # Overrides the workflow-level TMPDIR for this step only.
          TMPDIR: "/tmp"
        with:
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          # JSON list of candidate regions with their subnets, AMI and
          # security group.
          regions_config: >
            [
              {
                "region": "us-east-2",
                "subnets": {
                  "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
                  "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
                  "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
                },
                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
              },
              {
                "region": "us-east-1",
                "subnets": {
                  "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
                  "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
                  "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
                  "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
                  "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
                  "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
                },
                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
              }
            ]
          try_spot_instance_first: false
          ec2_instance_type: g6e.12xlarge
          aws_resource_tags: >
            [
              {"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
            ]

  # Runs the SDK end-to-end test on the EC2 runner started above and uploads
  # the per-phase training logs as artifacts.
  e2e-medium-test:
    needs:
      - start-large-ec2-runner
    runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}

    permissions:
      pull-requests: write

    steps:
      - name: "Harden Runner"
        # v2.10.1
        # NOTE(review): confirm the pinned SHA really corresponds to this tag
        # (the same pin is used in every job of this workflow).
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Install Packages
        run: |
          cat /etc/os-release
          mkdir -p "${TMPDIR}"
          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel

      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
        with:
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      - name: Install dependent PRs if needed
        uses: depends-on/depends-on-action@61cb3f4a0e2c8ae4b90c9448dc57c7ba9ca24c35  # main
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

      # NOTE(review): this workflow is triggered only by `pull_request` and
      # `workflow_dispatch`, so this condition never matches and the step is
      # always skipped. Confirm whether `pull_request_target` is still
      # intended before changing or removing it.
      - name: Fetch and checkout PR
        if: ${{ github.event_name == 'pull_request_target' }}
        run: |
          git fetch origin pull/${{ github.event.number }}/head:pr-${{ github.event.number }}
          git checkout pr-${{ github.event.number }}

      - name: Update instructlab-training library
        # Extras specifiers are quoted so the shell can never glob-expand
        # the square brackets against files in the working directory.
        run: |
          export CUDA_HOME="/usr/local/cuda"
          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
          export PATH="$PATH:$CUDA_HOME/bin"
          nvidia-smi
          python3.11 -m venv --upgrade-deps venv
          . venv/bin/activate
          pip install instructlab
          pip install "instructlab[cuda]"
          pip install vllm
          python3.11 -m pip install packaging wheel setuptools-scm
          pip install .
          pip install ".[cuda]"
          python3.11 -m pip uninstall -y flash-attn
          python3.11 -m pip cache purge
          python3.11 -m pip install ninja
          MAX_JOBS=8 python3.11 -m pip install flash-attn --no-build-isolation

      - name: Check disk before tests
        run: |
          df -h

      # TODO: switch to downloading a ds rather than generating one
      # - name: Download SDG Dataset
      #   working-directory: ./training
      #   uses: actions/download-artifact@v4
      #   with:
      #     name: sdg-dataset.jsonl
      #     path: dataset

      # NOTE(review): the script below searches /tmp/ while the workflow sets
      # TMPDIR=/home/tmp; confirm the training logs really land under /tmp/.
      - name: Run e2e test
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          . venv/bin/activate
          ls scripts
          ls ./
          ./scripts/test-sdk.sh

          # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
          # and we know that it will be written into a directory created by `mktemp -d`.
          # Given this information, we can use the following command to find the file:
          log_files=$(find /tmp/ -name "training_params_and_metrics_global0.jsonl")
          phase_num=1;
          for log_file in $log_files; do
            mv "${log_file}" phase-${phase_num}-training-log.jsonl
            ((phase_num++))
          done

      - name: Check disk after tests
        run: |
          df -h

      - name: Upload training logs Phase 1
        uses: actions/upload-artifact@v4
        with:
          name: phase-1-training-log.jsonl
          path: ./phase-1-training-log.jsonl
          retention-days: 1
          overwrite: true

      - name: Upload training logs Phase 2
        uses: actions/upload-artifact@v4
        with:
          name: phase-2-training-log.jsonl
          path: ./phase-2-training-log.jsonl
          retention-days: 1
          overwrite: true

  # Always tears down the EC2 runner, even when the test job failed or was
  # cancelled, so the instance is not left running.
  stop-large-ec2-runner:
    needs:
      - start-large-ec2-runner
      - e2e-medium-test
    runs-on: ubuntu-latest
    if: ${{ always() }}
    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722  # v4.1.0
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          # Use the region reported by the start job: the launcher is given
          # two candidate regions, so a fixed default region may not be the
          # one that holds the instance to terminate.
          aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }}

      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2  # v2.3.9
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-large-ec2-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}

  # Downloads the training logs, renders loss graphs via
  # scripts/create-loss-graph.py and appends them to the job summary.
  # A failed graph upload only emits a warning (continue-on-error).
  loss-graphs:
    needs:
      - stop-large-ec2-runner
    runs-on: ubuntu-latest
    if: ${{ always() }}
    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722  # v4.1.0
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ vars.AWS_REGION }}

      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
        with:
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements-dev.txt

      - name: Download loss data Phase 1
        id: phase-1-download-logs
        uses: actions/download-artifact@v4
        with:
          name: phase-1-training-log.jsonl
          path: downloaded-data

      - name: Download loss data Phase 2
        id: phase-2-download-logs
        uses: actions/download-artifact@v4
        with:
          name: phase-2-training-log.jsonl
          path: downloaded-data

      # On pull_request events GITHUB_REF is refs/pull/<n>/merge and
      # github.sha is the merge commit, so the base branch is taken from
      # GITHUB_BASE_REF and the head SHA from the PR payload when available;
      # other events keep the previous ref/sha-derived values.
      - name: Try to upload Phase 1 to s3
        id: phase-1-upload-s3
        continue-on-error: true
        run: |
          python ./scripts/create-loss-graph.py \
            --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
            --output-file "./phase-1-test.md" \
            --phase "1" \
            --aws-region "${{ vars.AWS_REGION }}" \
            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
            --base-branch "${GITHUB_BASE_REF:-${GITHUB_REF##*/}}" \
            --head-sha "${{ github.event.pull_request.head.sha || github.sha }}" \
            --pr-number "${{ github.event.number }}" \
            --origin-repository "${{ github.repository }}"

      # Same base-branch / head-sha resolution as the Phase 1 step.
      - name: Try to upload Phase 2 to s3
        id: phase-2-upload-s3
        continue-on-error: true
        run: |
          python ./scripts/create-loss-graph.py \
            --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
            --output-file "./phase-2-test.md" \
            --phase "2" \
            --aws-region "${{ vars.AWS_REGION }}" \
            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
            --base-branch "${GITHUB_BASE_REF:-${GITHUB_REF##*/}}" \
            --head-sha "${{ github.event.pull_request.head.sha || github.sha }}" \
            --pr-number "${{ github.event.number }}" \
            --origin-repository "${{ github.repository }}"

      - name: Check Phase 1 S3 upload status for success
        if: steps.phase-1-upload-s3.outcome == 'success'
        run: |
          echo "Uploaded Phase 1 loss graph to S3."
          cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 2 S3 upload status for success
        if: steps.phase-2-upload-s3.outcome == 'success'
        run: |
          echo "Uploaded Phase 2 loss graph to S3."
          cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 1 S3 upload status for failure
        if: steps.phase-1-upload-s3.outcome == 'failure'
        run: |
          echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate."
          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 2 S3 upload status for failure
        if: steps.phase-2-upload-s3.outcome == 'failure'
        run: |
          echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"

0 commit comments

Comments
 (0)