Skip to content

Commit dd72aaf

Browse files
author
Verdi March
committed
Slurm job template: how a job can probe instance topology and hostname-instanceid mappings
1 parent e87d5d1 commit dd72aaf

File tree

2 files changed

+210
-0
lines changed

2 files changed

+210
-0
lines changed
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
# Slurm job template to probe EC2 information
2+
3+
Usage: review and customize [job-template.sbatch](job-template.sbatch) to suit your needs.
4+
5+
Below is the sample output (trimmed) after customizing the template to run Megatron-LM, i.e.,
6+
replace the `srun -l /usr/bin/hostname` with the relevant parts from the [Megatron-LM
7+
example](../../3.test_cases/1.megatron-lm/).
8+
9+
```text
10+
...
11+
+ validate_ec2_same_spine
12+
++ lstopo_ec2
13+
++ INSTANCE_IDS=($(srun cat /sys/devices/virtual/dmi/id/board_asset_tag))
14+
+++ srun cat /sys/devices/virtual/dmi/id/board_asset_tag
15+
++ local INSTANCE_IDS
16+
++ aws ec2 describe-instance-topology --instance-ids i-1111111111example i-0000000000example
17+
+ local 'TOPO_JSON={
18+
"Instances": [
19+
{
20+
"InstanceId": "i-0000000000example",
21+
"InstanceType": "p4de.24xlarge",
22+
"NetworkNodes": [
23+
"nn-1111111111example",
24+
"nn-2222222222example",
25+
"nn-3333333333example"
26+
],
27+
"AvailabilityZone": "us-west-2b",
28+
"ZoneId": "usw2-az2"
29+
},
30+
{
31+
"InstanceId": "i-1111111111example",
32+
"InstanceType": "p4de.24xlarge",
33+
"NetworkNodes": [
34+
"nn-1111111111example",
35+
"nn-2222222222example",
36+
"nn-3333333333example"
37+
],
38+
"AvailabilityZone": "us-west-2b",
39+
"ZoneId": "usw2-az2"
40+
}
41+
]
42+
}'
43+
+ echo '{
44+
"Instances": [
45+
{
46+
"InstanceId": "i-0000000000example",
47+
"InstanceType": "p4de.24xlarge",
48+
"NetworkNodes": [
49+
"nn-1111111111example",
50+
"nn-2222222222example",
51+
"nn-3333333333example"
52+
],
53+
"AvailabilityZone": "us-west-2b",
54+
"ZoneId": "usw2-az2"
55+
},
56+
{
57+
"InstanceId": "i-1111111111example",
58+
"InstanceType": "p4de.24xlarge",
59+
"NetworkNodes": [
60+
"nn-1111111111example",
61+
"nn-2222222222example",
62+
"nn-3333333333example"
63+
],
64+
"AvailabilityZone": "us-west-2b",
65+
"ZoneId": "usw2-az2"
66+
}
67+
]
68+
}'
69+
{
70+
"Instances": [
71+
{
72+
"InstanceId": "i-0000000000example",
73+
"InstanceType": "p4de.24xlarge",
74+
"NetworkNodes": [
75+
"nn-1111111111example",
76+
"nn-2222222222example",
77+
"nn-3333333333example"
78+
],
79+
"AvailabilityZone": "us-west-2b",
80+
"ZoneId": "usw2-az2"
81+
},
82+
{
83+
"InstanceId": "i-1111111111example",
84+
"InstanceType": "p4de.24xlarge",
85+
"NetworkNodes": [
86+
"nn-1111111111example",
87+
"nn-2222222222example",
88+
"nn-3333333333example"
89+
],
90+
"AvailabilityZone": "us-west-2b",
91+
"ZoneId": "usw2-az2"
92+
}
93+
]
94+
}
95+
++ echo '{
96+
"Instances": [
97+
{
98+
"InstanceId": "i-0000000000example",
99+
"InstanceType": "p4de.24xlarge",
100+
"NetworkNodes": [
101+
"nn-1111111111example",
102+
"nn-2222222222example",
103+
"nn-3333333333example"
104+
],
105+
"AvailabilityZone": "us-west-2b",
106+
"ZoneId": "usw2-az2"
107+
},
108+
{
109+
"InstanceId": "i-1111111111example",
110+
"InstanceType": "p4de.24xlarge",
111+
"NetworkNodes": [
112+
"nn-1111111111example",
113+
"nn-2222222222example",
114+
"nn-3333333333example"
115+
],
116+
"AvailabilityZone": "us-west-2b",
117+
"ZoneId": "usw2-az2"
118+
}
119+
]
120+
}'
121+
++ grep '^ *"nn\-.................\"'
122+
++ sort -n
123+
++ uniq -c
124+
++ wc -l
125+
+ local UNIQ_NN=3
126+
+ echo Expected 3 nn ids, got 3 nn ids
127+
Expected 3 nn ids, got 3 nn ids
128+
+ [[ 3 -eq 3 ]]
129+
...
130+
+ srun -l bash -c 'echo "hostname <=> instance_id mapping: $(hostname) <=> $(cat /sys/devices/virtual/dmi/id/board_asset_tag)"'
131+
1: hostname <=> instance_id mapping: p4de-st-p4de-2 <=> i-0000000000example
132+
0: hostname <=> instance_id mapping: p4de-st-p4de-1 <=> i-1111111111example
133+
+ srun -l bash -c 'echo BEFORE: $(hostname) $(sudo lctl get_param llite.*.stats | grep write_bytes)'
134+
0: BEFORE: p4de-st-p4de-1 write_bytes 10361 samples [bytes] 1 2147479552 401369912220
135+
1: BEFORE: p4de-st-p4de-2
136+
...
137+
++ date
138+
+ BEGIN_TRAINING='Mon Apr 8 08:33:01 UTC 2024'
139+
+ SECONDS=0
140+
+ srun -l --container-image ... --container-mounts ... python -m torch.distributed.run ... /workspace/Megatron-LM/pretrain_gpt.py ...
141+
...
142+
++ date
143+
+ END_TRAINING='Mon Apr 8 09:02:25 UTC 2024'
144+
+ echo 'BEGIN_TRAINING: Mon Apr 8 08:33:01 UTC 2024'
145+
BEGIN_TRAINING: Mon Apr 8 08:33:01 UTC 2024
146+
+ echo 'END_TRAINING : Mon Apr 8 09:02:25 UTC 2024'
147+
END_TRAINING : Mon Apr 8 09:02:25 UTC 2024
148+
+ echo 'Elapsed: 29min 24sec'
149+
Elapsed: 29min 24sec
150+
+ srun -l bash -c 'echo AFTER: $(hostname) $(sudo lctl get_param llite.*.stats | grep write_bytes)'
151+
srun: Step created for StepId=191.4
152+
1: AFTER: p4de-st-p4de-2
153+
0: AFTER: p4de-st-p4de-1 write_bytes 11553 samples [bytes] 1 2147479552 775980517197
154+
```
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
#!/bin/bash
2+
3+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
# SPDX-License-Identifier: MIT-0
5+
6+
#SBATCH --nodes=2 # number of nodes to use
7+
8+
# Strict mode plus tracing: abort on a failing command (-e), on use of an unset
# variable (-u) and on a failing pipeline stage (pipefail); -x echoes every
# command to the job log before it runs.
set -exuo pipefail
9+
10+
11+
################################################################################
12+
## Preamble
13+
################################################################################
14+
#######################################
# Helper function to query the EC2 instance topology of the job's nodes.
# Reads each node's instance id via `srun cat` on the DMI board asset tag,
# then passes all collected ids to `aws ec2 describe-instance-topology`.
# Globals:   None
# Arguments: None
# Outputs:   whatever the aws CLI writes to stdout; diagnostics go to stderr
# Returns:   srun's status if srun fails, 1 if no instance id was collected,
#            otherwise the status of the aws CLI call
#######################################
lstopo_ec2() {
    local raw_ids id
    local -a INSTANCE_IDS=()

    # Declare first, assign second: `local var=$(cmd)` always succeeds and
    # would hide a failing srun.
    raw_ids=$(srun cat /sys/devices/virtual/dmi/id/board_asset_tag) || return

    # One id per output line; `read` trims surrounding whitespace and blank
    # lines are skipped, so no unquoted word splitting is needed.
    while read -r id; do
        if [[ -n "${id}" ]]; then
            INSTANCE_IDS+=("${id}")
        fi
    done <<<"${raw_ids}"

    # Never call the AWS API with an empty id list.
    if (( ${#INSTANCE_IDS[@]} == 0 )); then
        echo "lstopo_ec2: srun returned no instance ids" >&2
        return 1
    fi

    aws ec2 describe-instance-topology --instance-ids "${INSTANCE_IDS[@]}"
}
19+
20+
#######################################
# Are the SLURM_NNODES nodes of this job on the same network spine?
# Prints the topology JSON from lstopo_ec2, counts the distinct "nn-..."
# network node ids in it, and prints a warning when that count is not 3
# (the hard-coded expectation: instances that share all of their network
# nodes list the same three ids).
# The check is advisory only and never fails the job.
# Globals:   None
# Arguments: None
# Outputs:   topology JSON, the id count and any WARNING line, all to stdout
# Returns:   0
#######################################
validate_ec2_same_spine() {
    local TOPO_JSON UNIQ_NN

    # Assign separately from `local` so a failed query is detected instead of
    # being misreported below as "different network spine".
    if ! TOPO_JSON="$(lstopo_ec2)"; then
        echo "WARNING: unable to query ec2 instance topology, skipping network spine check..."
        return 0
    fi
    echo "${TOPO_JSON}"

    # Extract the ids themselves (-o) so indentation and trailing commas do not
    # affect uniqueness. `|| true`: grep exits 1 when nothing matches, which
    # under pipefail would otherwise abort the script.
    UNIQ_NN=$(grep -oE '"nn-[[:alnum:]]+"' <<<"${TOPO_JSON}" | sort -u | wc -l) || true

    echo "Expected 3 nn ids, got ${UNIQ_NN} nn ids"
    if (( UNIQ_NN != 3 )); then
        echo "WARNING: ec2 instances on different network spine..."
    fi
    return 0
}
29+
validate_ec2_same_spine
30+
31+
# Track instance ids, later on to view their CloudWatch metrics.
32+
srun -l bash -c "echo \"hostname <=> instance_id mapping: \$(hostname) <=> \$(cat /sys/devices/virtual/dmi/id/board_asset_tag)\""
33+
34+
# Track per-instance cumulative Lustre statistics. In this example, we only show the write_bytes.
35+
srun -l bash -c "echo BEFORE: \$(hostname) \$(sudo lctl get_param llite.*.stats | grep write_bytes)" || true
36+
37+
env
38+
39+
40+
################################################################################
41+
## Actual scripts. Below example runs /usr/bin/hostname on all allocated nodes.
42+
################################################################################
43+
# Record the wall-clock start time for the summary printed below.
BEGIN_TRAINING=$(date)
44+
# SECONDS is a bash built-in counter; assigning 0 makes later reads return the
# number of seconds elapsed since this point.
SECONDS=0
45+
# Placeholder workload: replace this srun with the real job step.
srun -l /usr/bin/hostname
46+
END_TRAINING=$(date)
47+
echo "BEGIN_TRAINING: ${BEGIN_TRAINING}"
48+
echo "END_TRAINING : ${END_TRAINING}"
49+
# Split the elapsed seconds into whole minutes and remaining seconds.
echo "Elapsed: $(($SECONDS / 60))min $(($SECONDS % 60))sec"
50+
51+
52+
################################################################################
53+
## Postamble
54+
################################################################################
55+
# Track per-instance cumulative Lustre statistics. In this example, we only show the write_bytes.
56+
srun -l bash -c "echo AFTER: \$(hostname) \$(sudo lctl get_param llite.*.stats | grep write_bytes)" || true

0 commit comments

Comments
 (0)