Skip to content

Commit e891361

Browse files
committed
Remove the VP_SIZE argument from tests that are not intended to use the interleaved PP (pipeline-parallel) schedule
Also, explicitly label the interleaved PP tests
1 parent 954a65b commit e891361

6 files changed

+46
-10
lines changed

.gitlab-ci.yml

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,20 @@ train.gpt3_core.345m_tp1_pp2_1node_50steps:
143143
TEST_LEVEL: L0
144144

145145
train.gpt3_core.345m_tp1_pp4_1node_50steps:
146+
<<: *selene-test-launcher
147+
variables:
148+
<<: [*VARS]
149+
RUN_MODEL: gpt3
150+
USE_TE: 0
151+
TP_SIZE: 1
152+
PP_SIZE: 4
153+
NUM_NODES: 1
154+
MAX_STEPS: 50
155+
USE_CORE: 1
156+
TIME_LIMIT: "20:00"
157+
TEST_LEVEL: L0
158+
159+
train.gpt3_core.345m_tp1_pp4_interleaved_1node_50steps:
146160
<<: *selene-test-launcher
147161
variables:
148162
<<: [*VARS]
@@ -181,7 +195,6 @@ train.gpt3_core.345m_tp1_pp4_1node_50steps_swiglu:
181195
USE_TE: 0
182196
TP_SIZE: 1
183197
PP_SIZE: 4
184-
VP_SIZE: 1
185198
NUM_NODES: 1
186199
MAX_STEPS: 50
187200
USE_CORE: 1
@@ -198,7 +211,6 @@ train.gpt3_core.345m_tp1_pp4_1node_50steps_disable_bias_linear:
198211
USE_TE: 0
199212
TP_SIZE: 1
200213
PP_SIZE: 4
201-
VP_SIZE: 1
202214
NUM_NODES: 1
203215
MAX_STEPS: 50
204216
USE_CORE: 1
@@ -215,7 +227,6 @@ train.gpt3_core.345m_tp1_pp4_1node_50steps_untie_embeddings_and_outputs:
215227
USE_TE: 0
216228
TP_SIZE: 1
217229
PP_SIZE: 4
218-
VP_SIZE: 1
219230
NUM_NODES: 1
220231
MAX_STEPS: 50
221232
USE_CORE: 1
@@ -232,7 +243,6 @@ train.gpt3_core.345m_tp1_pp4_1node_50steps_sequence_parallel:
232243
USE_TE: 0
233244
TP_SIZE: 1
234245
PP_SIZE: 4
235-
VP_SIZE: 1
236246
NUM_NODES: 1
237247
MAX_STEPS: 50
238248
USE_CORE: 1
@@ -284,6 +294,20 @@ train.gpt3.345m_tp1_pp2_1node_50steps:
284294
TEST_LEVEL: L0
285295

286296
train.gpt3.345m_tp1_pp4_1node_50steps:
297+
<<: *selene-test-launcher
298+
variables:
299+
<<: [*VARS]
300+
RUN_MODEL: gpt3
301+
USE_TE: 0
302+
TP_SIZE: 1
303+
PP_SIZE: 4
304+
NUM_NODES: 1
305+
MAX_STEPS: 50
306+
USE_CORE: 0
307+
TIME_LIMIT: "20:00"
308+
TEST_LEVEL: L0
309+
310+
train.gpt3.345m_tp1_pp4_interleaved_1node_50steps:
287311
<<: *selene-test-launcher
288312
variables:
289313
<<: [*VARS]
@@ -382,7 +406,6 @@ train.te_core_moe_gpt3.345m_tp2_pp2_2experts_1node_50steps:
382406
USE_TE: 0
383407
TP_SIZE: 2
384408
PP_SIZE: 2
385-
VP_SIZE: 1
386409
NUM_NODES: 1
387410
MAX_STEPS: 50
388411
USE_CORE: 1
@@ -399,7 +422,6 @@ train.te_core_moe_gpt3.345m_tp2_pp2_4experts2parallel_1node_50steps:
399422
USE_TE: 0
400423
TP_SIZE: 2
401424
PP_SIZE: 2
402-
VP_SIZE: 1
403425
NUM_NODES: 1
404426
MAX_STEPS: 50
405427
USE_CORE: 1
@@ -416,7 +438,6 @@ train.te_core_moe_gpt3.345m_tp2_pp1_4experts2parallel_1node_50steps:
416438
USE_TE: 0
417439
TP_SIZE: 2
418440
PP_SIZE: 1
419-
VP_SIZE: 1
420441
NUM_NODES: 1
421442
MAX_STEPS: 50
422443
USE_CORE: 1
@@ -433,7 +454,6 @@ train.moe_gpt3.345m_tp2_pp2_4experts_1node_50steps:
433454
USE_TE: 0
434455
TP_SIZE: 2
435456
PP_SIZE: 2
436-
VP_SIZE: 1
437457
NUM_NODES: 1
438458
MAX_STEPS: 50
439459
USE_CORE: 0
@@ -479,6 +499,18 @@ train.bert.345m_tp1_pp2_1node_50steps:
479499
TEST_LEVEL: L0
480500

481501
train.bert.345m_tp1_pp4_1node_50steps:
502+
<<: *selene-test-launcher
503+
variables:
504+
<<: [*VARS]
505+
RUN_MODEL: bert
506+
TP_SIZE: 1
507+
PP_SIZE: 4
508+
NUM_NODES: 1
509+
MAX_STEPS: 50
510+
TIME_LIMIT: "20:00"
511+
TEST_LEVEL: L0
512+
513+
train.bert.345m_tp1_pp4_interleaved_1node_50steps:
482514
<<: *selene-test-launcher
483515
variables:
484516
<<: [*VARS]

tests/functional_tests/shell_test_utils/run_selene_test_launcher_script.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@ if [[ $USE_CORE -eq 1 && $USE_TE -eq 1 ]]; then
2121
fi
2222

2323
# step 2 : SETTING RUN NAME
24-
RUN_NAME=${RUN_MODEL}_tp${TP_SIZE}_pp${PP_SIZE}_${NUM_NODES}nodes_${MAX_STEPS}steps
24+
if [[ -n $VP_SIZE ]]; then INTERLEAVED_STR="_interleaved"; else INTERLEAVED_STR=""; fi
25+
RUN_NAME=${RUN_MODEL}_tp${TP_SIZE}_pp${PP_SIZE}${INTERLEAVED_STR}_${NUM_NODES}nodes_${MAX_STEPS}steps
2526
if [[ $USE_TE == 1 ]]; then RUN_NAME=${RUN_NAME}_te_enabled; fi
2627
if [[ $USE_CORE == 1 ]]; then RUN_NAME=${RUN_NAME}_core_enabled; fi
2728
if [[ -n $METADATA ]]; then RUN_NAME=${RUN_NAME}_${METADATA}; fi
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54837, 10.54636, 10.55694, 10.54151, 10.53088, 10.48503, 10.46275, 10.31499, 10.17122, 9.97326]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [22606.0, 20619.0, 26292.0, 23607.0, 21666.0, 21672.0, 23313.0]}, "iteration_timing_avg": 0.9262994117647059}
1+
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.5414, 10.53988, 10.55513, 10.52847, 10.54297, 10.51657, 10.47015, 10.36882, 10.23301, 10.05128]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [26510.0, 16034.0, 24829.0, 21005.0, 20977.0, 19155.0, 18836.0]}, "iteration_timing_avg": 0.6206926470588235}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54837, 10.54636, 10.55694, 10.54151, 10.53088, 10.48503, 10.46275, 10.31499, 10.17122, 9.97326]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [22606.0, 20619.0, 26292.0, 23607.0, 21666.0, 21672.0, 23313.0]}, "iteration_timing_avg": 0.999115588235294}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"lm loss": {"start_step": 0, "end_step": 45, "step_interval": 5, "values": [10.7951, 10.84939, 10.87411, 10.83459, 10.82865, 10.78676, 10.56492, 10.57063, 10.48545]}, "num-zeros": {"start_step": 0, "end_step": 29, "step_interval": 5, "values": [2561.0, 2771.0, 2141.0, 2656.0, 2737.0, 2472.0]}, "iteration_timing_avg": 0.1285973333333333}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82096, 10.87358, 10.8827, 10.79796, 10.68762, 10.59849, 10.09941, 10.21477, 10.14024, 9.80787]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1500.0, 1792.0, 1899.0, 1853.0, 1884.0, 1847.0, 1596.0, 1783.0, 2314.0, 2349.0]}, "iteration_timing_avg": 0.12620382352941178}

0 commit comments

Comments
 (0)