Skip to content

Commit

Permalink
ADLR/megatron-lm!2496 - ci: Move most of LTS tests to nightly
Browse files: browse the repository at this point in the history
  • Loading branch information
ko3n1g committed Jan 5, 2025
1 parent 30ffe88 commit 47b8470
Show file tree
Hide file tree
Showing 10 changed files with 862 additions and 241 deletions.
7 changes: 5 additions & 2 deletions tests/test_utils/python_scripts/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,12 @@ def flatten_products(
workload_manifest: jetclient.JETWorkloadManifest,
) -> jetclient.JETWorkloadManifest:
"""Flattens a nested dict of products"""

workload_manifest.products = [
dict(zip(inp.keys(), values))
for inp in workload_manifest.products
dict(**dict(zip(inp.keys(), values)), **{"test_case": product['test_case'][0]})
for product in workload_manifest.products
if "products" in product
for inp in product['products']
for values in itertools.product(*inp.values())
]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ def main(
f"--environment {test_case.spec.environment}",
f"--n-repeat {n_repeat}",
f"--time-limit {time_limit}",
f"--scope {scope}",
f"--test-case '{test_case.spec.test_case}'",
f"--container-tag {container_tag}",
f"--cluster {cluster}",
Expand Down
5 changes: 5 additions & 0 deletions tests/test_utils/python_scripts/launch_jet_workload.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def launch_and_wait_for_completion(
environment: str,
n_repeat: int,
time_limit: int,
scope: str,
container_image: Optional[str],
container_tag: str,
cluster: str,
Expand All @@ -63,6 +64,7 @@ def launch_and_wait_for_completion(
n_repeat=n_repeat,
time_limit=time_limit,
tag=tag,
scope=scope,
container_image=container_image,
container_tag=container_tag,
environment=environment,
Expand Down Expand Up @@ -160,6 +162,7 @@ def parse_finished_training(logs: List[str]) -> Optional[bool]:
)
@click.option("--n-repeat", required=False, default=1, type=int)
@click.option("--time-limit", required=False, default=1800, type=int)
@click.option("--scope", required=False, default="mr", type=str)
@click.option(
"--account",
required=False,
Expand Down Expand Up @@ -187,6 +190,7 @@ def main(
environment: str,
n_repeat: int,
time_limit: int,
scope: str,
account: str,
partition: Optional[str],
cluster: str,
Expand Down Expand Up @@ -236,6 +240,7 @@ def main(
environment=environment,
n_repeat=n_repeat,
time_limit=time_limit,
scope=scope,
container_image=container_image,
container_tag=container_tag,
cluster=cluster,
Expand Down
93 changes: 70 additions & 23 deletions tests/test_utils/recipes/bert.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ spec:
build: mcore-pyt-{environment}
gpus: 8
platforms: dgx_a100
time_limit:
n_repeat:
artifacts:
/workspace/data/bert_data: text/the_pile/bert_shard00
script: |-
Expand All @@ -30,26 +32,71 @@ spec:
bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
products:
- environment: [lts, dev]
scope: [mr]
time_limit: [1800]
n_repeat: [5]
test_case:
- bert_mr_mcore_tp2_pp2_dgx_a100_1N8G
- bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G
- bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G
- bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G
- bert_mr_tp1_pp4_vp2_dgx_a100_1N8G
- bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G
- bert_mr_tp2_pp2_dgx_a100_1N8G
- bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G
- environment: [lts, dev]
scope: [nightly]
n_repeat: [5]
time_limit: [3600]
test_case:
- bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2
- bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2
- bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1
- bert_nightly_dgx_a100_1N8G_tp1_pp2
- bert_nightly_dgx_a100_1N8G_tp4_pp1
- test_case: [bert_mr_mcore_tp2_pp2_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [bert_mr_tp1_pp4_vp2_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [bert_mr_tp2_pp2_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [bert_nightly_dgx_a100_1N8G_tp1_pp2]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [bert_nightly_dgx_a100_1N8G_tp4_pp1]
products:
- environment: [dev, lts]
scope: [nightly]
13 changes: 7 additions & 6 deletions tests/test_utils/recipes/gpt-modelopt.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ spec:
build: mcore-pyt-{environment}
nodes: 1
gpus: 2
platforms: dgx_a100
time_limit:
n_repeat:
artifacts:
/workspace/data/gpt3_data: text/the_pile/shard00
/workspace/checkpoints/teacher: model/gpt_dummy_pyt/ckpt/24.10.0_bf16_teacher
Expand All @@ -29,9 +32,7 @@ spec:
bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
products:
- scope: [nightly]
platforms: [dgx_a100]
time_limit: [1200]
environment: [lts, dev] # Disable dev for now
test_case:
- gpt3_nightly_mcore_te_tp2_pp1_modelopt_distill_resume
- test_case: [gpt3_nightly_mcore_te_tp2_pp1_modelopt_distill_resume]
products:
- environment: [dev, lts]
scope: [nightly]
22 changes: 13 additions & 9 deletions tests/test_utils/recipes/gpt-nemo.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ spec:
gpus: 8
platforms: dgx_a100
time_limit: 1800
scope: null
scope:
script: |-
ls
cd /opt/NeMo
Expand All @@ -30,11 +30,15 @@ spec:
bash /opt/megatron-lm/tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
products:
- environment: [dev]
scope: [mr]
n_repeat: [5]
test_case:
- gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G
- gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G
- gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_8experts_tp2_ep2_pp1_dgx_a100_1N8G

- test_case: [gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- test_case: [gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- test_case: [gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_8experts_tp2_ep2_pp1_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
Loading

0 comments on commit 47b8470

Please sign in to comment.