Merge branch 'ko3n1g/ci/fix-inputs-to-nemo-ci' into 'main'

ci: nemo-ci inputs

See merge request ADLR/megatron-lm!2522
NVIDIA · Jan 8, 2025 · bafab5a
2 parents a26b93d + 67130c9
commit bafab5a
Show file tree

Hide file tree

Showing 4 changed files with 15 additions and 6 deletions.
diff --git a/.gitlab/stages/02.functional-tests.yml b/.gitlab/stages/02.functional-tests.yml
@@ -117,9 +117,9 @@ functional:run_nemo:
     variables: true
   variables:
     MCORE_COMMIT: $CI_COMMIT_SHA
-    TEST_LLM_MODULE: true
-    TEST_ALIGNER_MODULE: false
-    TEST_DATA_CURATOR_MODULE: false
+    TEST_LLM_MODULE: 'True'
+    TEST_ALIGNER_MODULE: 'False'
+    TEST_DATA_CURATOR_MODULE: 'False'
     TESTS_TO_RUN_ON_THIS_COMMIT: nightly
   rules:
     - if: $FUNCTIONAL_TEST == "yes"

diff --git a/...te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml b/...te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml
@@ -58,4 +58,5 @@ MODEL_ARGS:
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
   --attention-backend: unfused
+  --log-memory-to-tensorboard: true
 TEST_TYPE: regular
diff --git a/tests/test_utils/python_scripts/launch_jet_workload.py b/tests/test_utils/python_scripts/launch_jet_workload.py
@@ -259,11 +259,18 @@ def main(
                 jet_log = main_job.get_logs()
                 logs = extract_logs_to_string(logs=jet_log)
                 download_job_assets(logs=jet_log, iteration=n_iteration)
+                no_log = False
                 break
             except (requests.exceptions.ConnectionError, json.decoder.JSONDecodeError) as e:
-                print(e)
+                logger.error(e)
                 time.sleep((3**n_download_attempt) * 60)
                 n_download_attempt += 1
+            except KeyError as e:
+                logger.error(e)
+                no_log = True
+
+        if no_log:
+            continue
 
         concat_logs = "\n".join(logs)
         print(f"Logs:\n{concat_logs}")

diff --git a/tests/unit_tests/test_model_configs.py b/tests/unit_tests/test_model_configs.py
@@ -3,7 +3,7 @@
 import pytest
 import yaml
 
-YAML_DIR = pathlib.Path(__file__).parent / ".." / "tests/" / "functional_tests" / "test_cases"
+YAML_DIR = pathlib.Path(__file__).parent / ".." / "functional_tests" / "test_cases"
 
 
 def get_yaml_files(directory):
@@ -24,6 +24,7 @@ def load_yaml(file_path):
 @pytest.mark.parametrize("yaml_file", get_yaml_files(YAML_DIR))
 def test_model_config_tracks_memory(yaml_file, metric):
     """Test if each YAML file contains the required record."""
+    print("gpt3-nemo" in str(yaml_file) or "ckpt_converter" in str(yaml_file))
     if "gpt3-nemo" in str(yaml_file) or "ckpt_converter" in str(yaml_file):
         pytest.skip("Skipping for gpt-nemo")
 
@@ -33,4 +34,4 @@ def test_model_config_tracks_memory(yaml_file, metric):
         "MODEL_ARGS" in model_config
         and metric in model_config["MODEL_ARGS"]
         and model_config["MODEL_ARGS"][metric] is True
-    ), f"Please add {metric} to {yaml_file.parent.name}."
+    ), f"Please add argument `{metric}` to `{yaml_file.parent.name}/model_config.yaml` that its metric gets tracked."