From d65f7e6ce8516c0e2ead29097131cfd609412f55 Mon Sep 17 00:00:00 2001
From: Oliver Koenig
Date: Wed, 4 Dec 2024 09:41:04 -0800
Subject: [PATCH] ADLR/megatron-lm!2424 - ci: Fix notifications

---
 .gitlab/stages/01.test.yml                  |  11 +-
 .gitlab/stages/02.functional-tests.yml      |  18 +-
 .../shell_test_utils/notify.sh              | 198 ----------------
 .../shell_test_utils/notify_unit_tests.sh   | 179 ---------------
 .../{scripts => python_scripts}/common.py   |   0
 .../generate_jet_trigger_job.py             |   4 +-
 .../generate_local_jobs.py                  |   2 +-
 .../launch_jet_workload.py                  |   2 +-
 tests/test_utils/shell_scripts/notify.sh    | 215 ++++++++++++++++++
 unit-test-job-lts.yaml                      |  96 ++++----
 10 files changed, 277 insertions(+), 448 deletions(-)
 delete mode 100644 tests/functional_tests/shell_test_utils/notify.sh
 delete mode 100644 tests/functional_tests/shell_test_utils/notify_unit_tests.sh
 rename tests/test_utils/{scripts => python_scripts}/common.py (100%)
 rename tests/test_utils/{scripts => python_scripts}/generate_jet_trigger_job.py (97%)
 rename tests/test_utils/{scripts => python_scripts}/generate_local_jobs.py (97%)
 rename tests/test_utils/{scripts => python_scripts}/launch_jet_workload.py (99%)
 create mode 100644 tests/test_utils/shell_scripts/notify.sh

diff --git a/.gitlab/stages/01.test.yml b/.gitlab/stages/01.test.yml
index e6e97a8106..47fc43283d 100644
--- a/.gitlab/stages/01.test.yml
+++ b/.gitlab/stages/01.test.yml
@@ -105,7 +105,7 @@ test:unit_tests_configure:
       H100_CLUSTER=$([[ "$FUNCTIONAL_TEST_CLUSTER_H100" != "" ]] && echo $FUNCTIONAL_TEST_CLUSTER_H100 || echo $DEFAULT_H100_CLUSTER)
     - |
       export PYTHONPATH=$(pwd)
-      python tests/test_utils/scripts/generate_jet_trigger_job.py \
+      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
         --scope "unit-tests" \
         --environment lts \
         --n-repeat "${UNIT_TEST_REPEAT}" \
@@ -120,7 +120,7 @@ test:unit_tests_configure:
         --output-path "unit-test-job-lts-legacy.yaml"
     - |
       export PYTHONPATH=$(pwd)
-      python tests/test_utils/scripts/generate_jet_trigger_job.py \
+      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
         --scope "unit-tests" \
         --environment lts \
         --n-repeat "${UNIT_TEST_REPEAT}" \
@@ -135,7 +135,7 @@ test:unit_tests_configure:
         --output-path "unit-test-job-lts-latest.yaml"
     - |
       export PYTHONPATH=$(pwd)
-      python tests/test_utils/scripts/generate_jet_trigger_job.py \
+      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
         --scope "unit-tests" \
         --environment dev \
         --n-repeat "${UNIT_TEST_REPEAT}" \
@@ -150,7 +150,7 @@ test:unit_tests_configure:
         --output-path "unit-test-job-dev-legacy.yaml"
     - |
       export PYTHONPATH=$(pwd)
-      python tests/test_utils/scripts/generate_jet_trigger_job.py \
+      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
         --scope "unit-tests" \
         --environment dev \
         --n-repeat "${UNIT_TEST_REPEAT}" \
@@ -239,8 +239,9 @@ test:notify_unit_tests:
     - export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK}
    - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE}
    - export GITLAB_ENDPOINT
+    - export CONTEXT="unit-tests-extended"
    - export DATE=$(date +"%Y-%m-%d")
-    - bash tests/functional_tests/shell_test_utils/notify_unit_tests.sh ${CI_PIPELINE_ID}
+    - bash tests/test_utils/shell_scripts/notify.sh ${CI_PIPELINE_ID} "test:unit_tests_pyt"
   artifacts:
     when: always
     paths:
diff --git a/.gitlab/stages/02.functional-tests.yml b/.gitlab/stages/02.functional-tests.yml
index 88dde9a109..a128345c28 100644
--- a/.gitlab/stages/02.functional-tests.yml
+++ b/.gitlab/stages/02.functional-tests.yml
@@ -49,7 +49,7 @@ functional:configure:
       fi
     - |
       export PYTHONPATH=$(pwd)
-      python tests/test_utils/scripts/generate_jet_trigger_job.py \
+      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
        --scope $FUNCTIONAL_TEST_SCOPE \
        --environment dev \
        --n-repeat "$FUNCTIONAL_TEST_REPEAT" \
@@ -64,7 +64,7 @@ functional:configure:
        ${RELEASE_ARGS[@]}
     - |
       export PYTHONPATH=$(pwd)
-      python tests/test_utils/scripts/generate_jet_trigger_job.py \
+      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
        --scope $FUNCTIONAL_TEST_SCOPE \
        --environment lts \
        --n-repeat "$FUNCTIONAL_TEST_REPEAT" \
@@ -111,7 +111,7 @@ functional:run_dev:
   variables:
     ENVIRONMENT: dev
 
-.notify:
+functional:notify:
   extends: [.functional_tests_rules]
   image: badouralix/curl-jq
   needs:
@@ -132,7 +132,7 @@ functional:run_dev:
     - export GITLAB_ENDPOINT
     - export CONTEXT=$FUNCTIONAL_TEST_SCOPE
     - export DATE=$(date +"%Y-%m-%d")
-    - bash tests/functional_tests/shell_test_utils/notify.sh ${CI_PIPELINE_ID} ${ENVIRONMENT}
+    - bash tests/test_utils/shell_scripts/notify.sh ${CI_PIPELINE_ID} "functional:run_"
   artifacts:
     when: always
     paths:
@@ -141,13 +141,3 @@ functional:run_dev:
     - if: $CI_PIPELINE_SOURCE == "schedule" && $FUNCTIONAL_TEST == "yes"
       when: always
     - when: never
-
-functional:notify-lts:
-  extends: [.notify]
-  variables:
-    ENVIRONMENT: lts
-
-functional:notify-dev:
-  extends: [.notify]
-  variables:
-    ENVIRONMENT: dev
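The two environment-specific notify jobs above collapse into a single one, and the script's calling convention changes with them: the second argument to notify.sh is now a bridge-job name prefix (e.g. "functional:run_" or "test:unit_tests_pyt") rather than an environment name, so one invocation covers every matching downstream pipeline. A minimal local sketch of the new contract follows; the token, webhook, and endpoint values are placeholders (in CI they come from the protected variables exported above), and 16595865 is the pipeline id the script itself defaults to:

    # Illustrative values only; WEBHOOK_URL, RO_API_TOKEN and GITLAB_ENDPOINT
    # are placeholders, not real endpoints or credentials.
    export WEBHOOK_URL="https://hooks.slack.com/services/T000/B000/XXXX"
    export RO_API_TOKEN="glpat-placeholder"
    export GITLAB_ENDPOINT="gitlab.example.com"
    export CONTEXT="mr-slim"            # in CI this is $FUNCTIONAL_TEST_SCOPE
    export DATE=$(date +"%Y-%m-%d")

    # Second argument is a job-name prefix: both functional:run_dev and
    # functional:run_lts bridges match, so one call reports on both.
    bash tests/test_utils/shell_scripts/notify.sh 16595865 "functional:run_"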
diff --git a/tests/functional_tests/shell_test_utils/notify.sh b/tests/functional_tests/shell_test_utils/notify.sh
deleted file mode 100644
index 4873576f18..0000000000
--- a/tests/functional_tests/shell_test_utils/notify.sh
+++ /dev/null
@@ -1,198 +0,0 @@
-set -euxo pipefail
-
-collect_jobs() {
-    PAGE=1
-    PER_PAGE=100
-    RESULTS="[]"
-
-    while true; do
-        # Fetch the paginated results
-        RESPONSE=$(
-            curl \
-                -s \
-                --globoff \
-                --header "PRIVATE-TOKEN: $RO_API_TOKEN" \
-                "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/pipelines/${DOWNSTREAM_PIPELINE_ID}/jobs?page=$PAGE&per_page=$PER_PAGE"
-        )
-        # Combine the results
-        RESULTS=$(jq -s '.[0] + .[1]' <<<"$RESULTS $RESPONSE")
-
-        # Check if there are more pages
-        if [[ $(jq 'length' <<<"$RESPONSE") -lt $PER_PAGE ]]; then
-            break
-        fi
-
-        # Increment the page number
-        PAGE=$((PAGE + 1))
-    done
-
-    echo "$RESULTS"
-}
-
-CI_PIPELINE_ID=${1:-16595865}
-ENVIRONMENT=${2}
-
-CI_PROJECT_ID=${CI_PROJECT_ID:-19378}
-
-# Fetch Elastic logs
-set +x
-PIPELINE_JSON=$(
-    curl \
-        --fail \
-        --silent \
-        --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \
-        "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/pipelines/${CI_PIPELINE_ID}/bridges?per_page=100"
-) || ret_code=$?
-set -x
-if [[ ${ret_code:-0} -ne 0 ]]; then
-    echo CI_PIPELINE_ID=$CI_PIPELINE_ID does not exist
-    exit 1
-fi
-
-# Fetch GitLab logs of JET downstream pipeline
-DOWNSTREAM_PIPELINE_ID=$(jq --arg environment "$ENVIRONMENT" '.[] |select(.name == "functional:run_" + $environment) | .downstream_pipeline.id' <<<"$PIPELINE_JSON")
-
-PIPELINE_URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/pipelines/$CI_PIPELINE_ID
-JOB_URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/jobs/
-
-if [[ $DOWNSTREAM_PIPELINE_ID == null ]]; then
-    FAILED_JOBS=$(curl \
-        --fail \
-        --silent \
-        --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \
-        "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/pipelines/${CI_PIPELINE_ID}/jobs?per_page=100" |
-        jq --arg JOB_URL "$JOB_URL" '[.[] | select(.status == "failed") | ("<" + $JOB_URL + (.id | tostring) + "|" + .name + ">")] | join("\n• Job: ")' | tr -d '"')
-    curl \
-        -X POST \
-        -H "Content-type: application/json" \
-        --data '
-        {
-            "blocks": [
-                {
-                    "type": "section",
-                    "text": {
-                        "type": "mrkdwn",
-                        "text": "<'$PIPELINE_URL'|Report of '$DATE' ('$CONTEXT')>:\n"
-                    }
-                },
-                {
-                    "type": "section",
-                    "text": {
-                        "type": "mrkdwn",
-                        "text": "\n• Job: '"$FAILED_JOBS"'"
-                    }
-                },
-            ]
-
-        }' \
-        $WEBHOOK_URL
-
-else
-    set +x
-    JOBS=$(echo "$(collect_jobs)" | jq '[.[] | {id, name, status}]')
-    echo $JOBS
-    set -x
-
-    FAILED_JOBS=$(
-        echo "$JOBS" |
-            jq --arg GITLAB_ENDPOINT "$GITLAB_ENDPOINT" '[
-                .[]
-                | select(.status != "success")
-                | {
-                    name,
-                    id,
-                    "url": ("https://" + $GITLAB_ENDPOINT + "/adlr/megatron-lm/-/jobs/" + (.id | tostring)),
-                  }
-            ]'
-    )
-    set -x
-
-    for row in $(echo "${FAILED_JOBS}" | jq -r '.[] | @base64'); do
-        _jq() {
-            echo ${row} | base64 --decode | jq -r ${1}
-        }
-        JOB_ID=$(_jq '.id')
-        FULL_LOG=$(curl \
-            --location \
-            --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \
-            "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/jobs/${JOB_ID}/trace")
-
-        if [[ "$FULL_LOG" == *exception* ]]; then
-            LAST_EXCEPTION_POS=$(echo "$FULL_LOG" | grep -o -b 'exception' | tail -1 | cut -d: -f1)
-            SHORT_LOG=${FULL_LOG:$LAST_EXCEPTION_POS-500:499}
-        else
-            SHORT_LOG=${FULL_LOG: -1000}
-        fi
-
-        FAILED_JOBS=$(echo "$FAILED_JOBS" |
-            jq \
-                --argjson JOB_ID "$JOB_ID" \
-                --arg SLURM_FAILURE "$SHORT_LOG" '
-                .[] |= ((select(.id==$JOB_ID) += {
-                    "slurm_failure_reason": $SLURM_FAILURE}))
-            ')
-    done
-
-    NUM_FAILED=$(echo "$FAILED_JOBS" | jq 'length')
-    NUM_TOTAL=$(echo "$JOBS" | jq 'length')
-
-    if [[ $NUM_FAILED -eq 0 ]]; then
-        BLOCKS='[
-            {
-                "type": "section",
-                "text": {
-                    "type": "mrkdwn",
-                    "text": ":doge3d: <'$PIPELINE_URL'|Report of '$DATE' ('$CONTEXT')>: All '$NUM_TOTAL' passed"
-                }
-            }
-        ]'
-    else
-        BLOCKS=$(
-            echo "$FAILED_JOBS" |
-                jq --arg DATE "$DATE" --arg CONTEXT "$CONTEXT" --arg URL "$PIPELINE_URL" --arg NUM_FAILED "$NUM_FAILED" --arg NUM_TOTAL "$NUM_TOTAL" '
-                [
-                    {
-                        "type": "section",
-                        "text": {
-                            "type": "mrkdwn",
-                            "text": (":doctorge: <" + $URL + "|Report of " + $DATE + " (" + $CONTEXT + ")>: " + $NUM_FAILED + " of " + $NUM_TOTAL + " failed")
-                        }
-                    }
-                ] + [
-                    .[]
-                    | {
-                        "type": "section",
-                        "text": {
-                            "type": "mrkdwn",
-                            "text": (
-                                "• Job: <" +.url + "|" + .name + ">"
-                                + "\n SLURM failure reason: \n```" + .slurm_failure_reason + "```"
-
-                            )
-                        }
-                      }
-                ] + [
-                    {
-                        "type": "section",
-                        "text": {
-                            "type": "mrkdwn",
-                            "text": ("===============================================")
-                        }
-                    }
-                ]'
-        )
-    fi
-
-    for row in $(echo "${BLOCKS}" | jq -r '.[] | @base64'); do
-        _jq() {
-            echo ${row} | base64 --decode
-        }
-
-        curl \
-            -X POST \
-            -H "Content-type: application/json" \
-            --data '{"blocks": '["$(_jq)"]'}' \
-            $WEBHOOK_URL
-    done
-
-fi
"Content-type: application/json" \ - --data '{"blocks": '["$(_jq)"]'}' \ - $WEBHOOK_URL - done - -fi diff --git a/tests/functional_tests/shell_test_utils/notify_unit_tests.sh b/tests/functional_tests/shell_test_utils/notify_unit_tests.sh deleted file mode 100644 index 3e25f44af5..0000000000 --- a/tests/functional_tests/shell_test_utils/notify_unit_tests.sh +++ /dev/null @@ -1,179 +0,0 @@ -set -euxo pipefail - -collect_jobs () { - PAGE=1 - PER_PAGE=100 - RESULTS="[]" - - while true; do - # Fetch the paginated results - RESPONSE=$(curl \ - -s \ - --globoff \ - --header "PRIVATE-TOKEN: $RO_API_TOKEN" \ - "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/pipelines/${CI_PIPELINE_ID}/jobs?page=$PAGE&per_page=$PER_PAGE" - ) - # Combine the results - RESULTS=$(jq -s '.[0] + .[1]' <<< "$RESULTS $RESPONSE") - - # Check if there are more pages - if [[ $(jq 'length' <<< "$RESPONSE") -lt $PER_PAGE ]]; then - break - fi - - # Increment the page number - PAGE=$((PAGE + 1)) - done - - echo "$RESULTS" -} - -CI_PIPELINE_ID=${1:-16595865} -CI_PROJECT_ID=${CI_PROJECT_ID:-19378} -PIPELINE_URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/pipelines/$CI_PIPELINE_ID -JOB_URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/jobs/ -CONTEXT="unit-tests-extended" - -# Fetch Elastic logs -set +x -UNIT_TESTS_JOBS=$(collect_jobs | jq '[.[] | select(.name | startswith("test:pyt"))]') -set -x -if [[ ${ret_code:-0} -ne 0 ]]; then - echo CI_PIPELINE_ID=$CI_PIPELINE_ID does not exist - exit 1 -fi - -if [[ $UNIT_TESTS_JOBS == null ]]; then - FAILED_JOBS=$(curl \ - --fail \ - --silent \ - --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \ - "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/pipelines/${CI_PIPELINE_ID}/jobs?per_page=100" \ - | jq --arg JOB_URL "$JOB_URL" '[.[] | select(.status == "failed") | ("<" + $JOB_URL + (.id | tostring) + "|" + .name + ">")] | join("\n• Job: ")' | tr -d '"') - curl \ - -X POST \ - -H "Content-type: application/json" \ - --data ' - { - "blocks": [ - { - "type": "section", - "text": { - "type": "mrkdwn", - "text": "<'$PIPELINE_URL'|Report of '$DATE' ('$CONTEXT')>:\n" - } - }, - { - "type": "section", - "text": { - "type": "mrkdwn", - "text": "\n• Job: '"$FAILED_JOBS"'" - } - }, - ] - - }' \ - $WEBHOOK_URL - -else - FAILED_JOBS=$(echo -E "$UNIT_TESTS_JOBS" \ - | jq --arg GITLAB_ENDPOINT "$GITLAB_ENDPOINT" --arg JOB_URL "$JOB_URL" '[ - .[] - | select(.status != "success") - | { - name, - id, - "url": ($JOB_URL + (.id | tostring)), - } - ]' - ) - set -x - - for row in $(echo "${FAILED_JOBS}" | jq -r '.[] | @base64'); do - _jq() { - echo ${row} | base64 --decode | jq -r ${1} - } - JOB_ID=$(_jq '.id') - FULL_LOG=$(curl \ - --location \ - --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \ - "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/jobs/${JOB_ID}/trace") - - if [[ "$FULL_LOG" == *exception* ]]; then - LAST_EXCEPTION_POS=$(echo "$FULL_LOG" | grep -o -b 'exception' | tail -1 | cut -d: -f1) - SHORT_LOG=${FULL_LOG:$LAST_EXCEPTION_POS-500:499} - else - SHORT_LOG=${FULL_LOG: -1000} - fi - - FAILED_JOBS=$(echo "$FAILED_JOBS" \ - | jq \ - --argjson JOB_ID "$JOB_ID" \ - --arg SLURM_FAILURE "$SHORT_LOG" ' - .[] |= ((select(.id==$JOB_ID) += { - "slurm_failure_reason": $SLURM_FAILURE})) - ') - done - - NUM_FAILED=$(echo "$FAILED_JOBS" | jq 'length') - NUM_TOTAL=$(echo "$UNIT_TESTS_JOBS" | jq 'length') - - if [[ $NUM_FAILED -eq 0 ]]; then - BLOCKS='[ - { - "type": "section", - "text": { - "type": "mrkdwn", - "text": ":doge3d: <'$PIPELINE_URL'|Report of '$DATE' ('$CONTEXT')>: All 
diff --git a/tests/test_utils/scripts/common.py b/tests/test_utils/python_scripts/common.py
similarity index 100%
rename from tests/test_utils/scripts/common.py
rename to tests/test_utils/python_scripts/common.py
diff --git a/tests/test_utils/scripts/generate_jet_trigger_job.py b/tests/test_utils/python_scripts/generate_jet_trigger_job.py
similarity index 97%
rename from tests/test_utils/scripts/generate_jet_trigger_job.py
rename to tests/test_utils/python_scripts/generate_jet_trigger_job.py
index 2f8622cfe5..0913b19bd6 100644
--- a/tests/test_utils/scripts/generate_jet_trigger_job.py
+++ b/tests/test_utils/python_scripts/generate_jet_trigger_job.py
@@ -4,7 +4,7 @@
 import click
 import yaml
 
-from tests.test_utils.scripts import common
+from tests.test_utils.python_scripts import common
 
 BASE_PATH = pathlib.Path(__file__).parent.resolve()
 
@@ -113,7 +113,7 @@ def main(
 
         script = [
             "export PYTHONPATH=$(pwd); "
-            "python tests/test_utils/scripts/launch_jet_workload.py",
+            "python tests/test_utils/python_scripts/launch_jet_workload.py",
             f"--model {test_case.spec.model}",
             f"--environment {test_case.spec.environment}",
             f"--n-repeat {n_repeat}",
diff --git a/tests/test_utils/scripts/generate_local_jobs.py b/tests/test_utils/python_scripts/generate_local_jobs.py
similarity index 97%
rename from tests/test_utils/scripts/generate_local_jobs.py
rename to tests/test_utils/python_scripts/generate_local_jobs.py
index ebb3e5b5f9..175492175d 100644
--- a/tests/test_utils/scripts/generate_local_jobs.py
+++ b/tests/test_utils/python_scripts/generate_local_jobs.py
@@ -12,7 +12,7 @@
 import jetclient
 import yaml
 
-from tests.test_utils.scripts import common
+from tests.test_utils.python_scripts import common
 
 
 def load_script(config_path: str) -> str:
diff --git a/tests/test_utils/scripts/launch_jet_workload.py b/tests/test_utils/python_scripts/launch_jet_workload.py
similarity index 99%
rename from tests/test_utils/scripts/launch_jet_workload.py
rename to tests/test_utils/python_scripts/launch_jet_workload.py
index 5b0dae6f6f..6e0580fcda 100644
--- a/tests/test_utils/scripts/launch_jet_workload.py
+++ b/tests/test_utils/python_scripts/launch_jet_workload.py
@@ -16,7 +16,7 @@
 from jetclient.facades.objects import log as jet_log
 from jetclient.services.dtos.pipeline import PipelineStatus
 
-from tests.test_utils.scripts import common
+from tests.test_utils.python_scripts import common
 
 BASE_PATH = pathlib.Path(__file__).parent.resolve()
 
diff --git a/tests/test_utils/shell_scripts/notify.sh b/tests/test_utils/shell_scripts/notify.sh
new file mode 100644
index 0000000000..ff4b40107c
--- /dev/null
+++ b/tests/test_utils/shell_scripts/notify.sh
@@ -0,0 +1,215 @@
+set -euxo pipefail
+
+collect_jobs() {
+    DOWNSTREAM_PIPELINE_ID=$1
+    PAGE=1
+    PER_PAGE=100
+    RESULTS="[]"
+
+    while true; do
+        # Fetch the paginated results
+        RESPONSE=$(
+            curl \
+                -s \
+                --globoff \
+                --header "PRIVATE-TOKEN: $RO_API_TOKEN" \
+                "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/pipelines/${DOWNSTREAM_PIPELINE_ID}/jobs?page=$PAGE&per_page=$PER_PAGE"
+        )
+        # Combine the results
+        RESULTS=$(jq -s '.[0] + .[1]' <<<"$RESULTS $RESPONSE")
+
+        # Check if there are more pages
+        if [[ $(jq 'length' <<<"$RESPONSE") -lt $PER_PAGE ]]; then
+            break
+        fi
+
+        # Increment the page number
+        PAGE=$((PAGE + 1))
+    done
+
+    echo "$RESULTS"
+}
+
+CI_PIPELINE_ID=${1:-16595865}
+ENVIRONMENT=${2}
+
+CI_PROJECT_ID=${CI_PROJECT_ID:-19378}
+
+# Fetch Elastic logs
+set +x
+PIPELINE_JSON=$(
+    curl \
+        --fail \
+        --silent \
+        --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \
+        "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/pipelines/${CI_PIPELINE_ID}/bridges?per_page=100"
+) || ret_code=$?
+set -x
+if [[ ${ret_code:-0} -ne 0 ]]; then
+    echo CI_PIPELINE_ID=$CI_PIPELINE_ID does not exist
+    exit 1
+fi
+
+# Fetch GitLab logs of JET downstream pipeline
+DOWNSTREAM_PIPELINE_IDS=$(jq \
+    -c --arg environment "$ENVIRONMENT" '
+        .[]
+        | select(.name | startswith($environment))
+        | {
+            id: .downstream_pipeline.id,
+            name: .name
+          }
+    ' <<<"$PIPELINE_JSON")
+
+PIPELINE_URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/pipelines/$CI_PIPELINE_ID
+JOB_URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/jobs/
+
+while IFS= read -r DOWNSTREAM_PIPELINE; do
+
+    if [[ $DOWNSTREAM_PIPELINE == null ]]; then
+        FAILED_JOBS=$(curl \
+            --fail \
+            --silent \
+            --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \
+            "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/pipelines/${CI_PIPELINE_ID}/jobs?per_page=100" |
+            jq --arg JOB_URL "$JOB_URL" '[.[] | select(.status == "failed") | ("<" + $JOB_URL + (.id | tostring) + "|" + .name + ">")] | join("\n• Job: ")' | tr -d '"')
+        curl \
+            -X POST \
+            -H "Content-type: application/json" \
+            --data '
+            {
+                "blocks": [
+                    {
+                        "type": "section",
+                        "text": {
+                            "type": "mrkdwn",
+                            "text": "<'$PIPELINE_URL'|Report of '$DATE' ('$CONTEXT')>:\n"
+                        }
+                    },
+                    {
+                        "type": "section",
+                        "text": {
+                            "type": "mrkdwn",
+                            "text": "\n• Job: '"$FAILED_JOBS"'"
+                        }
+                    },
+                ]
+
+            }' \
+            $WEBHOOK_URL
+
+    else
+        DOWNSTREAM_PIPELINE_ID=$(echo $DOWNSTREAM_PIPELINE | jq '.id' | tr -d '"')
+        DOWNSTREAM_PIPELINE_NAME=$(echo $DOWNSTREAM_PIPELINE | jq '.name' | tr -d '"')
+
+        set +x
+        JOBS=$(echo "$(collect_jobs $DOWNSTREAM_PIPELINE_ID)" | jq '[.[] | {id, name, status}]')
+        echo $JOBS
+        set -x
+
+        FAILED_JOBS=$(
+            echo "$JOBS" |
+                jq --arg GITLAB_ENDPOINT "$GITLAB_ENDPOINT" '[
+                    .[]
+                    | select(.status != "success")
+                    | {
+                        name,
+                        id,
+                        "url": ("https://" + $GITLAB_ENDPOINT + "/adlr/megatron-lm/-/jobs/" + (.id | tostring)),
+                      }
+                ]'
+        )
+        set -x
+
+        for row in $(echo "${FAILED_JOBS}" | jq -r '.[] | @base64'); do
+            _jq() {
+                echo ${row} | base64 --decode | jq -r ${1}
+            }
+            JOB_ID=$(_jq '.id')
+            FULL_LOG=$(curl \
+                --location \
+                --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \
+                "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/jobs/${JOB_ID}/trace")
+
+            if [[ "$FULL_LOG" == *exception* ]]; then
+                LAST_EXCEPTION_POS=$(echo "$FULL_LOG" | grep -o -b 'exception' | tail -1 | cut -d: -f1)
+                SHORT_LOG=${FULL_LOG:$LAST_EXCEPTION_POS-500:499}
+            else
+                SHORT_LOG=${FULL_LOG: -1000}
+            fi
+
+            FAILED_JOBS=$(echo "$FAILED_JOBS" |
+                jq \
+                    --argjson JOB_ID "$JOB_ID" \
+                    --arg SLURM_FAILURE "$SHORT_LOG" '
+                    .[] |= ((select(.id==$JOB_ID) += {
+                        "slurm_failure_reason": $SLURM_FAILURE}))
+                ')
+        done
+
+        NUM_FAILED=$(echo "$FAILED_JOBS" | jq 'length')
+        NUM_TOTAL=$(echo "$JOBS" | jq 'length')
+        _CONTEXT="$CONTEXT - $DOWNSTREAM_PIPELINE_NAME"
+
+        if [[ $NUM_FAILED -eq 0 ]]; then
+            BLOCKS='[
+                {
+                    "type": "section",
+                    "text": {
+                        "type": "mrkdwn",
+                        "text": ":doge3d: <'$PIPELINE_URL'|Report of '$DATE' ('$_CONTEXT')>: All '$NUM_TOTAL' passed"
+                    }
+                }
+            ]'
+        else
+            BLOCKS=$(
+                echo "$FAILED_JOBS" |
+                    jq --arg DATE "$DATE" --arg CONTEXT "$_CONTEXT" --arg URL "$PIPELINE_URL" --arg NUM_FAILED "$NUM_FAILED" --arg NUM_TOTAL "$NUM_TOTAL" '
+                    [
+                        {
+                            "type": "section",
+                            "text": {
+                                "type": "mrkdwn",
+                                "text": (":doctorge: <" + $URL + "|Report of " + $DATE + " (" + $CONTEXT + ")>: " + $NUM_FAILED + " of " + $NUM_TOTAL + " failed")
+                            }
+                        }
+                    ] + [
+                        .[]
+                        | {
+                            "type": "section",
+                            "text": {
+                                "type": "mrkdwn",
+                                "text": (
+                                    "• Job: <" +.url + "|" + .name + ">"
+                                    + "\n SLURM failure reason: \n```" + .slurm_failure_reason + "```"
+
+                                )
+                            }
+                          }
+                    ] + [
+                        {
+                            "type": "section",
+                            "text": {
+                                "type": "mrkdwn",
+                                "text": ("===============================================")
+                            }
+                        }
+                    ]'
+            )
+        fi
+
+        for row in $(echo "${BLOCKS}" | jq -r '.[] | @base64'); do
+            _jq() {
+                echo ${row} | base64 --decode
+            }
+
+            curl \
+                -X POST \
+                -H "Content-type: application/json" \
+                --data '{"blocks": '["$(_jq)"]'}' \
+                $WEBHOOK_URL
+        done
+
+    fi
+
+done <<<"$DOWNSTREAM_PIPELINE_IDS"
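The core change in the rewritten script is the bridge query: instead of resolving exactly one downstream pipeline by full job name, it emits one compact JSON object per bridge whose name matches the given prefix, and the while-read loop then posts one Slack report per match. A standalone sketch of that selection step, using the same jq filter as the script (the bridge payload and ids below are invented for illustration):

    # Sample of what the GitLab bridges API might return (ids are made up).
    BRIDGES='[{"name":"functional:run_lts","downstream_pipeline":{"id":101}},
              {"name":"functional:run_dev","downstream_pipeline":{"id":102}},
              {"name":"test:unit_tests_pyt(LTS)","downstream_pipeline":{"id":103}}]'

    # Same filter as notify.sh: -c prints one object per line, which is what
    # the `while IFS= read -r DOWNSTREAM_PIPELINE` loop consumes.
    jq -c --arg environment "functional:run_" '
        .[]
        | select(.name | startswith($environment))
        | {id: .downstream_pipeline.id, name: .name}
    ' <<<"$BRIDGES"
    # -> {"id":101,"name":"functional:run_lts"}
    #    {"id":102,"name":"functional:run_dev"}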
diff --git a/unit-test-job-lts.yaml b/unit-test-job-lts.yaml
index fd6eb71dfe..ea64ccd6b1 100644
--- a/unit-test-job-lts.yaml
+++ b/unit-test-job-lts.yaml
@@ -3,84 +3,84 @@ default:
 other:
   artifacts:
     paths:
-    - results/
+      - results/
     when: always
   image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_utility:20283570
   needs:
-  - job: functional:configure
-    pipeline: $PARENT_PIPELINE_ID
+    - job: functional:configure
+      pipeline: $PARENT_PIPELINE_ID
   rules:
-  - if: $CI_PIPELINE_SOURCE == "parent_pipeline"
-  - if: $CI_MERGE_REQUEST_ID
+    - if: $CI_PIPELINE_SOURCE == "parent_pipeline"
+    - if: $CI_MERGE_REQUEST_ID
   script:
-  - export PYTHONPATH=$(pwd); python tests/test_utils/scripts/launch_jet_workload.py
-    --model unit-tests --environment lts --n-repeat 1 --time-limit 1800 --test-case
-    other --container-tag 20283570 --cluster dgxh100_coreweave
+    - export PYTHONPATH=$(pwd); python tests/test_utils/python_scripts/launch_jet_workload.py
+      --model unit-tests --environment lts --n-repeat 1 --time-limit 1800 --test-case
+      other --container-tag 20283570 --cluster dgxh100_coreweave
   stage: unit-tests
   tags: &id001
-  - arch/amd64
-  - env/prod
-  - origin/jet-fleet
-  - owner/jet-core
-  - purpose/jet-client
-  - team/megatron
+    - arch/amd64
+    - env/prod
+    - origin/jet-fleet
+    - owner/jet-core
+    - purpose/jet-client
+    - team/megatron
   timeout: 7 days
 stages:
-- unit-tests
+  - unit-tests
 tests/unit_tests/data/:
   artifacts:
     paths:
-    - results/
+      - results/
     when: always
   image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_utility:20283570
   needs:
-  - job: functional:configure
-    pipeline: $PARENT_PIPELINE_ID
+    - job: functional:configure
+      pipeline: $PARENT_PIPELINE_ID
   rules:
-  - if: $CI_PIPELINE_SOURCE == "parent_pipeline"
-  - if: $CI_MERGE_REQUEST_ID
+    - if: $CI_PIPELINE_SOURCE == "parent_pipeline"
+    - if: $CI_MERGE_REQUEST_ID
   script:
-  - export PYTHONPATH=$(pwd); python tests/test_utils/scripts/launch_jet_workload.py
-    --model unit-tests --environment lts --n-repeat 1 --time-limit 1800 --test-case
-    tests/unit_tests/data/ --container-tag 20283570 --cluster dgxh100_coreweave
+    - export PYTHONPATH=$(pwd); python tests/test_utils/python_scripts/launch_jet_workload.py
+      --model unit-tests --environment lts --n-repeat 1 --time-limit 1800 --test-case
+      tests/unit_tests/data/ --container-tag 20283570 --cluster dgxh100_coreweave
   stage: unit-tests
   tags: *id001
   timeout: 7 days
 tests/unit_tests/dist_checkpointing/:
   artifacts:
     paths:
-    - results/
+      - results/
     when: always
   image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_utility:20283570
   needs:
-  - job: functional:configure
-    pipeline: $PARENT_PIPELINE_ID
+    - job: functional:configure
+      pipeline: $PARENT_PIPELINE_ID
   rules:
-  - if: $CI_PIPELINE_SOURCE == "parent_pipeline"
-  - if: $CI_MERGE_REQUEST_ID
+    - if: $CI_PIPELINE_SOURCE == "parent_pipeline"
+    - if: $CI_MERGE_REQUEST_ID
   script:
-  - export PYTHONPATH=$(pwd); python tests/test_utils/scripts/launch_jet_workload.py
-    --model unit-tests --environment lts --n-repeat 1 --time-limit 1800 --test-case
-    tests/unit_tests/dist_checkpointing/ --container-tag 20283570 --cluster dgxh100_coreweave
+    - export PYTHONPATH=$(pwd); python tests/test_utils/python_scripts/launch_jet_workload.py
+      --model unit-tests --environment lts --n-repeat 1 --time-limit 1800 --test-case
+      tests/unit_tests/dist_checkpointing/ --container-tag 20283570 --cluster dgxh100_coreweave
   stage: unit-tests
   tags: *id001
   timeout: 7 days
 tests/unit_tests/distributed/:
   artifacts:
     paths:
-    - results/
+      - results/
     when: always
   image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_utility:20283570
   needs:
-  - job: functional:configure
-    pipeline: $PARENT_PIPELINE_ID
+    - job: functional:configure
+      pipeline: $PARENT_PIPELINE_ID
   rules:
-  - if: $CI_PIPELINE_SOURCE == "parent_pipeline"
-  - if: $CI_MERGE_REQUEST_ID
+    - if: $CI_PIPELINE_SOURCE == "parent_pipeline"
+    - if: $CI_MERGE_REQUEST_ID
   script:
-  - export PYTHONPATH=$(pwd); python tests/test_utils/scripts/launch_jet_workload.py
-    --model unit-tests --environment lts --n-repeat 1 --time-limit 1800 --test-case
-    tests/unit_tests/distributed/ --container-tag 20283570 --cluster dgxh100_coreweave
+    - export PYTHONPATH=$(pwd); python tests/test_utils/python_scripts/launch_jet_workload.py
+      --model unit-tests --environment lts --n-repeat 1 --time-limit 1800 --test-case
+      tests/unit_tests/distributed/ --container-tag 20283570 --cluster dgxh100_coreweave
   stage: unit-tests
   tags: *id001
   timeout: 7 days
@@ -88,20 +88,20 @@ tests/unit_tests/distributed/:
 
 tests/unit_tests/test_training.py :
   artifacts:
     paths:
-    - results/
+      - results/
     when: always
   image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_utility:20283570
   needs:
-  - job: functional:configure
-    pipeline: $PARENT_PIPELINE_ID
+    - job: functional:configure
+      pipeline: $PARENT_PIPELINE_ID
   rules:
-  - if: $CI_PIPELINE_SOURCE == "parent_pipeline"
-  - if: $CI_MERGE_REQUEST_ID
+    - if: $CI_PIPELINE_SOURCE == "parent_pipeline"
+    - if: $CI_MERGE_REQUEST_ID
   script:
-  - export PYTHONPATH=$(pwd); python tests/test_utils/scripts/launch_jet_workload.py
-    --model unit-tests --environment lts --n-repeat 1 --time-limit 1800 --test-case
-    tests/unit_tests/test_inference.py tests/unit_tests/test_tokenizer.py tests/unit_tests/test_utilities.py
-    tests/unit_tests/test_training.py --container-tag 20283570 --cluster dgxh100_coreweave
+    - export PYTHONPATH=$(pwd); python tests/test_utils/python_scripts/launch_jet_workload.py
+      --model unit-tests --environment lts --n-repeat 1 --time-limit 1800 --test-case
+      tests/unit_tests/test_inference.py tests/unit_tests/test_tokenizer.py tests/unit_tests/test_utilities.py
+      tests/unit_tests/test_training.py --container-tag 20283570 --cluster dgxh100_coreweave
   stage: unit-tests
   tags: *id001
   timeout: 7 days
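unit-test-job-lts.yaml is generated output rather than hand-maintained YAML, so the path changes above follow mechanically from re-running the generator out of its new python_scripts home. A sketch of that regeneration step, assembled only from the flags visible in test:unit_tests_configure earlier in this patch (the hunks truncate the full argument list, so the remaining flags such as container tag and clusters are deliberately omitted here):

    # Partial invocation: only the flags visible in the diff are shown; the
    # generator takes further arguments that the hunk context cuts off.
    export PYTHONPATH=$(pwd)
    python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
        --scope "unit-tests" \
        --environment lts \
        --n-repeat "${UNIT_TEST_REPEAT}" \
        --output-path "unit-test-job-lts-legacy.yaml"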