Skip to content

Commit

Permalink
Merge branch 'ko3n1g/ci/fetch-exitcode' into 'main'
Browse files (browse the repository at this point in the history)
ci: Fetch exit-code

See merge request ADLR/megatron-lm!2562
  • Loading branch information
ko3n1g committed Jan 17, 2025
2 parents c614252 + f85b6b1 commit e02a860
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 4 deletions.
11 changes: 9 additions & 2 deletions tests/functional_tests/shell_test_utils/_run_training.sh
Original file line number Diff line number Diff line change
Expand Up @@ -102,13 +102,20 @@ DISTRIBUTED_ARGS=(
--node_rank $SLURM_NODEID
--log-dir $OUTPUT_PATH
--tee "0:3"
--redirects "3"
)

# Start training
# NOTE(review): this page renders the diff without +/- markers. The bare
# torchrun line directly below and the `|| EXIT_CODE=$?` variant further down
# look like the before/after sides of one change rather than two separate
# launches -- confirm against the repository before copying this hunk.
torchrun ${DISTRIBUTED_ARGS[@]} $TRAINING_SCRIPT_PATH $PARAMS
set -e
# Default to success; only overwritten when torchrun returns non-zero.
EXIT_CODE=0
# `|| EXIT_CODE=$?` records torchrun's status instead of aborting under
# `set -e`, so the log renaming below still runs after a failed launch.
torchrun ${DISTRIBUTED_ARGS[@]} $TRAINING_SCRIPT_PATH $PARAMS || EXIT_CODE=$?
echo $EXIT_CODE
set +e

# Rename every stdout.log / stderr.log found under $OUTPUT_PATH in place,
# prefixing the file name with the repeat index, the run number and the name
# of the directory that contains the file.
find "$OUTPUT_PATH" -type f \( -name "stdout.log" -o -name "stderr.log" \) | while read -r file; do
rank_dir=$(basename "$(dirname "$file")")
mv "$file" "$(dirname "$file")/repeat$REPEAT-run$RUN_NUMBER-rank${rank_dir}-$(basename "$file")"
done

# Propagate the recorded torchrun status as this script's own exit status.
if [[ $EXIT_CODE -ne 0 ]]; then
exit $EXIT_CODE
fi
2 changes: 1 addition & 1 deletion tests/test_utils/python_scripts/launch_jet_workload.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,8 +277,8 @@ def main(
no_log = True
except KeyError as e:
logger.error(e)
break
no_log = True
break

if no_log:
logger.error("Did not find any logs to download, retry.")
Expand Down
12 changes: 11 additions & 1 deletion tests/test_utils/recipes/unit-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,17 @@ spec:
# Run the pytest launch once per iteration of UNIT_TEST_REPEAT and leave the
# loop as soon as one iteration reports a non-zero status.
for i in $(seq $UNIT_TEST_REPEAT); do
CMD=$(echo torchrun ${{DISTRIBUTED_ARGS[@]}} -m pytest -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail ${{IGNORE_ARGS[@]}} -m "'${{MARKER_ARG}}'" $BUCKET)
# NOTE(review): this page renders the diff without +/- markers. The bare eval
# directly below and the `|| EXIT_CODE=$?` variant further down look like the
# before/after sides of one change -- confirm against the repository.
eval "$CMD"
set -e
# Default to success; only overwritten when the evaluated command fails.
EXIT_CODE=0
# Capture the status rather than aborting under `set -e`.
eval "$CMD" || EXIT_CODE=$?
echo $EXIT_CODE
set +e
# Stop repeating after the first failing iteration.
if [[ $EXIT_CODE -ne 0 ]]; then
break
fi
done
find "{assets_dir}" -type f \( -name "stdout.log" -o -name "stderr.log" \) | while read -r file; do
Expand Down

0 comments on commit e02a860

Please sign in to comment.