-
Notifications
You must be signed in to change notification settings - Fork 94
/
Copy path4.bmk-pretrain-gpt3-175b.sh
executable file
·81 lines (67 loc) · 2.9 KB
/
4.bmk-pretrain-gpt3-175b.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/bin/bash
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
# Shell options: -e aborts on the first failing command, -x traces each command
# to stderr, and pipefail makes a pipeline fail when any of its stages fails.
set -exo pipefail

# TARGET_PATH is used below to locate the launcher script and to derive the
# results directory. Fail fast, with the message on stderr, when it is unset or
# empty; otherwise echo the value for the record.
if [[ -z "${TARGET_PATH}" ]]; then
    echo 'Please set environment variable TARGET_PATH' >&2
    exit 1
fi
echo "TARGET_PATH=${TARGET_PATH}"
################################################################################
# 000: Pre-training configuration. Edit this section to choose the model size,
# number of nodes, max. pre-training steps, and the job's max. runtime.
################################################################################
## Default: pre-train gpt3-175b on 16 nodes for 5 steps. The number of nodes
## must be a multiple of 4.
MODEL=gpt3
MODEL_SIZE=175b
NUM_NODES=16
RUNTIME=4h
MAX_STEPS=5
export MODEL MODEL_SIZE NUM_NODES RUNTIME MAX_STEPS

# Optional model overrides. The list is empty unless an entry is uncommented.
MODEL_ARGS=(
    ## Uncomment below to enable fp8 training (Transformer Engine) on p5 instances (H100 GPUs)
    #training.model.transformer_engine=True
    #training.model.fp8=True
)
################################################################################
# 010: Advanced users can modify this stanza to customize benchmarking behavior.
################################################################################
BMK_ARGS=()

# Disable validation, as we're only interested in measuring the training time.
BMK_ARGS+=('training.trainer.limit_val_batches=0.0')

# Ignore checkpoints: neither create them nor resume from existing ones.
BMK_ARGS+=(
    'training.exp_manager.create_checkpoint_callback=False'
    'training.exp_manager.resume_if_exists=False'
)

# Select the "mock" data implementation with an empty data prefix.
# See https://github.com/NVIDIA/NeMo/pull/6181/files
BMK_ARGS+=(
    'training.model.data.data_impl=mock'
    'training.model.data.data_prefix=[]'
)
################################################################################
# 020: Internal settings.
################################################################################
# Print a unique suffix for a per-run results directory, in the form
# <YYYYmmdd>-<HHMMSS>utc-<nanoseconds>-<random number>.
bmk_run_suffix() {
    local stamp
    # -u pins the clock to UTC so the value agrees with the literal "utc" tag,
    # whatever the TZ of the machine running this script.
    stamp=$(date -u +'%Y%m%d-%H%M%Sutc-%N') || return
    printf '%s-%s\n' "${stamp}" "${RANDOM}"
}

WORKSPACE_CONT=${TARGET_PATH}
CONT_RESULT_DIR=${WORKSPACE_CONT}/results
# NOTE(review): CONT_TOKENIZER_DIR is not referenced again in this script and is
# not exported — confirm whether it is still needed.
CONT_TOKENIZER_DIR=${WORKSPACE_CONT}/data/bpe

# Dev/test feature (off by default): BMK_MODE=1 forces each pre-training run to
# write its outputs to a separate directory.
: "${BMK_MODE:=0}"
if [[ ${BMK_MODE} -eq 1 ]]; then
    # For debugging: each run has its own output dir.
    TIMESTAMP=$(bmk_run_suffix)
    CONT_RESULT_DIR=${CONT_RESULT_DIR}-${TIMESTAMP}
    BMK_ARGS+=(
        # Quoted so a results path containing whitespace stays one argument.
        "base_results_dir=${CONT_RESULT_DIR}"
        training.run.dependency=null
    )
    echo "
####################
This run will write to directory ${CONT_RESULT_DIR}
####################
"
fi
################################################################################
# 030: Here we go...
################################################################################
# Arguments handed to the launcher, in this order: the fixed settings below,
# then BMK_ARGS, then MODEL_ARGS, then whatever was passed to this script.
# Every element is quoted so that values containing whitespace or glob
# characters reach the launcher as exactly one argument each.
LAUNCHER_ARGS=(
    # Must be quoted: an unquoted [training] is a glob bracket expression and
    # would be replaced by a matching file name (e.g. "stages=t") if one exists
    # in the current directory.
    'stages=[training]'
    "training=${MODEL}/${MODEL_SIZE}"
    "training.trainer.num_nodes=${NUM_NODES}"
    "training.trainer.max_steps=${MAX_STEPS}"
    # Validation interval is set to the same value as max_steps.
    "training.trainer.val_check_interval=${MAX_STEPS}"
    # NOTE(review): RUNTIME is exported in section 000 but is not passed to the
    # launcher here — confirm whether an override for it is missing.
    "${BMK_ARGS[@]}"
    "${MODEL_ARGS[@]}"
    "$@"
)

# HYDRA_FULL_ERROR=1 is set in the environment of this one command only.
HYDRA_FULL_ERROR=1 python3 "${TARGET_PATH}/launcher_scripts/main.py" "${LAUNCHER_ARGS[@]}"