# llama-3.1-70b-256gpus-a3ultra-fp8.yaml
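# NeMo Megatron pretraining recipe for Llama 3.1 70B with Transformer Engine FP8.
# Sized for 256 GPUs per the filename; with trainer.devices: 8 per node this
# implies 32 A3 Ultra nodes (an assumption from the name, not a value stored here).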
run:
  name: llama-3.1-70b-a3u-fp8
  time_limit: 0-03:30:00
  dependency: singleton
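# The trainer runs in bf16 mixed precision; FP8 is applied per-layer through
# Transformer Engine via the model.fp8* settings below, not via trainer.precision.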
trainer:
  devices: 8
  accelerator: gpu
  precision: bf16
  logger: false
  enable_checkpointing: false
  use_distributed_sampler: false
  max_epochs: null
  max_steps: 30
  max_time: 05:23:30:00
  log_every_n_steps: 1
  val_check_interval: 200
  limit_val_batches: 5
  limit_test_batches: 5
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
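# Benchmark-style experiment management: PTL logging and checkpointing stay off,
# while dllogger and step timing (with CUDA sync) record per-step performance.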
exp_manager:
  exp_dir: null
  name: megatron_gpt
  resume_if_exists: false
  create_dllogger_logger: true
  dllogger_logger_kwargs:
    verbose: true
    stdout: true
  resume_ignore_no_checkpoint: true
  create_checkpoint_callback: false
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: false
    save_nemo_on_train_end: false
    model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
  log_step_timing: true
  step_timing_kwargs:
    sync_cuda: true
    buffer_size: 5
  seconds_to_sleep: 60
  explicit_log_dir: null
model:
  mcore_gpt: true
  micro_batch_size: 1
  global_batch_size: 1024
  rampup_batch_size: null
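  # Parallelism layout, assuming the 256-GPU job size implied by the filename:
  # TP 2 x PP 4 x CP 1 = 8 GPUs per model replica, giving data parallelism of 32.
  # global_batch_size 1024 / (micro_batch_size 1 x DP 32) = 32 micro-batches per step.
  # VPP 20 splits the 20 layers held by each pipeline stage (80 layers / PP 4)
  # into single-layer virtual chunks for interleaved pipeline scheduling.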
  tensor_model_parallel_size: 2
  pipeline_model_parallel_size: 4
  virtual_pipeline_model_parallel_size: 20
  context_parallel_size: 1
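  # Llama 3.1 70B architecture: 80 layers, 8192 hidden, 28672 FFN, 64 heads with
  # 8 KV groups (grouped-query attention), RMSNorm, SwiGLU, and RoPE at 8K context.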
  encoder_seq_length: 8192
  max_position_embeddings: 8192
  num_layers: 80
  hidden_size: 8192
  ffn_hidden_size: 28672
  num_attention_heads: 64
  num_query_groups: 8
  init_method_std: 0.008944
  use_scaled_init_method: true
  hidden_dropout: 0.0
  attention_dropout: 0.0
  ffn_dropout: 0.0
  kv_channels: null
  apply_query_key_layer_scaling: true
  normalization: rmsnorm
  layernorm_epsilon: 1.0e-05
  do_layer_norm_weight_decay: false
  make_vocab_size_divisible_by: 128
  pre_process: true
  post_process: true
  persist_layer_norm: true
  bias: false
  activation: fast-swiglu
  headscale: false
  transformer_block_type: pre_ln
  openai_gelu: false
  normalize_attention_scores: true
  position_embedding_type: rope
  rotary_percentage: 1.0
  apply_rope_fusion: true
  attention_type: multihead
  share_embeddings_and_output_weights: false
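  # GPT-2 BPE tokenizer placeholder; gpt2-vocab.json and gpt2-merges.txt are
  # expected at these relative paths. With mock data (see data.data_impl) the
  # tokenizer mainly determines the padded vocabulary size.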
  tokenizer:
    library: megatron
    type: GPT2BPETokenizer
    model: null
    delimiter: null
    vocab_file: gpt2-vocab.json
    merge_file: gpt2-merges.txt
  native_amp_init_scale: 4294967296
  native_amp_growth_interval: 1000
  hysteresis: 2
  fp32_residual_connection: false
  fp16_lm_cross_entropy: false
  megatron_amp_O2: true
  grad_allreduce_chunk_size_mb: 125
  grad_div_ar_fusion: true
  gradient_accumulation_fusion: true
  bias_activation_fusion: true
  bias_dropout_add_fusion: true
  masked_softmax_fusion: true
  seed: 1234
  resume_from_checkpoint: null
  use_cpu_initialization: false
  onnx_safe: false
  apex_transformer_log_level: 30
  gradient_as_bucket_view: true
  sync_batch_comm: false
  activations_checkpoint_granularity: null
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: null
  num_micro_batches_with_partial_activation_checkpoints: null
  activations_checkpoint_layers_per_pipeline: null
  sequence_parallel: true
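  # Transformer Engine FP8 settings: the hybrid recipe typically uses E4M3 for
  # forward activations/weights and E5M2 for backward gradients, with delayed
  # scaling over a 1024-step amax history using the max algorithm.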
  transformer_engine: true
  fp8: true
  fp8_e4m3: true
  fp8_hybrid: true
  fp8_margin: 0
  fp8_interval: 1
  fp8_amax_history_len: 1024
  fp8_amax_compute_algo: max
  ub_tp_comm_overlap: false
  use_flash_attention: true
  overlap_p2p_comm: true
  batch_p2p_comm: false
  gc_interval: 100
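  # Distributed fused Adam shards optimizer state across data-parallel ranks and
  # overlaps gradient and parameter synchronization with compute; gradient
  # reductions run in bf16.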
  optim:
    name: distributed_fused_adam
    lr: 0.00015
    weight_decay: 0.1
    betas:
    - 0.9
    - 0.95
    bucket_cap_mb: 125
    overlap_grad_sync: true
    overlap_param_sync: true
    contiguous_grad_buffer: true
    contiguous_param_buffer: true
    grad_sync_dtype: bf16
    sched:
      name: CosineAnnealing
      warmup_steps: 2000
      constant_steps: 11873
      min_lr: 1.0e-05
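  # Mock (synthetic) data: no dataset files are required, so data_prefix stays
  # empty and the run measures training throughput only.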
  data:
    data_impl: mock
    splits_string: 90,8,2
    seq_length: 8192
    skip_warmup: true
    num_workers: 2
    dataloader_type: single
    reset_position_ids: false
    reset_attention_mask: false
    eod_mask_loss: false
    index_mapping_dir: null
    data_prefix: []
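  # Nsight Systems profiling hook, disabled by default; when enabled it would
  # capture steps 17-19 on ranks 0 and 8.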
  nsys_profile:
    enabled: false
    start_step: 17
    end_step: 19
    ranks:
    - 0
    - 8
    gen_shape: false
  fp8_params: true