-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathprocess_training_results.py
180 lines (158 loc) · 5.43 KB
/
process_training_results.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tool to calculate training metrics for common LLM models"""
import argparse
import json
from src.data_defs import MODEL_FLOPS_PER_SAMPLE, MAX_TFLOPS, ACCELERATORS
def parse_args():
    """Parses the command-line arguments for the training-metrics tool.

    Returns:
        argparse.Namespace: the parsed command-line arguments.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--file", type=str, required=True, help="DLLogger file")
    parser.add_argument(
        "--model_flops",
        type=float,
        required=False,
        help="Model flops fw + bw per 1 sample. If not provided will use defaults values in code",
    )
    parser.add_argument(
        "--max_flops",
        type=float,
        required=False,
        help="Max theoretical TFLOPS. If not provided, default values will be used for the accelerator type on bf16 precision",
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        required=True,
        help="Global batch size used during training.",
    )
    parser.add_argument(
        "--model_type",
        type=str,
        choices=list(MODEL_FLOPS_PER_SAMPLE.keys()),
        help="Type of model",
    )
    parser.add_argument(
        "--num_accelerators",
        type=int,
        required=True,
        help="Number of GPUs/TPUs used for training",
    )
    parser.add_argument(
        "--accelerator_type",
        type=str,
        choices=ACCELERATORS,
        # Fixed help text: this flag selects the accelerator type, not a count.
        help="Type of accelerator used for training",
    )
    parser.add_argument(
        "--precision",
        type=str,
        choices=["bf16", "fp8"],
        default="bf16",
        help="Precision using during training",
    )
    parser.add_argument(
        "--start_step",
        type=int,
        required=False,
        default=10,
        help="Start step to compute the training step time",
    )
    parser.add_argument(
        "--end_step",
        type=int,
        required=False,
        default=30,
        # Fixed help text: this is the end step, not the start step.
        help="End step to compute the training step time",
    )
    return parser.parse_args()
def compute_mfu(
    step_time: float,
    max_tflops: float,  # was annotated `str`, but it is used as a number below
    num_accelerators: int,
    model_flops_per_sample: float,
    batch_size: int,
) -> float:
    """Computes the Model FLOPS Utilization (MFU).

    Args:
        step_time (float): forward + backward step time in seconds
        max_tflops (float): Max theoretical TFLOPs supported by the accelerator used
        num_accelerators (int): Number of accelerators used during the training process
        model_flops_per_sample (float): Number of FLOPS for a single sample training step
        batch_size (int): Global batch size used during training

    Returns:
        float: the Model FLOPS Utilization (MFU), i.e. achieved TFLOPS per
        accelerator divided by the theoretical maximum.
    """
    # Achieved TFLOPS per accelerator: total FLOPS for one global batch,
    # divided by the step time, number of devices, and 1e12 (FLOPS -> TFLOPS).
    tflops_per_accelerator = (
        model_flops_per_sample * batch_size / step_time / num_accelerators / 1e12
    )
    mfu = tflops_per_accelerator / max_tflops
    print(f"Average step time: {step_time:.8f}")
    print(f"TFLOPS/Accelerator: {tflops_per_accelerator:.8f}")
    print(f"MFU: {mfu:.8f}")
    return mfu
def get_average_step_time(file: str, start_step: int, end_step: int) -> float:
    """Computes the average step time from a dllogger json file
    between the steps start_step and end_step, both included.

    Args:
        file (str): path to the dllogger file to use
        start_step (int): first training step (inclusive) included in the average
        end_step (int): last training step (inclusive) included in the average

    Returns:
        float: average step time between the steps start_step and end_step

    Raises:
        ValueError: if no step in [start_step, end_step] is found in the file.
    """
    with open(file, "r", encoding="utf-8") as f:
        # Each DLLogger line carries a 4-character prefix ("DLL ") before the
        # JSON payload — strip it before parsing.
        records = [json.loads(line[4:]) for line in f]
    time_step_accumulator = 0.0
    num_steps = 0
    for record in records:
        step = record.get("step")
        # "PARAMETER" entries describe the run configuration, not a training step.
        if step != "PARAMETER" and start_step <= step <= end_step:
            time_step_accumulator += record["data"].get("train_step_timing in s")
            num_steps += 1
    if num_steps == 0:
        raise ValueError(
            "Make sure your dllogger.json file contains steps in the range of --start_step and --end_step"
        )
    return time_step_accumulator / num_steps
def main(args):
    """Main processing: resolves model/accelerator FLOPS, computes the
    average step time from the DLLogger file, and prints the MFU.

    Args:
        args (argparse.Namespace): parsed command-line arguments.
    """
    if args.model_type is None and args.model_flops is None:
        print("Either the --model_type or --model_flops is needed")
        return
    if args.accelerator_type is None and args.max_flops is None:
        print("Either the --accelerator_type or --max_flops is needed")
        return
    # Use `is not None` rather than truthiness: an explicit `--model_flops 0`
    # previously fell through to MODEL_FLOPS_PER_SAMPLE[None] and crashed
    # with a KeyError when --model_type was omitted.
    model_flops_per_sample = (
        args.model_flops
        if args.model_flops is not None
        else MODEL_FLOPS_PER_SAMPLE[args.model_type]
    )
    max_tflops = (
        args.max_flops
        if args.max_flops is not None
        else MAX_TFLOPS[(args.accelerator_type, args.precision)]
    )
    average_step_time = get_average_step_time(
        args.file, start_step=args.start_step, end_step=args.end_step
    )
    compute_mfu(
        step_time=average_step_time,
        max_tflops=max_tflops,
        num_accelerators=args.num_accelerators,
        model_flops_per_sample=model_flops_per_sample,
        batch_size=args.batch_size,
    )
if __name__ == "__main__":
    # Script entry point: parse CLI flags and run the metrics computation.
    main(parse_args())