# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
from transformers import PretrainedConfig, Qwen2Config, LlamaConfig

VALID_CONFIG_TYPE = (Qwen2Config, LlamaConfig)


def get_device_flops(unit="T"):
    """Return the promised peak FLOPS of the current device, converted to `unit`."""

    def unit_convert(number, level):
        units = ["B", "K", "M", "G", "T", "P"]
        if number <= 0:
            return number
        ptr = 0
        while ptr < len(units) and units[ptr] != level:
            number /= 1000
            ptr += 1
        return number

    device_name = torch.cuda.get_device_name()
    flops = float("inf")  # infinite FLOPS for unknown GPU types
    if "H100" in device_name or "H800" in device_name:
        flops = 989e12
    elif "A100" in device_name or "A800" in device_name:
        flops = 312e12
    elif "L40" in device_name:
        flops = 181.05e12
    elif "L20" in device_name:
        flops = 119.5e12
    elif "H20" in device_name:
        flops = 148e12
    elif "910B" in device_name:
        flops = 354e12
    flops_unit = unit_convert(flops, unit)
    return flops_unit
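# For illustration only (the result depends on the local device): on an A100 the
# table above yields 312e12, so get_device_flops("T") returns 312.0 and
# get_device_flops("G") returns 312000.0; unrecognized devices return float("inf").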


class FlopsCounter:
    """
    Used to estimate the model FLOPs utilization (MFU) during the training loop.

    Example:
        flops_counter = FlopsCounter(config)
        flops_achieved, flops_promised = flops_counter.estimate_flops(tokens_list, delta_time)
    """

    def __init__(self, config: PretrainedConfig):
        if not isinstance(config, VALID_CONFIG_TYPE):
            print(f"Only config types in {VALID_CONFIG_TYPE} are supported, but got {type(config)}. "
                  f"MFU will always be zero.")

        # LLaMA shares the Qwen2 dense/attention structure, so it reuses the same estimate.
        self.estimate_func = {"qwen2": self._estimate_qwen2_flops, "llama": self._estimate_qwen2_flops}
        self.config = config

    def _estimate_unknown_flops(self, tokens_sum, batch_seqlens, delta_time):
        # Fallback for unsupported model types; MFU will always read as zero.
        return 0

    def _estimate_qwen2_flops(self, tokens_sum, batch_seqlens, delta_time):
        assert isinstance(self.config, (Qwen2Config, LlamaConfig))
        hidden_size = self.config.hidden_size
        vocab_size = self.config.vocab_size
        num_hidden_layers = self.config.num_hidden_layers
        num_key_value_heads = self.config.num_key_value_heads
        num_attention_heads = self.config.num_attention_heads
        intermediate_size = self.config.intermediate_size

        head_dim = hidden_size // num_attention_heads
        q_size = num_attention_heads * head_dim
        k_size = num_key_value_heads * head_dim
        v_size = num_key_value_heads * head_dim

        # Non-attention parameters per layer.
        # Qwen2/Llama use SwiGLU, so the MLP has gate, up and down projections.
        mlp_N = hidden_size * intermediate_size * 3
        attn_linear_N = hidden_size * (q_size + k_size + v_size + num_attention_heads * head_dim)
        emd_and_lm_head_N = vocab_size * hidden_size * 2
        # Non-attention parameters over all layers, plus embedding and LM head.
        dense_N = (mlp_N + attn_linear_N) * num_hidden_layers + emd_and_lm_head_N
        # Dense forward + backward FLOPs over all tokens:
        # 2 FLOPs per parameter per token forward and roughly 4 backward, hence the factor 6.
        dense_N_flops = 6 * dense_N * tokens_sum

        # Attention forward + backward FLOPs over all layers and tokens:
        # QK^T and the attention-weighted value product are each 2 * seqlen^2 * head_dim
        # FLOPs per head forward; tripling for forward + backward gives the factor 12.
        seqlen_square_sum = 0
        for seqlen in batch_seqlens:
            seqlen_square_sum += seqlen * seqlen
        attn_qkv_flops = 12 * seqlen_square_sum * head_dim * num_attention_heads * num_hidden_layers

        # Total forward + backward FLOPs over all layers and tokens.
        flops_all_token = dense_N_flops + attn_qkv_flops
        # Achieved throughput in TFLOPS.
        flops_achieved = flops_all_token * (1.0 / delta_time) / 1e12
        return flops_achieved

    def estimate_flops(self, batch_seqlens, delta_time):
        """
        Estimate the achieved and promised throughput, in TFLOPS, for the current batch.

        Args:
            batch_seqlens (List[int]): Number of valid tokens in each sequence of the current batch.
            delta_time (float): The time taken to process the batch, in seconds.

        Returns:
            estimated_flops (float): The achieved throughput in TFLOPS, estimated from the input tokens and time.
            promised_flops (float): The theoretical peak throughput of the current device, in TFLOPS.
        """
        tokens_sum = sum(batch_seqlens)
        func = self.estimate_func.get(self.config.model_type, self._estimate_unknown_flops)
        estimated_flops = func(tokens_sum, batch_seqlens, delta_time)
        promised_flops = get_device_flops()
        return estimated_flops, promised_flops
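

# Minimal usage sketch with illustrative, hypothetical values: build a small Qwen2
# config, pretend one batch of three sequences took 0.5 s, and report achieved vs.
# promised TFLOPS. It assumes a visible CUDA device, since get_device_flops() calls
# torch.cuda.get_device_name().
if __name__ == "__main__":
    config = Qwen2Config(
        hidden_size=1024,
        intermediate_size=2816,
        num_hidden_layers=8,
        num_attention_heads=16,
        num_key_value_heads=16,
        vocab_size=32000,
    )
    flops_counter = FlopsCounter(config)
    achieved, promised = flops_counter.estimate_flops([512, 1024, 2048], delta_time=0.5)
    print(f"achieved: {achieved:.2f} TFLOPS, promised: {promised} TFLOPS, MFU: {achieved / promised:.2%}")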