From b054c2cbdfc9b83c03d004a2a1fcaa732ec32c80 Mon Sep 17 00:00:00 2001
From: fsx950223
Date: Thu, 14 Nov 2024 08:24:39 +0000
Subject: [PATCH 01/23] add fp8 tuning config

---
 .../tensile_config_generator.py               | 196 +++++++++++-------
 1 file changed, 126 insertions(+), 70 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index da8ff1859d..2f1aec0917 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -61,13 +61,17 @@
     help="If enabled, only tune the matrix instruction with min tile sizes, else, tune full matrix instructions")

 parser.add_argument(
-    "--groups", type=bool, default=True,
+    "--groups", type=bool, default=False,
     help="If enabled, will replace MatrixInstruction with GroupedMatrixInstruction")

 parser.add_argument(
     "--gridbase_config", type=str, default=None,
     help="Range config path")

+parser.add_argument(
+    "--full_mfma", type=bool, default=False,
+    help="If enabled, will search for all mfma instructions")
+
 args = parser.parse_args()

 NUM_WARM_UP = 20
@@ -81,6 +85,8 @@
 res = subprocess.run("rocminfo | grep Compute", stdout=subprocess.PIPE, shell=True, env={"ROCR_VISIBLE_DEVICES":"0"})
 match = re.search(CU_RE, res.stdout.decode("utf-8").split('\n')[-2])
 NUM_STAGES = 8
+DIV_MI = 3 # 33.3%
+MIN_MI = 5 # min 5 solutions
 CU = 0
 if match:
     CU = int(match.group('COMPUTE_UNIT').strip())
@@ -103,10 +109,18 @@
 DeviceNames = ["Device 0050", "Device 0051", "Device 0052", "Device 0054", "Device 0062", "Device 7400", "Device 740c"]
 ScheduleName = "aldebaran"

-fp16_instructions = [[16,16,16,1]]
-bf16_instructions = [[16,16,8,1]]
-tf32_instructions = [[16,16,8,1]]
-fp32_instructions = [[16,16,4,1]]
+if args.full_mfma:
+    fp16_instructions = [[32,32,4,2], [32,32,8,1], [16,16,4,4], [16,16,16,1], [4,4,4,16]]
+    bf16_instructions = [[32,32,4,2], [32,32,8,1], [16,16,4,4], [16,16,16,1], [4,4,4,16]]
+    tf32_instructions = [[32,32,2,2], [32,32,4,1], [16,16,2,4], [16,16,8,1], [4,4,2,16]]
+    fp32_instructions = [[32,32,1,2], [32,32,2,1], [16,16,1,4], [16,16,4,1], [4,4,1,16]]
+    fp8_instructions = [[32,32,16,1], [16,16,32,1]]
+else:
+    fp16_instructions = [[16,16,16,1]]
+    bf16_instructions = [[16,16,16,1],[32,32,8,1]]
+    tf32_instructions = [[16,16,8,1]]
+    fp32_instructions = [[16,16,4,1]]
+    fp8_instructions = [[16,16,32,1]]

 HIPBLASLT_BENCH_RE = (
     r"(?P<FUNCTION>\w+) --api_method c "
     r"-m (?P<M>[\d ]+)"
     r"-n (?P<N>[\d ]+)"
     r"-k (?P<K>[\d ]+)"
     r"--lda (?P<LDA>[\d ]+)"
     r"--ldb (?P<LDB>[\d ]+)"
     r"--ldc (?P<LDC>[\d ]+)"
     r"--ldd (?P<LDD>[\d ]+)"
     r"--stride_a (?P<STRIDE_A>[\d ]+)"
     r"--stride_b (?P<STRIDE_B>[\d ]+)"
     r"--stride_c (?P<STRIDE_C>[\d ]+)"
     r"--stride_d (?P<STRIDE_D>[\d ]+)"
     r"--alpha (?P<ALPHA>[\d\. ]+)"
     r"--beta (?P<BETA>[\d\. ]+)"
     r"--transA (?P<TRANS_A>[\w ]+)"
     r"--transB (?P<TRANS_B>[\w ]+)"
     r"--batch_count (?P<BATCH_COUNT>[\d ]+)"
     r"--a_type (?P<A_TYPE>[\w ]+)"
     r"--b_type (?P<B_TYPE>[\w ]+)"
     r"--c_type (?P<C_TYPE>[\w ]+)"
     r"--d_type (?P<D_TYPE>[\w ]+)"
     r"--scale_type (?P<SCALE_TYPE>[\w ]+)"
     r"--bias_type (?P<BIAS_TYPE>[\w ]+)"
     r"--compute_type (?P<COMPUTE_TYPE>[\w ]+)")

+HIPBLASLT_BENCH_RE2 = (
+    r"(?P<FUNCTION>\w+) --api_method c "
+    r"-m (?P<M>[\d ]+)"
+    r"-n (?P<N>[\d ]+)"
+    r"-k (?P<K>[\d ]+)"
+    r"--lda (?P<LDA>[\d ]+)"
+    r"--ldb (?P<LDB>[\d ]+)"
+    r"--ldc (?P<LDC>[\d ]+)"
+    r"--ldd (?P<LDD>[\d ]+)"
+    r"--stride_a (?P<STRIDE_A>[\d ]+)"
+    r"--stride_b (?P<STRIDE_B>[\d ]+)"
+    r"--stride_c (?P<STRIDE_C>[\d ]+)"
+    r"--stride_d (?P<STRIDE_D>[\d ]+)"
+    r"--alpha (?P<ALPHA>[\d\. ]+)"
+    r"--beta (?P<BETA>[\d\. ]+)"
+    r"--transA (?P<TRANS_A>[\w ]+)"
+    r"--transB (?P<TRANS_B>[\w ]+)"
+    r"--batch_count (?P<BATCH_COUNT>[\d ]+)"
+    r"--scaleA (?P<SCALE_A>[\w ]+)"
+    r"--scaleB (?P<SCALE_B>[\w ]+)"
+    r"--bias_vector --bias_source (?P<BIAS_SOURCE>[\w ]+)"
+    r"--a_type (?P<A_TYPE>[\w ]+)"
+    r"--b_type (?P<B_TYPE>[\w ]+)"
+    r"--c_type (?P<C_TYPE>[\w ]+)"
+    r"--d_type (?P<D_TYPE>[\w ]+)"
+    r"--scale_type (?P<SCALE_TYPE>[\w ]+)"
+    r"--bias_type (?P<BIAS_TYPE>[\w ]+)"
+    r"--compute_type (?P<COMPUTE_TYPE>[\w ]+)")
+
 # Function to extract problem sizes from a line
 def extract_problem_size(match):
     return [int(match.group('M').strip()), int(match.group('N').strip()), int(match.group('BATCH_COUNT').strip()), int(match.group('K').strip())]
@@ -149,6 +192,8 @@ def instruction_map(dtype_dict):
         return fp16_instructions
     elif dtype_dict["DataType"] == 'B':
         return bf16_instructions
+    elif dtype_dict["DataType"] == 'F8':
+        return fp8_instructions
     else:
         return None
@@ -161,6 +206,8 @@ def datatype_map(dtype):
         return "XS"
     elif dtype == "bf16_r":
         return "B"
+    elif dtype == "f8_r":
+        return "F8"
     else:
         return None
@@ -178,7 +225,7 @@ def extract_dtype(match):
     ComputeDataType = datatype_map(match.group('COMPUTE_TYPE').strip())
     TransposeA = trans_map(match.group('TRANS_A').strip())
     TransposeB = trans_map(match.group('TRANS_B').strip())
-    if DataType in ["H", "B"]:
+    if DataType in ["H", "B", "F8"]:
         HighPrecisionAccumulate = True
     else:
         HighPrecisionAccumulate = False
@@ -343,9 +390,14 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 # Read problem sizes from the input file
 with open(args.hipblaslt_log, 'r') as f:
     for line in f:
-        match = re.search(
-            HIPBLASLT_BENCH_RE, line
-        )
+        if 'f8_r' in line:
+            match = re.search(
+                HIPBLASLT_BENCH_RE2, line
+            )
+        else:
+            match = re.search(
+                HIPBLASLT_BENCH_RE, line
+            )
         if match:
             if line in unique_gemms:
                 unique_gemms[line] += 1
@@ -369,10 +421,16 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
     n_sum = 0
     batch_sum = 0
     k_sum = 0
+
     for k, v in unique_gemms_subgroup:
-        match = re.search(
-            HIPBLASLT_BENCH_RE, k
-        )
+        if 'f8_r' in k:
+            match = re.search(
+                HIPBLASLT_BENCH_RE2, k
+            )
+        else:
+            match = re.search(
+                HIPBLASLT_BENCH_RE, k
+            )
         if match:
             size = extract_problem_size(match)
@@ -382,51 +440,49 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
             dtype_str = json.dumps(dtype)
             if mfma_instructions is None:
                 continue
+
             mfma_instruction_found = False
-            mfma_instruction = mfma_instructions[0]
-            for _ in range(NUM_STAGES):
-                matmul_instruction_gen = list(find_matmul_instruction(mfma_instruction, size))
-                if args.groups:
-                    mi_groups0, mi_groups1, matmul_instruction_gen = get_groups(matmul_instruction_gen)
-                else:
-                    mi_groups0 = []
-                    mi_groups1 = []
-
-                DIV_MI = 3 # 33.3%
-                MIN_MI = 5 # min 5 solutions
-
-                total_inst = min(len(matmul_instruction_gen) // DIV_MI, MIN_MI) # At least 5 insts and max of 33.3% of insts.
-                for index, matmul_instruction in enumerate(matmul_instruction_gen):
-                    if matmul_instruction is not None:
-                        if dtype_str not in matmul_instructions:
-                            matmul_instructions[dtype_str] = dict()
-                        matmul_instructions[dtype_str][str(matmul_instruction)] = matmul_instruction
+            for mfma_instruction in mfma_instructions:
+                for _ in range(NUM_STAGES):
+                    matmul_instruction_gen = list(find_matmul_instruction(mfma_instruction, size))
+                    if args.groups:
+                        mi_groups0, mi_groups1, matmul_instruction_gen = get_groups(matmul_instruction_gen)
+                    else:
+                        mi_groups0 = []
+                        mi_groups1 = []
+
+                    total_inst = min(len(matmul_instruction_gen) // DIV_MI, MIN_MI) # At least 5 insts and max of 33.3% of insts.
+                    for index, matmul_instruction in enumerate(matmul_instruction_gen):
+                        if matmul_instruction is not None:
+                            if dtype_str not in matmul_instructions:
+                                matmul_instructions[dtype_str] = dict()
+                            matmul_instructions[dtype_str][str(matmul_instruction)] = matmul_instruction
+                        if args.fast and (index > total_inst):
+                            break
+                    total_inst = min(len(mi_groups0) // DIV_MI, MIN_MI)
+                    for index, mi_0 in enumerate(mi_groups0):
+                        if dtype_str not in groups:
+                            groups[dtype_str] = [{},{}]
+                            groups[dtype_str][0]["MatrixInstruction"] = {}
+                            groups[dtype_str][1]["MatrixInstruction"] = {}
+                        groups[dtype_str][0]["MatrixInstruction"][str(mi_0)] = mi_0
                         if args.fast and (index > total_inst):
                             break
-                total_inst = min(len(mi_groups0) // DIV_MI, MIN_MI)
-                for index, mi_0 in enumerate(mi_groups0):
-                    if dtype_str not in groups:
-                        groups[dtype_str] = [{},{}]
-                        groups[dtype_str][0]["MatrixInstruction"] = {}
-                        groups[dtype_str][1]["MatrixInstruction"] = {}
-                    groups[dtype_str][0]["MatrixInstruction"][str(mi_0)] = mi_0
-                    if args.fast and (index > total_inst):
-                        break
-                total_inst = min(len(mi_groups1) // DIV_MI, MIN_MI)
-                for index, mi_1 in enumerate(mi_groups1):
-                    if dtype_str not in groups:
-                        groups[dtype_str] = [{},{}]
-                        groups[dtype_str][0]["MatrixInstruction"] = {}
-                        groups[dtype_str][1]["MatrixInstruction"] = {}
-                    groups[dtype_str][1]["MatrixInstruction"][str(mi_1)] = mi_1
-                    if args.fast and (index > total_inst):
+                    total_inst = min(len(mi_groups1) // DIV_MI, MIN_MI)
+                    for index, mi_1 in enumerate(mi_groups1):
+                        if dtype_str not in groups:
+                            groups[dtype_str] = [{},{}]
+                            groups[dtype_str][0]["MatrixInstruction"] = {}
+                            groups[dtype_str][1]["MatrixInstruction"] = {}
+                        groups[dtype_str][1]["MatrixInstruction"][str(mi_1)] = mi_1
+                        if args.fast and (index > total_inst):
+                            break
+                    if len(matmul_instruction_gen) > 0 or len(mi_groups0) > 0 or len(mi_groups1) > 0:
+                        mfma_instruction_found = True
                         break
-                if len(matmul_instruction_gen) > 0 or len(mi_groups0) > 0 or len(mi_groups1) > 0:
-                    mfma_instruction_found = True
-                    break
-                else:
-                    max_dim = int(np.argmax(size))
-                    size[max_dim] = size[max_dim] // 2
+                    else:
+                        max_dim = int(np.argmax(size))
+                        size[max_dim] = size[max_dim] // 2

             if not mfma_instruction_found:
                 print(f"Can't find mfma instructions for {original_size}, please contact hipblaslt expert")
@@ -494,23 +550,23 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
         mfma_instructions = instruction_map(dtype)
         if mfma_instructions is None:
             continue
         mfma_instruction_found = False
-        mfma_instruction = mfma_instructions[0]
-        for _ in range(NUM_STAGES):
-            matmul_instruction_gen = list(find_matmul_instruction(mfma_instruction, size))
-            total_inst = min(len(matmul_instruction_gen) // 3, 5) # At least 5 insts and max of 33.3% of insts.
-            for index, matmul_instruction in enumerate(matmul_instruction_gen):
-                if matmul_instruction is not None:
-                    if dtype_str not in matmul_instructions:
-                        matmul_instructions[dtype_str] = dict()
-                    matmul_instructions[dtype_str][str(matmul_instruction)] = matmul_instruction
-                if args.fast and (index > total_inst):
-                    break
-            if len(matmul_instruction_gen) > 0:
-                mfma_instruction_found = True
-                break
-            else:
-                max_dim = int(np.argmax(size))
-                size[max_dim] = size[max_dim] // 2
+        for mfma_instruction in mfma_instructions:
+            for _ in range(NUM_STAGES):
+                matmul_instruction_gen = list(find_matmul_instruction(mfma_instruction, size))
+                total_inst = min(len(matmul_instruction_gen) // 3, 5) # At least 5 insts and max of 33.3% of insts.
+                for index, matmul_instruction in enumerate(matmul_instruction_gen):
+                    if matmul_instruction is not None:
+                        if dtype_str not in matmul_instructions:
+                            matmul_instructions[dtype_str] = dict()
+                        matmul_instructions[dtype_str][str(matmul_instruction)] = matmul_instruction
+                    if args.fast and (index > total_inst):
+                        break
+                if len(matmul_instruction_gen) > 0:
+                    mfma_instruction_found = True
+                    break
+                else:
+                    max_dim = int(np.argmax(size))
+                    size[max_dim] = size[max_dim] // 2
     if not mfma_instruction_found:
         print(f"Can't find mfma instructions for {original_size}, please contact hipblaslt expert")
     else:
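Editor's note: everything downstream of this patch hinges on the bench-log regexes above, whose `(?P<NAME>...)` group names had to be reconstructed here from the `match.group(...)` calls. A minimal runnable sketch of the matching flow, with the pattern trimmed to the four fields `extract_problem_size()` actually uses:

```python
import re

# Trimmed stand-in for HIPBLASLT_BENCH_RE; group names reconstructed, not verbatim.
BENCH_RE = (
    r"(?P<FUNCTION>\w+) --api_method c "
    r"-m (?P<M>[\d ]+)"
    r"-n (?P<N>[\d ]+)"
    r"-k (?P<K>[\d ]+)"
)

line = "hipblaslt-bench --api_method c -m 128 -n 32 -k 9216 --lda 9216 "
m = re.search(BENCH_RE, line)
if m:
    # extract_problem_size() builds its [M, N, BATCH, K] list the same way
    print([int(m.group(g).strip()) for g in ("M", "N", "K")])  # [128, 32, 9216]
```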
From 4e4952328d23bd8b00604bf1beb7508a671a8a19 Mon Sep 17 00:00:00 2001
From: fsx950223
Date: Fri, 15 Nov 2024 08:26:07 +0000
Subject: [PATCH 02/23] fix bugs

---
 .../Utilities/tensile_generator/tensile_config_generator.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index 2f1aec0917..8566009449 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -84,7 +84,7 @@
 ArchitectureName = res.stdout.decode("utf-8").strip()
 res = subprocess.run("rocminfo | grep Compute", stdout=subprocess.PIPE, shell=True, env={"ROCR_VISIBLE_DEVICES":"0"})
 match = re.search(CU_RE, res.stdout.decode("utf-8").split('\n')[-2])
-NUM_STAGES = 8
+NUM_STAGES = 32
 DIV_MI = 3 # 33.3%
 MIN_MI = 5 # min 5 solutions
 CU = 0
@@ -246,7 +246,7 @@ def find_matmul_instruction(mfma_instruction, size):
             continue
         for n_tiles in reversed(range(1, CU+1)):
             n_tile_size = size[1] // n_tiles
-            if n_tile_size > 256:
+            if n_tile_size > 256 // m_tile_size:
                 continue
             wave_tile_n = math.ceil(n_tile_size / mfma_instruction[1])
             if wave_tile_n <= 0:
                 continue
@@ -307,7 +307,7 @@
     batch_avg = batch_sum / len(unique_gemms_subgroup)
     k_avg = k_sum / len(unique_gemms_subgroup)

-    return (ENQUEUES_PER_SYNC + args.iters) * m_avg * n_avg * batch_avg * k_avg / 2
+    return (ENQUEUES_PER_SYNC + iters) * m_avg * n_avg * batch_avg * k_avg / 2

 def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, iters, groups):
     MinFlopsPerSync = calculate_min_flops(m_sum, n_sum, batch_sum, k_sum, iters)

From ed12d74006f22f8db3995d8d1870191b26da4f06 Mon Sep 17 00:00:00 2001
From: fsx950223
Date: Fri, 15 Nov 2024 10:04:22 +0000
Subject: [PATCH 03/23] fix arch name for mi308 and mi210

---
 tensilelite/Tensile/Utilities/tensile_generator/README.md   | 5 +----
 .../Utilities/tensile_generator/tensile_config_generator.py | 2 +-
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/README.md b/tensilelite/Tensile/Utilities/tensile_generator/README.md
index 183307eb06..dbf56a39f5 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/README.md
+++ b/tensilelite/Tensile/Utilities/tensile_generator/README.md
@@ -60,15 +60,12 @@ To use the `tensile_config_generator.py` script, follow these steps:
    MI308:
    - Modify yamls under ```/3_LibraryLogic/```.
    ```- gfx942 ->
    - {Architecture: gfx942, CUCount: {20|80}}```
    - For cpx, use the gfx942_20cu folder; for spx, use the gfx942_80cu folder.
    ```
    python3 ./tensilelite/Tensile/Utilities/merge.py --no_eff library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/{gfx942_20cu|gfx942_80cu}/{Equality|GridBased}/ /3_LibraryLogic/ library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/{gfx942_20cu|gfx942_80cu}/{Equality|GridBased}/
    ```
    MI210:
-   - Modify yamls under ```/3_LibraryLogic/```.
-   ```- gfx90a ->
-   - {Architecture: gfx90a, CUCount: 104}```
+
    ```
    python3 ./tensilelite/Tensile/Utilities/merge.py --no_eff library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aldebaran/104CU/{Equality|GridBased}/ /3_LibraryLogic/ library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aldebaran/104CU/{Equality|GridBased}/
    ```

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index 8566009449..973df6961c 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -375,7 +375,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
             data["BenchmarkProblems"][i][0] = dtype
     data["LibraryLogic"]["DeviceNames"] = DeviceNames
     data["LibraryLogic"]["ScheduleName"] = ScheduleName
-    data["LibraryLogic"]["ArchitectureName"] = ArchitectureName
+    data["LibraryLogic"]["ArchitectureName"] = {"Architecture": ArchitectureName, "CUCount": CU}
    data["LibraryLogic"]["LibraryType"] = LibraryType
     # Write the updated YAML file
     yaml_file = os.path.basename(yaml_file)
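Editor's note: the change above makes the generated logic YAML carry a CU count alongside the architecture, which is what the README's manual "Modify yamls" step used to do by hand. A rough sketch of the LibraryLogic block dump_yaml() now emits (the gfx942/80 values are made-up detection results for illustration; requires PyYAML):

```python
import yaml

ArchitectureName, CU = "gfx942", 80  # hypothetical rocminfo detection results
library_logic = {
    "ScheduleName": "aldebaran",
    "ArchitectureName": {"Architecture": ArchitectureName, "CUCount": CU},
}
print(yaml.dump(library_logic))
# ArchitectureName:
#   Architecture: gfx942
#   CUCount: 80
# ScheduleName: aldebaran
```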
]+)" + r"--transA (?P[\w ]+)" + r"--transB (?P[\w ]+)" + r"--batch_count (?P[\d ]+)" + r"--bias_vector --bias_source (?P[\w ]+)" + r"--a_type (?P[\w ]+)" + r"--b_type (?P[\w ]+)" + r"--c_type (?P[\w ]+)" + r"--d_type (?P[\w ]+)" + r"--scale_type (?P[\w ]+)" + r"--bias_type (?P[\w ]+)" + r"--compute_type (?P[\w ]+)") -HIPBLASLT_BENCH_RE2 = ( +HIPBLASLT_BENCH_RE_SAB_BIAS = ( r"(?P\w+) --api_method c " r"-m (?P[\d ]+)" r"-n (?P[\d ]+)" @@ -213,18 +267,33 @@ def datatype_map(dtype): def trans_map(trans): if trans == "T": - return True + return 1 elif trans == "N": - return False + return 0 else: return None +def bias_datatype_map(dtype): + if dtype == "f16_r": + return [datatype_map('f32_r'), datatype_map('f16_r')] + elif dtype == "f32_r": + return [datatype_map('f32_r')] + elif dtype == "xf32_r": + return [datatype_map('f32_r'), datatype_map('xf32_r')] + elif dtype == "bf16_r": + return [datatype_map('f32_r'), datatype_map('bf16_r')] + elif dtype == "f8_r": + return [datatype_map('f8_r')] + else: + return [] + def extract_dtype(match): - DataType = datatype_map(match.group('A_TYPE').strip()) - DestDataType = datatype_map(match.group('C_TYPE').strip()) - ComputeDataType = datatype_map(match.group('COMPUTE_TYPE').strip()) - TransposeA = trans_map(match.group('TRANS_A').strip()) - TransposeB = trans_map(match.group('TRANS_B').strip()) + gdict = match.groupdict() + DataType = datatype_map(gdict.get('A_TYPE', '').strip()) + DestDataType = datatype_map(gdict.get('C_TYPE', '').strip()) + ComputeDataType = datatype_map(gdict.get('COMPUTE_TYPE', '').strip()) + TransposeA = trans_map(gdict.get('TRANS_A', '').strip()) + TransposeB = trans_map(gdict.get('TRANS_B', '').strip()) if DataType in ["H", "B", "F8"]: HighPrecisionAccumulate = True else: @@ -233,7 +302,16 @@ def extract_dtype(match): if ComputeDataType == "XS": ComputeDataType = "S" F32XdlMathOp = 'x' - return {"Batched": True, "DataType": DataType, "DestDataType": DestDataType, "ComputeDataType": ComputeDataType, "TransposeA": TransposeA, "TransposeB": TransposeB, "HighPrecisionAccumulate": HighPrecisionAccumulate, "F32XdlMathOp": F32XdlMathOp, "OperationType": "GEMM", "UseBeta": True} + res = {"Batched": True, "DataType": DataType, "DestDataType": DestDataType, "ComputeDataType": ComputeDataType, "TransposeA": TransposeA, "TransposeB": TransposeB, "HighPrecisionAccumulate": HighPrecisionAccumulate, "F32XdlMathOp": F32XdlMathOp, "OperationType": "GEMM", "UseBeta": True} + + if gdict.get("BIAS_SOURCE"): + res["UseBias"] = 1 + res["BiasSrc"] = gdict.get('BIAS_SOURCE', '').strip().upper() + res["BiasDataTypeList"] = list(bias_datatype_map(gdict.get("BIAS_TYPE", '').strip())) + if gdict.get("SCALE_A") is not None and gdict.get("SCALE_B") is not None: + res["UseScaleAB"] = "Scalar" + res["UseScaleAlphaVec"] = 1 + return res def find_matmul_instruction(mfma_instruction, size): for bm in range(int(math.log(mfma_instruction[3],2))+1): @@ -282,6 +360,25 @@ def get_groups(matmul_instruction_gen): mi_left.append(mi) return mi_groups0, mi_groups1, mi_left +def match_pattern(line): + if 'bias_vector' in line and 'scaleA' in line and 'scaleB' in line: + match = re.search( + HIPBLASLT_BENCH_RE_SAB_BIAS, line + ) + elif 'bias_vector' in line: + match = re.search( + HIPBLASLT_BENCH_RE_BIAS, line + ) + elif 'scaleA' in line and 'scaleB' in line: + match = re.search( + HIPBLASLT_BENCH_RE_SAB, line + ) + else: + match = re.search( + HIPBLASLT_BENCH_RE, line + ) + return match + def extract_range(data): shapes = [] if 'Exact' in data: @@ -326,6 +423,8 @@ def 
dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it if i >= len(data["BenchmarkProblems"]): data["BenchmarkProblems"].append(copy.deepcopy(data["BenchmarkProblems"][0])) data["BenchmarkProblems"][i][1]["BenchmarkFinalParameters"][0]["ProblemSizes"] = gemm_group[dtype_str] + if "BiasDataTypeList" in dtype: + data["BenchmarkProblems"][i][1]["BenchmarkFinalParameters"].append({"BiasTypeArgs": list(dtype["BiasDataTypeList"])}) # Add groupd here if needed group_params = [[]] @@ -390,14 +489,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it # Read problem sizes from the input file with open(args.hipblaslt_log, 'r') as f: for line in f: - if 'f8_r' in line: - match = re.search( - HIPBLASLT_BENCH_RE2, line - ) - else: - match = re.search( - HIPBLASLT_BENCH_RE, line - ) + match = match_pattern(line) if match: if line in unique_gemms: unique_gemms[line] += 1 @@ -423,15 +515,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it k_sum = 0 for k, v in unique_gemms_subgroup: - if 'f8_r' in k: - match = re.search( - HIPBLASLT_BENCH_RE2, k - ) - else: - match = re.search( - HIPBLASLT_BENCH_RE, k - ) - + match = match_pattern(k) if match: size = extract_problem_size(match) original_size = copy.deepcopy(size) @@ -440,7 +524,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it dtype_str = json.dumps(dtype) if mfma_instructions is None: continue - + mfma_instruction_found = False for mfma_instruction in mfma_instructions: for _ in range(NUM_STAGES): From 14a4fc66be909eb24f6e6fb02174e0fed4b7ed46 Mon Sep 17 00:00:00 2001 From: fsx950223 Date: Mon, 18 Nov 2024 07:10:00 +0000 Subject: [PATCH 05/23] add full mi --- .../Utilities/tensile_generator/README.md | 6 +++--- .../tensile_config_generator.py | 19 ++++++++++++------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/tensilelite/Tensile/Utilities/tensile_generator/README.md b/tensilelite/Tensile/Utilities/tensile_generator/README.md index dbf56a39f5..d4b4fdeb84 100644 --- a/tensilelite/Tensile/Utilities/tensile_generator/README.md +++ b/tensilelite/Tensile/Utilities/tensile_generator/README.md @@ -48,7 +48,7 @@ To use the `tensile_config_generator.py` script, follow these steps: 3. Install hipBLASLt and Tensile (change the path to the hipBLASLt repo): ``` - bash ./install.sh -idc -a $(/opt/rocm/llvm/bin/offload-arch) --keep-build-tmp + bash ./install.sh -idc -a $(/opt/rocm/llvm/bin/offload-arch) --cpu_ref_lib=lapack ``` 4. Tune GEMM kernels using the generated YAML files: @@ -65,14 +65,14 @@ To use the `tensile_config_generator.py` script, follow these steps: python3 ./tensilelite/Tensile/Utilities/merge.py --no_eff library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/{gfx942_20cu|gfx942_80cu}/{Equality|GridBased}/ /3_LibraryLogic/ library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/{gfx942_20cu|gfx942_80cu}/{Equality|GridBased}/ ``` MI210: - + ``` python3 ./tensilelite/Tensile/Utilities/merge.py --no_eff library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aldebaran/104CU/{Equality|GridBased}/ /3_LibraryLogic/ library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aldebaran/104CU/{Equality|GridBased}/ ``` 6. 
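Editor's note: the switch to `match.groupdict()` above is what lets one `extract_dtype()` serve all four pattern variants. A toy demonstration of the mechanism (names mirror the patch; the patterns are cut down so it runs standalone):

```python
import re

# A pattern variant without the bias group simply yields no BIAS_SOURCE key,
# so gdict.get() keeps a single code path for every regex variant.
pat_bias = r"--bias_vector --bias_source (?P<BIAS_SOURCE>\w+)"
pat_plain = r"--transA (?P<TRANS_A>\w+)"

gdict = re.search(pat_plain, "--transA T").groupdict()
print(gdict.get("BIAS_SOURCE"))  # None -> the bias branch is skipped

gdict = re.search(pat_bias, "--bias_vector --bias_source d").groupdict()
print(gdict["BIAS_SOURCE"].upper())  # 'D' -> becomes res["BiasSrc"]
```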
From 14a4fc66be909eb24f6e6fb02174e0fed4b7ed46 Mon Sep 17 00:00:00 2001
From: fsx950223
Date: Mon, 18 Nov 2024 07:10:00 +0000
Subject: [PATCH 05/23] add full mi

---
 .../Utilities/tensile_generator/README.md |  6 +++---
 .../tensile_config_generator.py           | 19 ++++++++++++-------
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/README.md b/tensilelite/Tensile/Utilities/tensile_generator/README.md
index dbf56a39f5..d4b4fdeb84 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/README.md
+++ b/tensilelite/Tensile/Utilities/tensile_generator/README.md
@@ -48,7 +48,7 @@ To use the `tensile_config_generator.py` script, follow these steps:

 3. Install hipBLASLt and Tensile (change the path to the hipBLASLt repo):
    ```
-   bash ./install.sh -idc -a $(/opt/rocm/llvm/bin/offload-arch) --keep-build-tmp
+   bash ./install.sh -idc -a $(/opt/rocm/llvm/bin/offload-arch) --cpu_ref_lib=lapack
    ```

 4. Tune GEMM kernels using the generated YAML files:
@@ -65,14 +65,14 @@ To use the `tensile_config_generator.py` script, follow these steps:
    python3 ./tensilelite/Tensile/Utilities/merge.py --no_eff library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/{gfx942_20cu|gfx942_80cu}/{Equality|GridBased}/ /3_LibraryLogic/ library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/{gfx942_20cu|gfx942_80cu}/{Equality|GridBased}/
    ```
    MI210:
-
+
    ```
    python3 ./tensilelite/Tensile/Utilities/merge.py --no_eff library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aldebaran/104CU/{Equality|GridBased}/ /3_LibraryLogic/ library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aldebaran/104CU/{Equality|GridBased}/
    ```

 6. Rebuild hipBLASLt with the merged results:
    ```
-   bash ./install.sh -idc -a $(/opt/rocm/llvm/bin/offload-arch) --keep-build-tmp
+   bash ./install.sh -idc -a $(/opt/rocm/llvm/bin/offload-arch) --cpu_ref_lib=lapack
    ```

 For more detailed information on the script's functionality and advanced usage, please refer to the comments within the `tensile_config_generator.py` file.

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index 244f84dfd8..d9e260d54a 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
 parser.add_argument(
     "--full_mfma", type=bool, default=False,
     help="If enabled, will search for all mfma instructions")

+parser.add_argument(
+    "--full_mi", type=bool, default=False,
+    help="If enabled, will search for all mi instructions")
+
 args = parser.parse_args()

             if mfma_instructions is None:
                 continue
-            mfma_instruction_found = False
+            matmul_instruction_found = False
             for mfma_instruction in mfma_instructions:
                 for _ in range(NUM_STAGES):

                     if len(matmul_instruction_gen) > 0 or len(mi_groups0) > 0 or len(mi_groups1) > 0:
-                        mfma_instruction_found = True
-                        break
+                        matmul_instruction_found = True
+                        if not args.full_mi:
+                            break
                     else:
                         max_dim = int(np.argmax(size))
                         size[max_dim] = size[max_dim] // 2

-            if not mfma_instruction_found:
+            if not matmul_instruction_found:
                 print(f"Can't find mfma instructions for {original_size}, please contact hipblaslt expert")
             else:
                 if dtype_str in gemm_group:

         mfma_instructions = instruction_map(dtype)
         if mfma_instructions is None:
             continue
-        mfma_instruction_found = False
+        matmul_instruction_found = False
         for mfma_instruction in mfma_instructions:
             for _ in range(NUM_STAGES):
                 matmul_instruction_gen = list(find_matmul_instruction(mfma_instruction, size))

                     if args.fast and (index > total_inst):
                         break
                 if len(matmul_instruction_gen) > 0:
-                    mfma_instruction_found = True
+                    matmul_instruction_found = True
                     break
                 else:
                     max_dim = int(np.argmax(size))
                     size[max_dim] = size[max_dim] // 2
-    if not mfma_instruction_found:
+    if not matmul_instruction_found:
         print(f"Can't find mfma instructions for {original_size}, please contact hipblaslt expert")
     else:
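Editor's note: a caveat worth knowing about the `type=bool` flags used throughout this series (`--groups`, `--full_mfma`, `--full_mi`): argparse passes the raw string through `bool()`, and any non-empty string is truthy, so `--full_mfma False` still enables the option. A short sketch of the usual workaround:

```python
import argparse

parser = argparse.ArgumentParser()
# store_true gives a real on/off switch: absent -> False, present -> True.
parser.add_argument("--full_mfma", action="store_true",
                    help="If enabled, will search for all mfma instructions")

print(parser.parse_args([]).full_mfma)               # False
print(parser.parse_args(["--full_mfma"]).full_mfma)  # True
```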
help="If enabled, will search for all mfma instructions") parser.add_argument( - "--full_mi", type=bool, default=False, + "--full_stage", type=bool, default=False, help="If enabled, will search for all mi instructions") +parser.add_argument( + "--num_stages", type=int, default=8, + help="How many times to divide matrix") + args = parser.parse_args() NUM_WARM_UP = 20 @@ -88,7 +92,7 @@ ArchitectureName = res.stdout.decode("utf-8").strip() res = subprocess.run("rocminfo | grep Compute", stdout=subprocess.PIPE, shell=True, env={"ROCR_VISIBLE_DEVICES":"0"}) match = re.search(CU_RE, res.stdout.decode("utf-8").split('\n')[-2]) -NUM_STAGES = 32 +NUM_STAGES = args.num_stages DIV_MI = 3 # 33.3% MIN_MI = 5 # min 5 solutions CU = 0 @@ -124,7 +128,7 @@ bf16_instructions = [[16,16,16,1],[32,32,8,1]] tf32_instructions = [[16,16,8,1]] fp32_instructions = [[16,16,4,1]] - fp8_instructions = [[16,16,32,1]] + fp8_instructions = [[32,32,16,1], [16,16,32,1]] HIPBLASLT_BENCH_RE = ( @@ -328,7 +332,7 @@ def find_matmul_instruction(mfma_instruction, size): continue for n_tiles in reversed(range(1, CU+1)): n_tile_size = size[1] // n_tiles - if n_tile_size > 256 // m_tile_size: + if n_tile_size > 256: continue wave_tile_n = math.ceil(n_tile_size / mfma_instruction[1]) if wave_tile_n <= 0: @@ -567,10 +571,10 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it break if len(matmul_instruction_gen) > 0 or len(mi_groups0) > 0 or len(mi_groups1) > 0: matmul_instruction_found = True - if not args.full_mi: + if not args.full_stage: break else: - max_dim = int(np.argmax(size)) + max_dim = int(np.argmax(size[:2])) size[max_dim] = size[max_dim] // 2 if not matmul_instruction_found: From 3c5fb0a47218242f100b76662c465cd391141878 Mon Sep 17 00:00:00 2001 From: fsx950223 Date: Tue, 19 Nov 2024 06:05:12 +0000 Subject: [PATCH 07/23] fix bugs --- .../tensile_generator/tensile_config_generator.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py index 1789ef49c9..fa2dbb3509 100644 --- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py +++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py @@ -95,6 +95,7 @@ NUM_STAGES = args.num_stages DIV_MI = 3 # 33.3% MIN_MI = 5 # min 5 solutions +NONTEMPORALRATIO = 8 CU = 0 if match: CU = int(match.group('COMPUTE_UNIT').strip()) @@ -283,13 +284,13 @@ def trans_map(trans): def bias_datatype_map(dtype): if dtype == "f16_r": - return [datatype_map('f32_r'), datatype_map('f16_r')] + return [datatype_map('f16_r')] elif dtype == "f32_r": return [datatype_map('f32_r')] elif dtype == "xf32_r": - return [datatype_map('f32_r'), datatype_map('xf32_r')] + return [datatype_map('xf32_r')] elif dtype == "bf16_r": - return [datatype_map('f32_r'), datatype_map('bf16_r')] + return [datatype_map('bf16_r')] elif dtype == "f8_r": return [datatype_map('f8_r')] else: @@ -352,7 +353,6 @@ def find_matmul_instruction(mfma_instruction, size): def get_groups(matmul_instruction_gen): # Extract skinny MTs for Groups - NONTEMPORALRATIO = 8 mi_groups0 = [] mi_groups1 = [] mi_left = [] From 7e4a03547e78cc5ff4683169e53b016aae888e75 Mon Sep 17 00:00:00 2001 From: fsx950223 Date: Tue, 19 Nov 2024 08:21:41 +0000 Subject: [PATCH 08/23] add activation pattern --- .../tensile_config_generator.py | 245 ++++++++++++------ 1 file changed, 159 insertions(+), 86 deletions(-) diff 
From 7e4a03547e78cc5ff4683169e53b016aae888e75 Mon Sep 17 00:00:00 2001
From: fsx950223
Date: Tue, 19 Nov 2024 08:21:41 +0000
Subject: [PATCH 08/23] add activation pattern

---
 .../tensile_config_generator.py | 245 ++++++++++++------
 1 file changed, 159 insertions(+), 86 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index fa2dbb3509..c692eef4b4 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -131,8 +131,7 @@
     fp32_instructions = [[16,16,4,1]]
     fp8_instructions = [[32,32,16,1], [16,16,32,1]]

-
-HIPBLASLT_BENCH_RE = (
+HIPBLASLT_BENCH_BASE = (
     r"(?P<FUNCTION>\w+) --api_method c "
     r"-m (?P<M>[\d ]+)"
     r"-n (?P<N>[\d ]+)"
     r"-k (?P<K>[\d ]+)"
     r"--lda (?P<LDA>[\d ]+)"
     r"--ldb (?P<LDB>[\d ]+)"
     r"--ldc (?P<LDC>[\d ]+)"
     r"--ldd (?P<LDD>[\d ]+)"
     r"--stride_a (?P<STRIDE_A>[\d ]+)"
     r"--stride_b (?P<STRIDE_B>[\d ]+)"
     r"--stride_c (?P<STRIDE_C>[\d ]+)"
     r"--stride_d (?P<STRIDE_D>[\d ]+)"
     r"--alpha (?P<ALPHA>[\d\. ]+)"
     r"--beta (?P<BETA>[\d\. ]+)"
     r"--transA (?P<TRANS_A>[\w ]+)"
     r"--transB (?P<TRANS_B>[\w ]+)"
     r"--batch_count (?P<BATCH_COUNT>[\d ]+)"
-    r"--a_type (?P<A_TYPE>[\w ]+)"
-    r"--b_type (?P<B_TYPE>[\w ]+)"
-    r"--c_type (?P<C_TYPE>[\w ]+)"
-    r"--d_type (?P<D_TYPE>[\w ]+)"
-    r"--scale_type (?P<SCALE_TYPE>[\w ]+)"
-    r"--bias_type (?P<BIAS_TYPE>[\w ]+)"
-    r"--compute_type (?P<COMPUTE_TYPE>[\w ]+)")
+)

+# Optional patterns for scale and bias
+SCALE_PATTERN = r"--scaleA (?P<SCALE_A>[\w ]+)--scaleB (?P<SCALE_B>[\w ]+)"
+BIAS_PATTERN = r"--bias_vector --bias_source (?P<BIAS_SOURCE>[\w ]+)"
+ACTIVATION_PATTERN = r"--activation_type (?P<ACTIVATION_TYPE>[\w ]+)"
+
+# Common ending pattern
+TYPE_PATTERN = (
+    r"--a_type (?P<A_TYPE>[\w ]+)"
+    r"--b_type (?P<B_TYPE>[\w ]+)"
+    r"--c_type (?P<C_TYPE>[\w ]+)"
+    r"--d_type (?P<D_TYPE>[\w ]+)"
+    r"--scale_type (?P<SCALE_TYPE>[\w ]+)"
+    r"--bias_type (?P<BIAS_TYPE>[\w ]+)"
+    r"--compute_type (?P<COMPUTE_TYPE>[\w ]+)"
+)
+
+# Build the combined pattern with optional parts
+def build_pattern(has_scale=False, has_bias=False, has_activation=False):
+    pattern = HIPBLASLT_BENCH_BASE
+    if has_scale:
+        pattern += SCALE_PATTERN
+    if has_bias:
+        pattern += BIAS_PATTERN
+    pattern += TYPE_PATTERN
+    if has_activation:
+        pattern += ACTIVATION_PATTERN
+    return pattern
+
+# Create the four variations
+HIPBLASLT_BENCH_RE = build_pattern()
+HIPBLASLT_BENCH_RE_SAB = build_pattern(has_scale=True)
+HIPBLASLT_BENCH_RE_BIAS = build_pattern(has_bias=True)
+HIPBLASLT_BENCH_RE_ACT = build_pattern(has_activation=True)
+HIPBLASLT_BENCH_RE_SAB_ACT = build_pattern(has_scale=True, has_activation=True)
+HIPBLASLT_BENCH_RE_BIAS_ACT = build_pattern(has_bias=True, has_activation=True)
+HIPBLASLT_BENCH_RE_SAB_BIAS = build_pattern(has_scale=True, has_bias=True)
+HIPBLASLT_BENCH_RE_SAB_BIAS_ACT = build_pattern(has_scale=True, has_bias=True, has_activation=True)
+
+
+# HIPBLASLT_BENCH_RE = (
+#     r"(?P<FUNCTION>\w+) --api_method c "
+#     r"-m (?P<M>[\d ]+)"
+#     r"-n (?P<N>[\d ]+)"
+#     r"-k (?P<K>[\d ]+)"
+#     r"--lda (?P<LDA>[\d ]+)"
+#     r"--ldb (?P<LDB>[\d ]+)"
+#     r"--ldc (?P<LDC>[\d ]+)"
+#     r"--ldd (?P<LDD>[\d ]+)"
+#     r"--stride_a (?P<STRIDE_A>[\d ]+)"
+#     r"--stride_b (?P<STRIDE_B>[\d ]+)"
+#     r"--stride_c (?P<STRIDE_C>[\d ]+)"
+#     r"--stride_d (?P<STRIDE_D>[\d ]+)"
+#     r"--alpha (?P<ALPHA>[\d\. ]+)"
+#     r"--beta (?P<BETA>[\d\. ]+)"
+#     r"--transA (?P<TRANS_A>[\w ]+)"
+#     r"--transB (?P<TRANS_B>[\w ]+)"
+#     r"--batch_count (?P<BATCH_COUNT>[\d ]+)"
+#     r"--a_type (?P<A_TYPE>[\w ]+)"
+#     r"--b_type (?P<B_TYPE>[\w ]+)"
+#     r"--c_type (?P<C_TYPE>[\w ]+)"
+#     r"--d_type (?P<D_TYPE>[\w ]+)"
+#     r"--scale_type (?P<SCALE_TYPE>[\w ]+)"
+#     r"--bias_type (?P<BIAS_TYPE>[\w ]+)"
+#     r"--compute_type (?P<COMPUTE_TYPE>[\w ]+)")
+
+# HIPBLASLT_BENCH_RE_SAB = (
+#     r"(?P<FUNCTION>\w+) --api_method c "
+#     r"-m (?P<M>[\d ]+)"
+#     r"-n (?P<N>[\d ]+)"
+#     r"-k (?P<K>[\d ]+)"
+#     r"--lda (?P<LDA>[\d ]+)"
+#     r"--ldb (?P<LDB>[\d ]+)"
+#     r"--ldc (?P<LDC>[\d ]+)"
+#     r"--ldd (?P<LDD>[\d ]+)"
+#     r"--stride_a (?P<STRIDE_A>[\d ]+)"
+#     r"--stride_b (?P<STRIDE_B>[\d ]+)"
+#     r"--stride_c (?P<STRIDE_C>[\d ]+)"
+#     r"--stride_d (?P<STRIDE_D>[\d ]+)"
+#     r"--alpha (?P<ALPHA>[\d\. ]+)"
+#     r"--beta (?P<BETA>[\d\. ]+)"
+#     r"--transA (?P<TRANS_A>[\w ]+)"
+#     r"--transB (?P<TRANS_B>[\w ]+)"
+#     r"--batch_count (?P<BATCH_COUNT>[\d ]+)"
+#     r"--scaleA (?P<SCALE_A>[\w ]+)"
+#     r"--scaleB (?P<SCALE_B>[\w ]+)"
+#     r"--a_type (?P<A_TYPE>[\w ]+)"
+#     r"--b_type (?P<B_TYPE>[\w ]+)"
+#     r"--c_type (?P<C_TYPE>[\w ]+)"
+#     r"--d_type (?P<D_TYPE>[\w ]+)"
+#     r"--scale_type (?P<SCALE_TYPE>[\w ]+)"
+#     r"--bias_type (?P<BIAS_TYPE>[\w ]+)"
+#     r"--compute_type (?P<COMPUTE_TYPE>[\w ]+)")
+
+# HIPBLASLT_BENCH_RE_BIAS = (
+#     r"(?P<FUNCTION>\w+) --api_method c "
+#     r"-m (?P<M>[\d ]+)"
+#     r"-n (?P<N>[\d ]+)"
+#     r"-k (?P<K>[\d ]+)"
+#     r"--lda (?P<LDA>[\d ]+)"
+#     r"--ldb (?P<LDB>[\d ]+)"
+#     r"--ldc (?P<LDC>[\d ]+)"
+#     r"--ldd (?P<LDD>[\d ]+)"
+#     r"--stride_a (?P<STRIDE_A>[\d ]+)"
+#     r"--stride_b (?P<STRIDE_B>[\d ]+)"
+#     r"--stride_c (?P<STRIDE_C>[\d ]+)"
+#     r"--stride_d (?P<STRIDE_D>[\d ]+)"
+#     r"--alpha (?P<ALPHA>[\d\. ]+)"
+#     r"--beta (?P<BETA>[\d\. ]+)"
+#     r"--transA (?P<TRANS_A>[\w ]+)"
+#     r"--transB (?P<TRANS_B>[\w ]+)"
+#     r"--batch_count (?P<BATCH_COUNT>[\d ]+)"
+#     r"--bias_vector --bias_source (?P<BIAS_SOURCE>[\w ]+)"
+#     r"--a_type (?P<A_TYPE>[\w ]+)"
+#     r"--b_type (?P<B_TYPE>[\w ]+)"
+#     r"--c_type (?P<C_TYPE>[\w ]+)"
+#     r"--d_type (?P<D_TYPE>[\w ]+)"
+#     r"--scale_type (?P<SCALE_TYPE>[\w ]+)"
+#     r"--bias_type (?P<BIAS_TYPE>[\w ]+)"
+#     r"--compute_type (?P<COMPUTE_TYPE>[\w ]+)")
+
+# HIPBLASLT_BENCH_RE_SAB_BIAS = (
+#     r"(?P<FUNCTION>\w+) --api_method c "
+#     r"-m (?P<M>[\d ]+)"
+#     r"-n (?P<N>[\d ]+)"
+#     r"-k (?P<K>[\d ]+)"
+#     r"--lda (?P<LDA>[\d ]+)"
+#     r"--ldb (?P<LDB>[\d ]+)"
+#     r"--ldc (?P<LDC>[\d ]+)"
+#     r"--ldd (?P<LDD>[\d ]+)"
+#     r"--stride_a (?P<STRIDE_A>[\d ]+)"
+#     r"--stride_b (?P<STRIDE_B>[\d ]+)"
+#     r"--stride_c (?P<STRIDE_C>[\d ]+)"
+#     r"--stride_d (?P<STRIDE_D>[\d ]+)"
+#     r"--alpha (?P<ALPHA>[\d\. ]+)"
+#     r"--beta (?P<BETA>[\d\. ]+)"
+#     r"--transA (?P<TRANS_A>[\w ]+)"
+#     r"--transB (?P<TRANS_B>[\w ]+)"
+#     r"--batch_count (?P<BATCH_COUNT>[\d ]+)"
+#     r"--scaleA (?P<SCALE_A>[\w ]+)"
+#     r"--scaleB (?P<SCALE_B>[\w ]+)"
+#     r"--bias_vector --bias_source (?P<BIAS_SOURCE>[\w ]+)"
+#     r"--a_type (?P<A_TYPE>[\w ]+)"
+#     r"--b_type (?P<B_TYPE>[\w ]+)"
+#     r"--c_type (?P<C_TYPE>[\w ]+)"
+#     r"--d_type (?P<D_TYPE>[\w ]+)"
+#     r"--scale_type (?P<SCALE_TYPE>[\w ]+)"
+#     r"--bias_type (?P<BIAS_TYPE>[\w ]+)"
+#     r"--compute_type (?P<COMPUTE_TYPE>[\w ]+)")

 # Function to extract problem sizes from a line
 def extract_problem_size(match):
     return [int(match.group('M').strip()), int(match.group('N').strip()), int(match.group('BATCH_COUNT').strip()), int(match.group('K').strip())]

     if gdict.get("BIAS_SOURCE"):
         res["UseBias"] = 1
         res["BiasSrc"] = gdict.get('BIAS_SOURCE', '').strip().upper()
         res["BiasDataTypeList"] = list(bias_datatype_map(gdict.get("BIAS_TYPE", '').strip()))
+    if gdict.get("ACTIVATION_TYPE"):
+        res["Activation"] = True
+        res["ActivationType"] = "hipblaslt_all"
     if gdict.get("SCALE_A") is not None and gdict.get("SCALE_B") is not None:
         res["UseScaleAB"] = "Scalar"
         res["UseScaleAlphaVec"] = 1
     return res

 def match_pattern(line):
-    if 'bias_vector' in line and 'scaleA' in line and 'scaleB' in line:
+    if 'activation_type' in line and 'bias_vector' in line and 'scaleA' in line and 'scaleB' in line:
+        match = re.search(
+            HIPBLASLT_BENCH_RE_SAB_BIAS_ACT, line
+        )
+    elif 'bias_vector' in line and 'scaleA' in line and 'scaleB' in line:
         match = re.search(
             HIPBLASLT_BENCH_RE_SAB_BIAS, line
         )
+    elif 'activation_type' in line and 'scaleA' in line and 'scaleB' in line:
+        match = re.search(
+            HIPBLASLT_BENCH_RE_SAB_ACT, line
+        )
+    elif 'bias_vector' in line and 'activation_type' in line:
+        match = re.search(
+            HIPBLASLT_BENCH_RE_BIAS_ACT, line
+        )
     elif 'bias_vector' in line:
         match = re.search(
             HIPBLASLT_BENCH_RE_BIAS, line
         )
]+)" -# r"--transA (?P[\w ]+)" -# r"--transB (?P[\w ]+)" -# r"--batch_count (?P[\d ]+)" -# r"--scaleA (?P[\w ]+)" -# r"--scaleB (?P[\w ]+)" -# r"--bias_vector --bias_source (?P[\w ]+)" -# r"--a_type (?P[\w ]+)" -# r"--b_type (?P[\w ]+)" -# r"--c_type (?P[\w ]+)" -# r"--d_type (?P[\w ]+)" -# r"--scale_type (?P[\w ]+)" -# r"--bias_type (?P[\w ]+)" -# r"--compute_type (?P[\w ]+)") - # Function to extract problem sizes from a line def extract_problem_size(match): return [int(match.group('M').strip()), int(match.group('N').strip()), int(match.group('BATCH_COUNT').strip()), int(match.group('K').strip())] @@ -387,6 +277,7 @@ def find_matmul_instruction(mfma_instruction, size): for bm in range(int(math.log(mfma_instruction[3],2))+1): for m_tiles in reversed(range(1, CU+1)): m_tile_size = size[0] // m_tiles + # TODO:fp8 384x384 if m_tile_size > 256: continue wave_tile_m = math.ceil(m_tile_size / mfma_instruction[0]) @@ -409,8 +300,7 @@ def find_matmul_instruction(mfma_instruction, size): if wave_tile_n // (2**l) >= 1 and wave_tile_n // (2**l) <= 32: matmul_instruction[-3] = wave_tile_n // (2**l) matmul_instruction[-1] = 2**l - - yield matmul_instruction + yield copy.deepcopy(matmul_instruction) def get_groups(matmul_instruction_gen): # Extract skinny MTs for Groups From e2c139c4e3c7f552c21528fb6a3343c006032376 Mon Sep 17 00:00:00 2001 From: fsx950223 Date: Thu, 21 Nov 2024 05:19:33 +0000 Subject: [PATCH 10/23] fix merge bug when inData has no soltion --- tensilelite/Tensile/Utilities/merge.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensilelite/Tensile/Utilities/merge.py b/tensilelite/Tensile/Utilities/merge.py index 767365a775..14cad76eae 100644 --- a/tensilelite/Tensile/Utilities/merge.py +++ b/tensilelite/Tensile/Utilities/merge.py @@ -309,6 +309,7 @@ def mergeLogic(oriData, incData, forceMerge, trimSize=True, addSolutionTags=Fals origNumSizes = len(oriData[7]) origNumSolutions = len(oriData[5]) + incData[7] = incData[7] or [] incNumSizes = len(incData[7]) incNumSolutions = len(incData[5]) @@ -320,7 +321,7 @@ def mergeLogic(oriData, incData, forceMerge, trimSize=True, addSolutionTags=Fals incTaggedSizes = addSolutionTagToKeys(incData[7], incData[5]) if addSolutionTags: oriData[7] = origTaggedSizes - incData[7] = incTaggedSizes + incData[7] = incTaggedSizes # Print warning if addSolutionTags=False results in removed sizes else: origSet = {tuple(size) for size, [_, _] in oriData[7]} From de87a414aa5e48869ed70f71e9722fbdd1763cd3 Mon Sep 17 00:00:00 2001 From: fsx950223 Date: Fri, 22 Nov 2024 09:38:25 +0000 Subject: [PATCH 11/23] limit wave size less or equal than 4 --- .../Utilities/tensile_generator/tensile_config_generator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py index e9feea2fd6..3cbda1fe95 100644 --- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py +++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py @@ -129,7 +129,7 @@ bf16_instructions = [[16,16,16,1],[32,32,8,1]] tf32_instructions = [[16,16,8,1]] fp32_instructions = [[16,16,4,1]] - fp8_instructions = [[32,32,16,1], [16,16,32,1]] + fp8_instructions = [[16,16,32,1]] HIPBLASLT_BENCH_BASE = ( r"(?P\w+) --api_method c " @@ -296,7 +296,7 @@ def find_matmul_instruction(mfma_instruction, size): matmul_instruction[-4] = wave_tile_m // (2**k) matmul_instruction[-2] = 2**k - for l in 
reversed(range(3)): + for l in reversed(range(3-k)): if wave_tile_n // (2**l) >= 1 and wave_tile_n // (2**l) <= 32: matmul_instruction[-3] = wave_tile_n // (2**l) matmul_instruction[-1] = 2**l From c4dac5de53b9d40133c246eee7bd8a80c1405f0e Mon Sep 17 00:00:00 2001 From: fsx950223 Date: Mon, 25 Nov 2024 06:28:08 +0000 Subject: [PATCH 12/23] update readme --- tensilelite/Tensile/Utilities/tensile_generator/README.md | 3 +++ .../Utilities/tensile_generator/tensile_config_generator.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tensilelite/Tensile/Utilities/tensile_generator/README.md b/tensilelite/Tensile/Utilities/tensile_generator/README.md index d4b4fdeb84..0dd4df9935 100644 --- a/tensilelite/Tensile/Utilities/tensile_generator/README.md +++ b/tensilelite/Tensile/Utilities/tensile_generator/README.md @@ -35,6 +35,9 @@ To use the `tensile_config_generator.py` script, follow these steps: | `--iters ITERS` | Max tuning iterations | | `--fast BOOL` | If enabled, only tune the matrix instruction with min tile sizes, else, tune full matrix instructions | | `--gridbase_config GRIDBASE_CONFIG` | Path to gridbase config file | + | `--full_mfma BOOL` | If enabled, will search for all mfma instructions | + | `--full_stage BOOL` | If enabled, will search for all stages instructions | + | `--num_stages STAGES` | How many times to divide matrix | Equality tuning example: ``` diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py index 3cbda1fe95..940491f6f8 100644 --- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py +++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py @@ -74,7 +74,7 @@ parser.add_argument( "--full_stage", type=bool, default=False, - help="If enabled, will search for all mi instructions") + help="If enabled, will search for all stages instructions") parser.add_argument( "--num_stages", type=int, default=8, From 7702661f62bcc3292cc06f4270e0d66874948b93 Mon Sep 17 00:00:00 2001 From: fsx950223 Date: Mon, 25 Nov 2024 11:51:47 +0000 Subject: [PATCH 13/23] fix full stage --- .../Utilities/tensile_generator/tensile_config_generator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py index 940491f6f8..e53e1cd12c 100644 --- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py +++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py @@ -536,9 +536,9 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it matmul_instruction_found = True if not args.full_stage: break - else: - max_dim = int(np.argmax(size[:2])) - size[max_dim] = size[max_dim] // 2 + + max_dim = int(np.argmax(size[:2])) + size[max_dim] = size[max_dim] // 2 if not matmul_instruction_found: print(f"Can't find mfma instructions for {original_size}, please contact hipblaslt expert") From aae674040da0687cbaa3e64ca0190216c9608837 Mon Sep 17 00:00:00 2001 From: fsx950223 Date: Tue, 26 Nov 2024 08:41:50 +0000 Subject: [PATCH 14/23] update pattern to match latest hipblaslt log --- .../tensile_config_generator.py | 50 ++++++------------- 1 file changed, 14 insertions(+), 36 deletions(-) diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py 
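Editor's note on why `range(3-k)` enforces the commit subject: the two loops pick wave counts `2**k` and `2**l` along M and N, and capping `l` below `3-k` keeps the product at four waves or fewer. A quick enumeration of the pairs the patched loops can now produce:

```python
# k and l as in find_matmul_instruction(); order of iteration omitted for brevity
pairs = [(2**k, 2**l) for k in range(3) for l in range(3 - k)]
print(pairs)                          # [(1, 1), (1, 2), (1, 4), (2, 1), (2, 2), (4, 1)]
print(max(a * b for a, b in pairs))   # 4 -> wave size <= 4, as the subject says
```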
From c4dac5de53b9d40133c246eee7bd8a80c1405f0e Mon Sep 17 00:00:00 2001
From: fsx950223
Date: Mon, 25 Nov 2024 06:28:08 +0000
Subject: [PATCH 12/23] update readme

---
 tensilelite/Tensile/Utilities/tensile_generator/README.md   | 3 +++
 .../Utilities/tensile_generator/tensile_config_generator.py | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/README.md b/tensilelite/Tensile/Utilities/tensile_generator/README.md
index d4b4fdeb84..0dd4df9935 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/README.md
+++ b/tensilelite/Tensile/Utilities/tensile_generator/README.md
@@ -35,6 +35,9 @@
    | `--iters ITERS` | Max tuning iterations |
    | `--fast BOOL` | If enabled, only tune the matrix instruction with min tile sizes, else, tune full matrix instructions |
    | `--gridbase_config GRIDBASE_CONFIG` | Path to gridbase config file |
+   | `--full_mfma BOOL` | If enabled, will search for all mfma instructions |
+   | `--full_stage BOOL` | If enabled, will search for instructions across all stages |
+   | `--num_stages STAGES` | How many times to divide the matrix |

    Equality tuning example:
    ```

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index 3cbda1fe95..940491f6f8 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -74,7 +74,7 @@
 parser.add_argument(
     "--full_stage", type=bool, default=False,
-    help="If enabled, will search for all mi instructions")
+    help="If enabled, will search for instructions across all stages")

 parser.add_argument(
     "--num_stages", type=int, default=8,
     help="How many times to divide the matrix")

From 7702661f62bcc3292cc06f4270e0d66874948b93 Mon Sep 17 00:00:00 2001
From: fsx950223
Date: Mon, 25 Nov 2024 11:51:47 +0000
Subject: [PATCH 13/23] fix full stage

---
 .../Utilities/tensile_generator/tensile_config_generator.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index 940491f6f8..e53e1cd12c 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -536,9 +536,9 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
                     if len(matmul_instruction_gen) > 0 or len(mi_groups0) > 0 or len(mi_groups1) > 0:
                         matmul_instruction_found = True
                         if not args.full_stage:
                             break
-                    else:
-                        max_dim = int(np.argmax(size[:2]))
-                        size[max_dim] = size[max_dim] // 2
+
+                    max_dim = int(np.argmax(size[:2]))
+                    size[max_dim] = size[max_dim] // 2

             if not matmul_instruction_found:
                 print(f"Can't find mfma instructions for {original_size}, please contact hipblaslt expert")

From aae674040da0687cbaa3e64ca0190216c9608837 Mon Sep 17 00:00:00 2001
From: fsx950223
Date: Tue, 26 Nov 2024 08:41:50 +0000
Subject: [PATCH 14/23] update pattern to match latest hipblaslt log

---
 .../tensile_config_generator.py | 50 ++++++------
 1 file changed, 14 insertions(+), 36 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index e53e1cd12c..4ee42c6e29 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -131,6 +131,7 @@
     fp32_instructions = [[16,16,4,1]]
     fp8_instructions = [[16,16,32,1]]

+
 HIPBLASLT_BENCH_BASE = (
     r"(?P<FUNCTION>\w+) --api_method c "

     r"--transA (?P<TRANS_A>[\w ]+)"
     r"--transB (?P<TRANS_B>[\w ]+)"
     r"--batch_count (?P<BATCH_COUNT>[\d ]+)"
+    r"--scaleA (?P<SCALE_A>[\d ]+)"
+    r"--scaleB (?P<SCALE_B>[\d ]+)"
 )

 # Optional patterns for scale and bias
-SCALE_PATTERN = r"--scaleA (?P<SCALE_A>[\w ]+)--scaleB (?P<SCALE_B>[\w ]+)"
 BIAS_PATTERN = r"--bias_vector --bias_source (?P<BIAS_SOURCE>[\w ]+)"
-ACTIVATION_PATTERN = r"--activation_type (?P<ACTIVATION_TYPE>[\w ]+)"

 # Common ending pattern
 TYPE_PATTERN = (
     r"--scale_type (?P<SCALE_TYPE>[\w ]+)"
     r"--bias_type (?P<BIAS_TYPE>[\w ]+)"
     r"--compute_type (?P<COMPUTE_TYPE>[\w ]+)"
+    r"--algo_method (?P<ALGO_METHOD>[\w ]+)"
+    r"--solution_index (?P<SOLUTION_INDEX>[\d ]+)"
+    r"--activation_type (?P<ACTIVATION_TYPE>[\w ]+)"
 )

 # Build the combined pattern with optional parts
-def build_pattern(has_scale=False, has_bias=False, has_activation=False):
+def build_pattern(has_bias=False):
     pattern = HIPBLASLT_BENCH_BASE
-    if has_scale:
-        pattern += SCALE_PATTERN
     if has_bias:
         pattern += BIAS_PATTERN
     pattern += TYPE_PATTERN
-    if has_activation:
-        pattern += ACTIVATION_PATTERN
     return pattern

 # Create the four variations
 HIPBLASLT_BENCH_RE = build_pattern()
-HIPBLASLT_BENCH_RE_SAB = build_pattern(has_scale=True)
 HIPBLASLT_BENCH_RE_BIAS = build_pattern(has_bias=True)
-HIPBLASLT_BENCH_RE_ACT = build_pattern(has_activation=True)
-HIPBLASLT_BENCH_RE_SAB_ACT = build_pattern(has_scale=True, has_activation=True)
-HIPBLASLT_BENCH_RE_BIAS_ACT = build_pattern(has_bias=True, has_activation=True)
-HIPBLASLT_BENCH_RE_SAB_BIAS = build_pattern(has_scale=True, has_bias=True)
-HIPBLASLT_BENCH_RE_SAB_BIAS_ACT = build_pattern(has_scale=True, has_bias=True, has_activation=True)

     ComputeDataType = datatype_map(gdict.get('COMPUTE_TYPE', '').strip())
     TransposeA = trans_map(gdict.get('TRANS_A', '').strip())
     TransposeB = trans_map(gdict.get('TRANS_B', '').strip())
+    scaleA = gdict.get("SCALE_A").strip()
+    scaleB = gdict.get("SCALE_B").strip()
     if DataType in ["H", "B", "F8"]:
         HighPrecisionAccumulate = True
     else:

     if gdict.get("ACTIVATION_TYPE"):
         res["Activation"] = True
         res["ActivationType"] = "hipblaslt_all"
-    if gdict.get("SCALE_A") is not None and gdict.get("SCALE_B") is not None:
+    if scaleA == "1" and scaleB == "1":
         res["UseScaleAB"] = "Scalar"
         res["UseScaleAlphaVec"] = 1
+    elif scaleA == "2" and scaleB == "2":
+        res["UseScaleAB"] = "Vector"
+        res["UseScaleAlphaVec"] = 1
     return res

 def match_pattern(line):
-    if 'activation_type' in line and 'bias_vector' in line and 'scaleA' in line and 'scaleB' in line:
-        match = re.search(
-            HIPBLASLT_BENCH_RE_SAB_BIAS_ACT, line
-        )
-    elif 'bias_vector' in line and 'scaleA' in line and 'scaleB' in line:
-        match = re.search(
-            HIPBLASLT_BENCH_RE_SAB_BIAS, line
-        )
-    elif 'activation_type' in line and 'scaleA' in line and 'scaleB' in line:
-        match = re.search(
-            HIPBLASLT_BENCH_RE_SAB_ACT, line
-        )
-    elif 'bias_vector' in line and 'activation_type' in line:
-        match = re.search(
-            HIPBLASLT_BENCH_RE_BIAS_ACT, line
-        )
-    elif 'bias_vector' in line:
+    if 'bias_vector' in line:
         match = re.search(
             HIPBLASLT_BENCH_RE_BIAS, line
         )
-    elif 'scaleA' in line and 'scaleB' in line:
-        match = re.search(
-            HIPBLASLT_BENCH_RE_SAB, line
-        )
     else:
         match = re.search(
             HIPBLASLT_BENCH_RE, line
         )
     return match

From 65e85de166d2728496e3e2ddaf5e681a0b51741f Mon Sep 17 00:00:00 2001
From: fsx950223
Date: Tue, 26 Nov 2024 08:52:53 +0000
Subject: [PATCH 15/23] fix activation

---
 .../tensile_generator/tensile_config_generator.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index 4ee42c6e29..db4d6eb47f 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -246,6 +246,8 @@ def extract_dtype(match):
     scaleA = gdict.get("SCALE_A").strip()
     scaleB = gdict.get("SCALE_B").strip()
+    activation_type = gdict.get("ACTIVATION_TYPE").strip()
+    bias_source = gdict.get('BIAS_SOURCE', '').strip().upper()
     if DataType in ["H", "B", "F8"]:
         HighPrecisionAccumulate = True
     else:
@@ -256,11 +258,11 @@ def extract_dtype(match):
     res = {"Batched": True, "DataType": DataType, "DestDataType": DestDataType, "ComputeDataType": ComputeDataType, "TransposeA": TransposeA, "TransposeB": TransposeB, "HighPrecisionAccumulate": HighPrecisionAccumulate, "F32XdlMathOp": F32XdlMathOp, "OperationType": "GEMM", "UseBeta": True}

-    if gdict.get("BIAS_SOURCE"):
+    if bias_source:
         res["UseBias"] = 1
-        res["BiasSrc"] = gdict.get('BIAS_SOURCE', '').strip().upper()
+        res["BiasSrc"] = bias_source
         res["BiasDataTypeList"] = list(bias_datatype_map(gdict.get("BIAS_TYPE", '').strip()))
-    if gdict.get("ACTIVATION_TYPE"):
+    if activation_type != "none":
         res["Activation"] = True
         res["ActivationType"] = "hipblaslt_all"
     if scaleA == "1" and scaleB == "1":
         res["UseScaleAB"] = "Scalar"
         res["UseScaleAlphaVec"] = 1
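Editor's note: after these two patches, hipblaslt-bench logs numeric scale modes, and per the diff 1/1 selects scalar scaling while 2/2 selects vector scaling; other combinations leave scaling unset. A minimal sketch of that mapping in isolation (the original also chained a stray `is not None`, removed above):

```python
def scale_flags(scale_a, scale_b):
    res = {}
    if scale_a == "1" and scale_b == "1":
        res["UseScaleAB"] = "Scalar"
        res["UseScaleAlphaVec"] = 1
    elif scale_a == "2" and scale_b == "2":
        res["UseScaleAB"] = "Vector"
        res["UseScaleAlphaVec"] = 1
    return res

print(scale_flags("1", "1"))  # {'UseScaleAB': 'Scalar', 'UseScaleAlphaVec': 1}
print(scale_flags("0", "0"))  # {} -> scaling left off
```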
From 42caf9e21fd6c76d6c13013492fa81377c32e3da Mon Sep 17 00:00:00 2001
From: fsx950223
Date: Wed, 27 Nov 2024 03:02:49 +0000
Subject: [PATCH 16/23] update log example

---
 .../hipblaslt_gemm_log_example.txt | 34 ++++++++-------
 1 file changed, 14 insertions(+), 20 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/hipblaslt_gemm_log_example.txt b/tensilelite/Tensile/Utilities/tensile_generator/hipblaslt_gemm_log_example.txt
index 23913ee7ad..e19f08f82a 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/hipblaslt_gemm_log_example.txt
+++ b/tensilelite/Tensile/Utilities/tensile_generator/hipblaslt_gemm_log_example.txt
@@ -1,20 +1,14 @@
-hipblaslt-bench --api_method c -m 1 -n 200 -k 24 --lda 24 --ldb 24 --ldc 1 --ldd 1 --stride_a 24 --stride_b 4800 --stride_c 200 --stride_d 200 --alpha 1.000000 --beta 0.000000 --transA T --transB N --batch_count 800 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 1 -n 24 -k 200 --lda 1 --ldb 24 --ldc 1 --ldd 1 --stride_a 200 --stride_b 4800 --stride_c 24 --stride_d 24 --alpha 1.000000 --beta 0.000000 --transA N --transB T --batch_count 800 --a_type f16_r --b_type f16_r --c_type f16_r --d_type f16_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 1024 -n 200 -k 5244 --lda 1024 --ldb 5244 --ldc 1024 --ldd 1024 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 1.000000 --transA N --transB N --batch_count 1 --a_type f16_r --b_type f16_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 128 -n 150 -k 128 --lda 128 --ldb 128 --ldc 128 --ldd 128 --stride_a 16384 --stride_b 19200 --stride_c 19200 --stride_d 19200 --alpha 1.000000 --beta 0.000000 --transA N --transB N --batch_count 2 --a_type f16_r --b_type f16_r --c_type f16_r --d_type f16_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 128 -n 200 -k 256 --lda 128 --ldb 256 --ldc 128 --ldd 128 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 1.000000 --transA N --transB N --batch_count 1 --a_type f16_r --b_type f16_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 150 -n 200 -k 32 --lda 32 --ldb 32 --ldc 150 --ldd 150 --stride_a 4800 --stride_b 6400 --stride_c 30000 --stride_d 30000 --alpha 1.000000 --beta 0.000000 --transA T --transB N --batch_count 8 --a_type f16_r --b_type f16_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 2 -n 200 -k 64 --lda 2 --ldb 64 --ldc 2 --ldd 2 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 1.000000 --transA N --transB N --batch_count 1 --a_type f16_r --b_type f16_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 200 -n 200 -k 24 --lda 24 --ldb 24 --ldc 200 --ldd 200 --stride_a 4800 --stride_b 4800 --stride_c 40000 --stride_d 40000 --alpha 1.000000 --beta 0.000000 --transA T --transB N --batch_count 4 --a_type f16_r --b_type f16_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 24 -n 200 -k 200 --lda 24 --ldb 200 --ldc 24 --ldd 24 --stride_a 4800 --stride_b 40000 --stride_c 4800 --stride_d 4800 --alpha 1.000000 --beta 0.000000 --transA N --transB N --batch_count 4 --a_type f16_r --b_type f16_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 24 -n 200 -k 50 --lda 24 --ldb 50 --ldc 24 --ldd 24 --stride_a 1200 --stride_b 10000 --stride_c 4800 --stride_d 4800 --alpha 1.000000 --beta 0.000000 --transA N --transB N --batch_count 8 --a_type f16_r --b_type f16_r --c_type f16_r --d_type f16_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 256 -n 200 -k 512 --lda 256 --ldb 512 --ldc 256 --ldd 256 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 1.000000 --transA N --transB N --batch_count 1 --a_type f16_r --b_type f16_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 256 -n 200 -k 96 --lda 256 --ldb 96 --ldc 256 --ldd 256 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 0.000000 --transA N --transB N --batch_count 1 --a_type f16_r --b_type f16_r --c_type f16_r --d_type f16_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 32 -n 200 -k 150 --lda 32 --ldb 150 --ldc 32 --ldd 32 --stride_a 4800 --stride_b 30000 --stride_c 6400 --stride_d 6400 --alpha 1.000000 --beta 0.000000 --transA N --transB N --batch_count 8 --a_type f16_r --b_type f16_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 384 -n 200 -k 96 --lda 384 --ldb 96 --ldc 384 --ldd 384 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 0.000000 --transA N --transB N --batch_count 1 --a_type f16_r --b_type f16_r --c_type f16_r --d_type f16_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 50 -n 200 -k 24 --lda 24 --ldb 24 --ldc 50 --ldd 50 --stride_a 1200 --stride_b 4800 --stride_c 10000 --stride_d 10000 --alpha 1.000000 --beta 0.000000 --transA T --transB N --batch_count 8 --a_type f16_r --b_type f16_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 512 -n 200 -k 1024 --lda 512 --ldb 1024 --ldc 512 --ldd 512 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 1.000000 --transA N --transB N --batch_count 1 --a_type f16_r --b_type f16_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 64 -n 200 -k 128 --lda 64 --ldb 128 --ldc 64 --ldd 64 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 1.000000 --transA N --transB N --batch_count 1 --a_type f16_r --b_type f16_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 96 -n 200 -k 96 --lda 96 --ldb 96 --ldc 96 --ldd 96 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 0.000000 --transA N --transB N --batch_count 1 --a_type f16_r --b_type f16_r --c_type f16_r --d_type f16_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 96 -n 50 -k 96 --lda 96 --ldb 96 --ldc 96 --ldd 96 --stride_a 9216 --stride_b 4800 --stride_c 4800 --stride_d 4800 --alpha 1.000000 --beta 0.000000 --transA N --transB N --batch_count 2 --a_type f16_r --b_type f16_r --c_type f16_r --d_type f16_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 96 -n 6000 -k 96 --lda 96 --ldb 96 --ldc 96 --ldd 96 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 0.000000 --transA N --transB N --batch_count 1 --a_type f16_r --b_type f16_r --c_type f16_r --d_type f16_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
+hipblaslt-bench --api_method c -m 10 -n 1000 -k 128 --lda 128 --ldb 128 --ldc 10 --ldd 10 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 1.000000 --transA T --transB N --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 156482 --activation_type none
+hipblaslt-bench --api_method c -m 10 -n 32 -k 128 --lda 128 --ldb 128 --ldc 10 --ldd 10 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 1.000000 --transA T --transB N --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 156117 --activation_type none
+hipblaslt-bench --api_method c -m 10 -n 64 -k 128 --lda 128 --ldb 128 --ldc 10 --ldd 10 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 1.000000 --transA T --transB N --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 156482 --activation_type none
+hipblaslt-bench --api_method c -m 128 -n 10 -k 32 --lda 128 --ldb 10 --ldc 128 --ldd 128 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 0.000000 --transA N --transB T --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 13713 --activation_type none
+hipblaslt-bench --api_method c -m 128 -n 10 -k 64 --lda 128 --ldb 10 --ldc 128 --ldd 128 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 0.000000 --transA N --transB T --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 13713 --activation_type none
+hipblaslt-bench --api_method c -m 128 -n 1000 -k 9216 --lda 9216 --ldb 9216 --ldc 128 --ldd 128 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 1.000000 --transA T --transB N --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 156486 --activation_type none
+hipblaslt-bench --api_method c -m 128 -n 32 -k 10 --lda 128 --ldb 10 --ldc 128 --ldd 128 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 0.000000 --transA N --transB N --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 19256 --activation_type none
+hipblaslt-bench --api_method c -m 128 -n 32 -k 9216 --lda 9216 --ldb 9216 --ldc 128 --ldd 128 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 1.000000 --transA T --transB N --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 156426 --activation_type none
+hipblaslt-bench --api_method c -m 128 -n 64 -k 10 --lda 128 --ldb 10 --ldc 128 --ldd 128 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 0.000000 --transA N --transB N --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 19234 --activation_type none
+hipblaslt-bench --api_method c -m 128 -n 64 -k 9216 --lda 9216 --ldb 9216 --ldc 128 --ldd 128 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 1.000000 --transA T --transB N --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 156490 --activation_type none
+hipblaslt-bench --api_method c -m 9216 -n 128 -k 32 --lda 9216 --ldb 128 --ldc 9216 --ldd 9216 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 0.000000 --transA N --transB T --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 13717 --activation_type none
+hipblaslt-bench --api_method c -m 9216 -n 128 -k 64 --lda 9216 --ldb 128 --ldc 9216 --ldd 9216 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 0.000000 --transA N --transB T --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 13717 --activation_type none
+hipblaslt-bench --api_method c -m 9216 -n 32 -k 128 --lda 9216 --ldb 128 --ldc 9216 --ldd 9216 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 0.000000 --transA N --transB N --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 19272 --activation_type none
+hipblaslt-bench --api_method c -m 9216 -n 64 -k 128 --lda 9216 --ldb 128 --ldc 9216 --ldd 9216 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 0.000000 --transA N --transB N --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 19260 --activation_type none

From 601db370fc1e8feb2345375782ca538775f438c8 Mon Sep 17 00:00:00 2001
From: fsx950223 
Date: Wed, 27 Nov 2024 06:25:25 +0000
Subject: [PATCH 17/23] apply full stage in gridbase tuning

---
 .../tensile_generator/tensile_config_generator.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index db4d6eb47f..388d1a7073 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -597,12 +597,15 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 matmul_instructions[dtype_str][str(matmul_instruction)] = matmul_instruction
 if args.fast and (index > total_inst):
 break
+
 if len(matmul_instruction_gen) > 0:
 matmul_instruction_found = True
- break
- else:
- max_dim = int(np.argmax(size))
- size[max_dim] = size[max_dim] // 2
+ if not args.full_stage:
+ break
+
+ max_dim = int(np.argmax(size))
+ size[max_dim] = size[max_dim] // 2
+
 if not matmul_instruction_found:
 print(f"Can't find mfma instructions for {original_size}, please contact hipblaslt expert")
 else:

From b16de48c1d3b28e3a549110338cbc45410c13e3d Mon Sep 17 00:00:00 2001
From: fsx950223 
Date: Thu, 28 Nov 2024 02:48:03 +0000
Subject: [PATCH 18/23] fix duplicate problem sizes introduced by solution index

---
 .../tensile_config_generator.py | 137 +++++++++---------
 1 file changed, 70 insertions(+), 67 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index 388d1a7073..bdece6bee5 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -225,15 +225,15 @@ def trans_map(trans):
 
 def bias_datatype_map(dtype):
 if dtype == "f16_r":
- return [datatype_map('f16_r')]
+ return [datatype_map('f32_r'), datatype_map('f16_r')]
 elif dtype == "f32_r":
 return [datatype_map('f32_r')]
 elif dtype == "xf32_r":
 return [datatype_map('xf32_r')]
 elif dtype 
== "bf16_r": - return [datatype_map('bf16_r')] + return [datatype_map('f32_r'), datatype_map('bf16_r')] elif dtype == "f8_r": - return [datatype_map('f8_r')] + return [datatype_map('f32_r'), datatype_map('f8_r')] else: return [] @@ -296,7 +296,7 @@ def find_matmul_instruction(mfma_instruction, size): matmul_instruction[-4] = wave_tile_m // (2**k) matmul_instruction[-2] = 2**k - for l in reversed(range(3-k)): + for l in reversed(range(3)): if wave_tile_n // (2**l) >= 1 and wave_tile_n // (2**l) <= 32: matmul_instruction[-3] = wave_tile_n // (2**l) matmul_instruction[-1] = 2**l @@ -313,7 +313,7 @@ def get_groups(matmul_instruction_gen): ratio = mt[0] / mt[1] if ratio > NONTEMPORALRATIO: mi_groups0.append(mi) - elif ratio < (1/NONTEMPORALRATIO): + elif ratio < (1 / NONTEMPORALRATIO): mi_groups1.append(mi) else: mi_left.append(mi) @@ -442,10 +442,14 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it for line in f: match = match_pattern(line) if match: - if line in unique_gemms: - unique_gemms[line] += 1 + size = extract_problem_size(match) + dtype = extract_dtype(match) + size_str = json.dumps(size) + dtype_str = json.dumps(dtype) + if (size_str, dtype_str) in unique_gemms: + unique_gemms[(size_str, dtype_str)] += 1 else: - unique_gemms[line] = 1 + unique_gemms[(size_str, dtype_str)] = 1 unique_gemms = {k: v for k, v in sorted(unique_gemms.items(), key=lambda item: item[1], reverse=True)[:args.topk]} for k, v in unique_gemms.items(): @@ -466,71 +470,70 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it k_sum = 0 for k, v in unique_gemms_subgroup: - match = match_pattern(k) - if match: - size = extract_problem_size(match) - original_size = copy.deepcopy(size) - dtype = extract_dtype(match) - mfma_instructions = instruction_map(dtype) - dtype_str = json.dumps(dtype) - if mfma_instructions is None: - continue + size_str, dtype_str = k + size = json.loads(size_str) + dtype = json.loads(dtype_str) + original_size = copy.deepcopy(size) + mfma_instructions = instruction_map(dtype) - matmul_instruction_found = False - for mfma_instruction in mfma_instructions: - for _ in range(NUM_STAGES): - matmul_instruction_gen = list(find_matmul_instruction(mfma_instruction, size)) - if args.groups: - mi_groups0, mi_groups1, matmul_instruction_gen = get_groups(matmul_instruction_gen) - else: - mi_groups0 = [] - mi_groups1 = [] - - total_inst = min(len(matmul_instruction_gen) // DIV_MI, MIN_MI) # At least 5 insts and max of 33.3% of insts. 
- for index, matmul_instruction in enumerate(matmul_instruction_gen): - if matmul_instruction is not None: - if dtype_str not in matmul_instructions: - matmul_instructions[dtype_str] = dict() - matmul_instructions[dtype_str][str(matmul_instruction)] = matmul_instruction - if args.fast and (index > total_inst): - break - total_inst = min(len(mi_groups0) // DIV_MI, MIN_MI) - for index, mi_0 in enumerate(mi_groups0): - if dtype_str not in groups: - groups[dtype_str] = [{},{}] - groups[dtype_str][0]["MatrixInstruction"] = {} - groups[dtype_str][1]["MatrixInstruction"] = {} - groups[dtype_str][0]["MatrixInstruction"][str(mi_0)] = mi_0 - if args.fast and (index > total_inst): - break - total_inst = min(len(mi_groups1) // DIV_MI, MIN_MI) - for index, mi_1 in enumerate(mi_groups1): - if dtype_str not in groups: - groups[dtype_str] = [{},{}] - groups[dtype_str][0]["MatrixInstruction"] = {} - groups[dtype_str][1]["MatrixInstruction"] = {} - groups[dtype_str][1]["MatrixInstruction"][str(mi_1)] = mi_1 + if mfma_instructions is None: + continue + + matmul_instruction_found = False + for mfma_instruction in mfma_instructions: + for _ in range(NUM_STAGES): + matmul_instruction_gen = list(find_matmul_instruction(mfma_instruction, size)) + if args.groups: + mi_groups0, mi_groups1, matmul_instruction_gen = get_groups(matmul_instruction_gen) + else: + mi_groups0 = [] + mi_groups1 = [] + + total_inst = min(len(matmul_instruction_gen) // DIV_MI, MIN_MI) # At least 5 insts and max of 33.3% of insts. + for index, matmul_instruction in enumerate(matmul_instruction_gen): + if matmul_instruction is not None: + if dtype_str not in matmul_instructions: + matmul_instructions[dtype_str] = dict() + matmul_instructions[dtype_str][str(matmul_instruction)] = matmul_instruction if args.fast and (index > total_inst): break - if len(matmul_instruction_gen) > 0 or len(mi_groups0) > 0 or len(mi_groups1) > 0: - matmul_instruction_found = True - if not args.full_stage: - break + total_inst = min(len(mi_groups0) // DIV_MI, MIN_MI) + for index, mi_0 in enumerate(mi_groups0): + if dtype_str not in groups: + groups[dtype_str] = [{},{}] + groups[dtype_str][0]["MatrixInstruction"] = {} + groups[dtype_str][1]["MatrixInstruction"] = {} + groups[dtype_str][0]["MatrixInstruction"][str(mi_0)] = mi_0 + if args.fast and (index > total_inst): + break + total_inst = min(len(mi_groups1) // DIV_MI, MIN_MI) + for index, mi_1 in enumerate(mi_groups1): + if dtype_str not in groups: + groups[dtype_str] = [{},{}] + groups[dtype_str][0]["MatrixInstruction"] = {} + groups[dtype_str][1]["MatrixInstruction"] = {} + groups[dtype_str][1]["MatrixInstruction"][str(mi_1)] = mi_1 + if args.fast and (index > total_inst): + break + if len(matmul_instruction_gen) > 0 or len(mi_groups0) > 0 or len(mi_groups1) > 0: + matmul_instruction_found = True + if not args.full_stage: + break - max_dim = int(np.argmax(size[:2])) - size[max_dim] = size[max_dim] // 2 + max_dim = int(np.argmax(size[:2])) + size[max_dim] = size[max_dim] // 2 - if not matmul_instruction_found: - print(f"Can't find mfma instructions for {original_size}, please contact hipblaslt expert") + if not matmul_instruction_found: + print(f"Can't find mfma instructions for {original_size}, please contact hipblaslt expert") + else: + if dtype_str in gemm_group: + gemm_group[dtype_str].append({'Exact': list(original_size)}) else: - if dtype_str in gemm_group: - gemm_group[dtype_str].append({'Exact': list(original_size)}) - else: - gemm_group[dtype_str] = [{'Exact': list(original_size)}] - m_sum += 
original_size[0]
- n_sum += original_size[1]
- batch_sum += original_size[2]
- k_sum += original_size[3]
+ gemm_group[dtype_str] = [{'Exact': list(original_size)}]
+ m_sum += original_size[0]
+ n_sum += original_size[1]
+ batch_sum += original_size[2]
+ k_sum += original_size[3]
 
 dump_yaml(gpu_idx, gemm_group, args.tensile_config, m_sum, n_sum, batch_sum, k_sum, args.iters, groups)

From c339f2d2947d33e6cbe0bb2158d7a8a8237b2b7c Mon Sep 17 00:00:00 2001
From: fsx950223 
Date: Thu, 28 Nov 2024 07:52:44 +0000
Subject: [PATCH 19/23] skip unsupported gemm configs

---
 .../tensile_generator/tensile_config_generator.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index bdece6bee5..9b5022f26c 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -244,6 +244,8 @@ def extract_dtype(match):
 ComputeDataType = datatype_map(gdict.get('COMPUTE_TYPE', '').strip())
 TransposeA = trans_map(gdict.get('TRANS_A', '').strip())
 TransposeB = trans_map(gdict.get('TRANS_B', '').strip())
+ if None in [DataType, DestDataType, ComputeDataType, TransposeA, TransposeB]:
+ return None
 scaleA = gdict.get("SCALE_A").strip()
 scaleB = gdict.get("SCALE_B").strip()
 activation_type = gdict.get("ACTIVATION_TYPE").strip()
@@ -265,10 +267,10 @@ def extract_dtype(match):
 if activation_type != "none":
 res["Activation"] = True
 res["ActivationType"] = "hipblaslt_all"
- if scaleA == "1" and scaleB == "1" is not None:
+ if scaleA == "1" and scaleB == "1":
 res["UseScaleAB"] = "Scalar"
 res["UseScaleAlphaVec"] = 1
- elif scaleA == "2" and scaleB == "2" is not None:
+ elif scaleA == "2" and scaleB == "2":
 res["UseScaleAB"] = "Vector"
 res["UseScaleAlphaVec"] = 1
 return res
@@ -444,6 +446,9 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 if match:
 size = extract_problem_size(match)
 dtype = extract_dtype(match)
+ if dtype is None:
+ print(f"Can't find dtype for {line}, please contact hipblaslt expert")
+ continue
 size_str = json.dumps(size)
 dtype_str = json.dumps(dtype)
 if (size_str, dtype_str) in unique_gemms:

From 34bad15fe701b3cf7c9126aa87a47534a3eb713b Mon Sep 17 00:00:00 2001
From: fsx950223 
Date: Mon, 2 Dec 2024 05:12:32 +0000
Subject: [PATCH 20/23] calculate gsu

---
 .../tensile_config_generator.py | 86 +++++++++++--------
 1 file changed, 49 insertions(+), 37 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index 9b5022f26c..225e8ae128 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -106,15 +106,12 @@
 res = subprocess.run(["cat", "/sys/class/drm/card1/device/current_compute_partition"], stdout=subprocess.PIPE)
 if res.stdout.decode("utf-8").strip() == "CPX":
 XCC = 1
- GSU = [1,2,3,4,5,6,7,8]
 else:
 XCC = 4
- GSU = [1,2,3,4]
 DeviceNames = ["Device 0049", "Device 0050"]
 ScheduleName = "aquavanjaram"
 elif ArchitectureName == 'gfx90a':
 XCC = 1
- GSU = [1,2,3,4]
 DeviceNames = ["Device 0050", "Device 0051", "Device 0052", "Device 0054", "Device 0062", "Device 7400", "Device 740c"]
 ScheduleName = "aldebaran"
 
@@ -237,6 +234,18 @@ def bias_datatype_map(dtype):
 else: 
return [] +def get_high_precision_accumulate(DataType): + if DataType in ["H", "B", "F8"]: + return True + else: + return False + +def adapt_xf32(ComputeDataType): + if ComputeDataType == "XS": + return 'S', 'x' + else: + return ComputeDataType, 0 + def extract_dtype(match): gdict = match.groupdict() DataType = datatype_map(gdict.get('A_TYPE', '').strip()) @@ -250,14 +259,8 @@ def extract_dtype(match): scaleB = gdict.get("SCALE_B").strip() activation_type = gdict.get("ACTIVATION_TYPE").strip() bias_source = gdict.get('BIAS_SOURCE', '').strip().upper() - if DataType in ["H", "B", "F8"]: - HighPrecisionAccumulate = True - else: - HighPrecisionAccumulate = False - F32XdlMathOp = 0 - if ComputeDataType == "XS": - ComputeDataType = "S" - F32XdlMathOp = 'x' + HighPrecisionAccumulate = get_high_precision_accumulate(DataType) + ComputeDataType, F32XdlMathOp = adapt_xf32(ComputeDataType) res = {"Batched": True, "DataType": DataType, "DestDataType": DestDataType, "ComputeDataType": ComputeDataType, "TransposeA": TransposeA, "TransposeB": TransposeB, "HighPrecisionAccumulate": HighPrecisionAccumulate, "F32XdlMathOp": F32XdlMathOp, "OperationType": "GEMM", "UseBeta": True} if bias_source: @@ -359,7 +362,12 @@ def calculate_min_flops(m_sum, n_sum, batch_sum, k_sum, iters): return (ENQUEUES_PER_SYNC + iters) * m_avg * n_avg * batch_avg * k_avg / 2 -def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, iters, groups): +def calculate_gsu(matmul_instruction, size): + mt0 = matmul_instruction[0] * matmul_instruction[5] * matmul_instruction[7] + mt1 = matmul_instruction[1] * matmul_instruction[6] * matmul_instruction[8] + return max(1, CU // (math.ceil(size[0] / mt0) * math.ceil(size[1] / mt1))) + +def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, iters, groups, gsu): MinFlopsPerSync = calculate_min_flops(m_sum, n_sum, batch_sum, k_sum, iters) # Read the YAML file with open(yaml_file, 'r') as f: @@ -423,7 +431,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it if "WorkGroupMappingXCC" in item: item["WorkGroupMappingXCC"] = [XCC] if "GlobalSplitU" in item: - item["GlobalSplitU"] = list(GSU) + item["GlobalSplitU"] = list(gsu) data["BenchmarkProblems"][i][0] = dtype data["LibraryLogic"]["DeviceNames"] = DeviceNames data["LibraryLogic"]["ScheduleName"] = ScheduleName @@ -484,8 +492,10 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it if mfma_instructions is None: continue + gsu = set() matmul_instruction_found = False for mfma_instruction in mfma_instructions: + size = copy.deepcopy(original_size) for _ in range(NUM_STAGES): matmul_instruction_gen = list(find_matmul_instruction(mfma_instruction, size)) if args.groups: @@ -497,6 +507,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it total_inst = min(len(matmul_instruction_gen) // DIV_MI, MIN_MI) # At least 5 insts and max of 33.3% of insts. 
for index, matmul_instruction in enumerate(matmul_instruction_gen): if matmul_instruction is not None: + gsu.add(calculate_gsu(matmul_instruction, size)) if dtype_str not in matmul_instructions: matmul_instructions[dtype_str] = dict() matmul_instructions[dtype_str][str(matmul_instruction)] = matmul_instruction @@ -504,22 +515,26 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it break total_inst = min(len(mi_groups0) // DIV_MI, MIN_MI) for index, mi_0 in enumerate(mi_groups0): - if dtype_str not in groups: - groups[dtype_str] = [{},{}] - groups[dtype_str][0]["MatrixInstruction"] = {} - groups[dtype_str][1]["MatrixInstruction"] = {} - groups[dtype_str][0]["MatrixInstruction"][str(mi_0)] = mi_0 - if args.fast and (index > total_inst): - break + if mi_0 is not None: + gsu.add(calculate_gsu(mi_0, size)) + if dtype_str not in groups: + groups[dtype_str] = [{},{}] + groups[dtype_str][0]["MatrixInstruction"] = {} + groups[dtype_str][1]["MatrixInstruction"] = {} + groups[dtype_str][0]["MatrixInstruction"][str(mi_0)] = mi_0 + if args.fast and (index > total_inst): + break total_inst = min(len(mi_groups1) // DIV_MI, MIN_MI) for index, mi_1 in enumerate(mi_groups1): - if dtype_str not in groups: - groups[dtype_str] = [{},{}] - groups[dtype_str][0]["MatrixInstruction"] = {} - groups[dtype_str][1]["MatrixInstruction"] = {} - groups[dtype_str][1]["MatrixInstruction"][str(mi_1)] = mi_1 - if args.fast and (index > total_inst): - break + if mi_1 is not None: + gsu.add(calculate_gsu(mi_1, size)) + if dtype_str not in groups: + groups[dtype_str] = [{},{}] + groups[dtype_str][0]["MatrixInstruction"] = {} + groups[dtype_str][1]["MatrixInstruction"] = {} + groups[dtype_str][1]["MatrixInstruction"][str(mi_1)] = mi_1 + if args.fast and (index > total_inst): + break if len(matmul_instruction_gen) > 0 or len(mi_groups0) > 0 or len(mi_groups1) > 0: matmul_instruction_found = True if not args.full_stage: @@ -540,7 +555,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it batch_sum += original_size[2] k_sum += original_size[3] - dump_yaml(gpu_idx, gemm_group, args.tensile_config, m_sum, n_sum, batch_sum, k_sum, args.iters, groups) + dump_yaml(gpu_idx, gemm_group, args.tensile_config, m_sum, n_sum, batch_sum, k_sum, args.iters, groups, gsu) elif args.gridbase_config and args.hipblaslt_log is None: LibraryType = "GridBased" @@ -559,14 +574,8 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it ComputeDataType = datatype_map(data['ComputeDataType'].strip()) TransposeA = trans_map(data['TransposeA']) TransposeB = trans_map(data['TransposeB']) - if DataType in ["H", "B"]: - HighPrecisionAccumulate = True - else: - HighPrecisionAccumulate = False - F32XdlMathOp = 0 - if ComputeDataType == "XS": - ComputeDataType = "S" - F32XdlMathOp = 'x' + HighPrecisionAccumulate = get_high_precision_accumulate(DataType) + ComputeDataType, F32XdlMathOp = adapt_xf32(ComputeDataType) dtype = {"Batched": True, "DataType": DataType, "DestDataType": DestDataType, "ComputeDataType": ComputeDataType, "TransposeA": TransposeA, "TransposeB": TransposeB, "HighPrecisionAccumulate": HighPrecisionAccumulate, "F32XdlMathOp": F32XdlMathOp, "OperationType": "GEMM", "UseBeta": True, "UseBias": 1, "Activation": True, "ActivationType": "hipblaslt_all", "UseScaleAlphaVec": 1} dtype_str = json.dumps(dtype) for m in m_shapes: @@ -588,6 +597,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it size = list(size) original_size = 
copy.deepcopy(size)
 dtype_str = k[0]
+ gsu = set()
 dtype = json.loads(dtype_str)
 mfma_instructions = instruction_map(dtype)
 
@@ -595,11 +605,13 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 continue
 matmul_instruction_found = False
 for mfma_instruction in mfma_instructions:
+ size = copy.deepcopy(original_size)
 for _ in range(NUM_STAGES):
 matmul_instruction_gen = list(find_matmul_instruction(mfma_instruction, size))
 total_inst = min(len(matmul_instruction_gen) // 3, 5) # At least 5 insts and max of 33.3% of insts.
 for index, matmul_instruction in enumerate(matmul_instruction_gen):
 if matmul_instruction is not None:
+ gsu.add(calculate_gsu(matmul_instruction, size))
 if dtype_str not in matmul_instructions:
 matmul_instructions[dtype_str] = dict()
 matmul_instructions[dtype_str][str(matmul_instruction)] = matmul_instruction
@@ -626,4 +638,4 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 batch_sum += original_size[2]
 k_sum += original_size[3]
 
- dump_yaml(gpu_idx, gemm_group, args.tensile_config, m_sum, n_sum, batch_sum, k_sum, args.iters, {})
+ dump_yaml(gpu_idx, gemm_group, args.tensile_config, m_sum, n_sum, batch_sum, k_sum, args.iters, {}, gsu)

From 5e54bb390c7a79fcf8b3b985d08092aa40720380 Mon Sep 17 00:00:00 2001
From: fsx950223 
Date: Mon, 2 Dec 2024 06:53:56 +0000
Subject: [PATCH 21/23] fix gsu bug: pass per-dtype gsu_group instead of a single gsu set

---
 .../tensile_config_generator.py | 22 ++++++++++---------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index 225e8ae128..a588b736b3 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -367,7 +367,7 @@ def calculate_gsu(matmul_instruction, size):
 mt1 = matmul_instruction[1] * matmul_instruction[6] * matmul_instruction[8]
 return max(1, CU // (math.ceil(size[0] / mt0) * math.ceil(size[1] / mt1)))
 
-def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, iters, groups, gsu):
+def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, iters, groups, gsu_group):
 MinFlopsPerSync = calculate_min_flops(m_sum, n_sum, batch_sum, k_sum, iters)
 # Read the YAML file
 with open(yaml_file, 'r') as f:
@@ -431,7 +431,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 if "WorkGroupMappingXCC" in item:
 item["WorkGroupMappingXCC"] = [XCC]
 if "GlobalSplitU" in item:
- item["GlobalSplitU"] = list(gsu)
+ item["GlobalSplitU"] = gsu_group[dtype_str]
 data["BenchmarkProblems"][i][0] = dtype
 data["LibraryLogic"]["DeviceNames"] = DeviceNames
 data["LibraryLogic"]["ScheduleName"] = ScheduleName
@@ -472,6 +472,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 for gpu_idx, unique_gemms_subgroup in enumerate(unique_gemms_subgroups):
 gemm_group = {}
+ gsu_group = {}
 matmul_instructions = {}
 groups = {}
 if unique_gemms_subgroup is None:
@@ -484,9 +485,8 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 for k, v in unique_gemms_subgroup:
 size_str, dtype_str = k
- size = json.loads(size_str)
+ original_size = json.loads(size_str)
 dtype = json.loads(dtype_str)
- original_size = copy.deepcopy(size)
 mfma_instructions = instruction_map(dtype)
 if mfma_instructions is None:
 continue
@@ -550,12 +550,13 @@ def dump_yaml(gpu_idx, gemm_group, 
yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 gemm_group[dtype_str].append({'Exact': list(original_size)})
 else:
 gemm_group[dtype_str] = [{'Exact': list(original_size)}]
+ gsu_group[dtype_str] = list(gsu)
 m_sum += original_size[0]
 n_sum += original_size[1]
 batch_sum += original_size[2]
 k_sum += original_size[3]
 
- dump_yaml(gpu_idx, gemm_group, args.tensile_config, m_sum, n_sum, batch_sum, k_sum, args.iters, groups, gsu)
+ dump_yaml(gpu_idx, gemm_group, args.tensile_config, m_sum, n_sum, batch_sum, k_sum, args.iters, groups, gsu_group)
 
 elif args.gridbase_config and args.hipblaslt_log is None:
 LibraryType = "GridBased"
@@ -589,20 +590,20 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 for gpu_idx, unique_gemms_subgroup in enumerate(unique_gemms_subgroups):
 gemm_group = {}
 matmul_instructions = {}
+ gsu_group = {}
 m_sum = 0
 n_sum = 0
 batch_sum = 0
 k_sum = 0
 for k, size in unique_gemms_subgroup:
- size = list(size)
- original_size = copy.deepcopy(size)
+ original_size = list(size)
 dtype_str = k[0]
- gsu = set()
- dtype = json.loads(dtype_str)
+ dtype = json.loads(dtype_str)
 mfma_instructions = instruction_map(dtype)
 if mfma_instructions is None:
 continue
+
+ gsu = set()
 matmul_instruction_found = False
 for mfma_instruction in mfma_instructions:
 size = copy.deepcopy(original_size)
@@ -633,9 +634,10 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 gemm_group[dtype_str].append({'Exact': list(original_size)})
 else:
 gemm_group[dtype_str] = [{'Exact': list(original_size)}]
+ gsu_group[dtype_str] = list(gsu)
 m_sum += original_size[0]
 n_sum += original_size[1]
 batch_sum += original_size[2]
 k_sum += original_size[3]
 
- dump_yaml(gpu_idx, gemm_group, args.tensile_config, m_sum, n_sum, batch_sum, k_sum, args.iters, {}, gsu)
+ dump_yaml(gpu_idx, gemm_group, args.tensile_config, m_sum, n_sum, batch_sum, k_sum, args.iters, {}, gsu_group)

From adba89546e8f30568dcf47363b01a4a03a3ab490 Mon Sep 17 00:00:00 2001
From: fsx950223 
Date: Mon, 2 Dec 2024 08:40:17 +0000
Subject: [PATCH 22/23] fix gsu bug: accumulate GlobalSplitU per dtype across problem sizes

---
 .../tensile_config_generator.py | 20 ++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index a588b736b3..816a0300c0 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -431,7 +431,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 if "WorkGroupMappingXCC" in item:
 item["WorkGroupMappingXCC"] = [XCC]
 if "GlobalSplitU" in item:
- item["GlobalSplitU"] = gsu_group[dtype_str]
+ item["GlobalSplitU"] = list(gsu_group[dtype_str])
 data["BenchmarkProblems"][i][0] = dtype
 data["LibraryLogic"]["DeviceNames"] = DeviceNames
 data["LibraryLogic"]["ScheduleName"] = ScheduleName
@@ -492,7 +492,9 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 if mfma_instructions is None:
 continue
 
- gsu = set()
+ if dtype_str not in gsu_group:
+ gsu_group[dtype_str] = set()
+
 matmul_instruction_found = False
 for mfma_instruction in mfma_instructions:
 size = copy.deepcopy(original_size)
@@ -507,7 +509,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 total_inst = min(len(matmul_instruction_gen) // DIV_MI, MIN_MI) # At least 5 insts and max of 33.3% of insts. 
for index, matmul_instruction in enumerate(matmul_instruction_gen): if matmul_instruction is not None: - gsu.add(calculate_gsu(matmul_instruction, size)) + gsu_group[dtype_str].add(calculate_gsu(matmul_instruction, size)) if dtype_str not in matmul_instructions: matmul_instructions[dtype_str] = dict() matmul_instructions[dtype_str][str(matmul_instruction)] = matmul_instruction @@ -516,7 +518,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it total_inst = min(len(mi_groups0) // DIV_MI, MIN_MI) for index, mi_0 in enumerate(mi_groups0): if mi_0 is not None: - gsu.add(calculate_gsu(mi_0, size)) + gsu_group[dtype_str].add(calculate_gsu(mi_0, size)) if dtype_str not in groups: groups[dtype_str] = [{},{}] groups[dtype_str][0]["MatrixInstruction"] = {} @@ -527,7 +529,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it total_inst = min(len(mi_groups1) // DIV_MI, MIN_MI) for index, mi_1 in enumerate(mi_groups1): if mi_1 is not None: - gsu.add(calculate_gsu(mi_1, size)) + gsu_group[dtype_str].add(calculate_gsu(mi_1, size)) if dtype_str not in groups: groups[dtype_str] = [{},{}] groups[dtype_str][0]["MatrixInstruction"] = {} @@ -550,7 +552,6 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it gemm_group[dtype_str].append({'Exact': list(original_size)}) else: gemm_group[dtype_str] = [{'Exact': list(original_size)}] - gsu_group[dtype_str] = list(gsu) m_sum += original_size[0] n_sum += original_size[1] batch_sum += original_size[2] @@ -603,7 +604,8 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it if mfma_instructions is None: continue - gsu = set() + if dtype_str not in gsu_group: + gsu_group[dtype_str] = set() matmul_instruction_found = False for mfma_instruction in mfma_instructions: size = copy.deepcopy(original_size) @@ -612,7 +614,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it total_inst = min(len(matmul_instruction_gen) // 3, 5) # At least 5 insts and max of 33.3% of insts. 
for index, matmul_instruction in enumerate(matmul_instruction_gen): if matmul_instruction is not None: - gsu.add(calculate_gsu(matmul_instruction, size)) + gsu_group[dtype_str].add(calculate_gsu(matmul_instruction, size)) if dtype_str not in matmul_instructions: matmul_instructions[dtype_str] = dict() matmul_instructions[dtype_str][str(matmul_instruction)] = matmul_instruction @@ -634,7 +636,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it gemm_group[dtype_str].append({'Exact': list(original_size)}) else: gemm_group[dtype_str] = [{'Exact': list(original_size)}] - gsu_group[dtype_str] = list(gsu) + m_sum += original_size[0] n_sum += original_size[1] batch_sum += original_size[2] From 82b8eb0b8ab3b137c7cdbdb3df6bfddec6880f5b Mon Sep 17 00:00:00 2001 From: fsx950223 Date: Mon, 2 Dec 2024 09:11:08 +0000 Subject: [PATCH 23/23] skip slow kernels --- .../Tensile/Utilities/tensile_generator/tuning_template.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tuning_template.yaml b/tensilelite/Tensile/Utilities/tensile_generator/tuning_template.yaml index bfefb87aae..77fbd932b6 100644 --- a/tensilelite/Tensile/Utilities/tensile_generator/tuning_template.yaml +++ b/tensilelite/Tensile/Utilities/tensile_generator/tuning_template.yaml @@ -272,6 +272,7 @@ BenchmarkProblems: InitialSolutionParameters: null JoinParameters: null GlobalParameters: + SkipSlowSolutionRatio: 0.5 BoundsCheck: false DataInitTypeAB: 12 DataInitTypeAlpha: 1
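
Note on the GlobalSplitU heuristic introduced in PATCH 20: calculate_gsu sizes GlobalSplitU from how much of the GPU a problem's output tiles can occupy. Reading the code, mt0 and mt1 act as the macro-tile extents in M and N (instruction extent times the per-dimension multipliers at indices 5/7 and 6/8 of the 9-element matmul instruction); when the M x N grid of macro-tiles needs fewer work-groups than there are CUs, the spare CUs are put to work by splitting K. Below is a minimal, self-contained sketch of that behavior — the CU count and the 9-element instruction are illustrative assumptions, not values from a tuned config:

import math

CU = 304  # assumed CU count; the script reads the real value from rocminfo

def calculate_gsu(matmul_instruction, size):
    # Macro-tile extents, exactly as in PATCH 20: indices 0/5/7 give the
    # M extent, indices 1/6/8 the N extent.
    mt0 = matmul_instruction[0] * matmul_instruction[5] * matmul_instruction[7]
    mt1 = matmul_instruction[1] * matmul_instruction[6] * matmul_instruction[8]
    # Work-groups needed to tile the M x N output (size is [M, N, batch, K]);
    # leftover CUs motivate splitting K, so GlobalSplitU >= 1.
    tiles = math.ceil(size[0] / mt0) * math.ceil(size[1] / mt1)
    return max(1, CU // tiles)

# Hypothetical instruction with a 128 x 64 macro-tile (16*4*2 by 16*4*1):
mi = [16, 16, 16, 1, 1, 4, 4, 2, 1]
print(calculate_gsu(mi, [128, 64, 1, 8192]))    # 1 tile    -> 304 // 1   = 304
print(calculate_gsu(mi, [1024, 1024, 1, 512]))  # 128 tiles -> 304 // 128 = 2

Because PATCH 22 accumulates these values into a per-dtype set, each dtype's tuning run benchmarks only the GlobalSplitU values its own problem sizes can actually exploit, rather than the fixed GSU = [1,2,3,4,...] lists removed in PATCH 20.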