From b054c2cbdfc9b83c03d004a2a1fcaa732ec32c80 Mon Sep 17 00:00:00 2001
From: fsx950223
Date: Thu, 14 Nov 2024 08:24:39 +0000
Subject: [PATCH 01/23] add fp8 tuning config

---
 .../tensile_config_generator.py               | 196 +++++++++++-------
 1 file changed, 126 insertions(+), 70 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index da8ff1859d..2f1aec0917 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -61,13 +61,17 @@
     help="If enabled, only tune the matrix instruction with min tile sizes, else, tune full matrix instructions")

 parser.add_argument(
-    "--groups", type=bool, default=True,
+    "--groups", type=bool, default=False,
     help="If enabled, will replace MatrixInstruction with GroupedMatrixInstruction")

 parser.add_argument(
     "--gridbase_config", type=str, default=None,
     help="Range config path")

+parser.add_argument(
+    "--full_mfma", type=bool, default=False,
+    help="If enabled, will search for all mfma instructions")
+
 args = parser.parse_args()

 NUM_WARM_UP = 20
@@ -81,6 +85,8 @@
 res = subprocess.run("rocminfo | grep Compute", stdout=subprocess.PIPE, shell=True, env={"ROCR_VISIBLE_DEVICES":"0"})
 match = re.search(CU_RE, res.stdout.decode("utf-8").split('\n')[-2])
 NUM_STAGES = 8
+DIV_MI = 3 # 33.3%
+MIN_MI = 5 # min 5 solutions
 CU = 0
 if match:
     CU = int(match.group('COMPUTE_UNIT').strip())
@@ -103,10 +109,18 @@
 DeviceNames = ["Device 0050", "Device 0051", "Device 0052", "Device 0054", "Device 0062", "Device 7400", "Device 740c"]
 ScheduleName = "aldebaran"

-fp16_instructions = [[16,16,16,1]]
-bf16_instructions = [[16,16,8,1]]
-tf32_instructions = [[16,16,8,1]]
-fp32_instructions = [[16,16,4,1]]
+if args.full_mfma:
+    fp16_instructions = [[32,32,4,2], [32,32,8,1], [16,16,4,4], [16,16,16,1], [4,4,4,16]]
+    bf16_instructions = [[32,32,4,2], [32,32,8,1], [16,16,4,4], [16,16,16,1], [4,4,4,16]]
+    tf32_instructions = [[32,32,2,2], [32,32,4,1], [16,16,2,4], [16,16,8,1], [4,4,2,16]]
+    fp32_instructions = [[32,32,1,2], [32,32,2,1], [16,16,1,4], [16,16,4,1], [4,4,1,16]]
+    fp8_instructions = [[32,32,16,1], [16,16,32,1]]
+else:
+    fp16_instructions = [[16,16,16,1]]
+    bf16_instructions = [[16,16,16,1],[32,32,8,1]]
+    tf32_instructions = [[16,16,8,1]]
+    fp32_instructions = [[16,16,4,1]]
+    fp8_instructions = [[16,16,32,1]]

 HIPBLASLT_BENCH_RE = (
     r"(?P<FUNCTION>\w+) --api_method c "
     r"-m (?P<M>[\d ]+)"
     r"-n (?P<N>[\d ]+)"
     r"-k (?P<K>[\d ]+)"
     r"--lda (?P<LDA>[\d ]+)"
     r"--ldb (?P<LDB>[\d ]+)"
     r"--ldc (?P<LDC>[\d ]+)"
     r"--ldd (?P<LDD>[\d ]+)"
     r"--stride_a (?P<STRIDE_A>[\d ]+)"
     r"--stride_b (?P<STRIDE_B>[\d ]+)"
     r"--stride_c (?P<STRIDE_C>[\d ]+)"
     r"--stride_d (?P<STRIDE_D>[\d ]+)"
     r"--alpha (?P<ALPHA>[\d\. ]+)"
     r"--beta (?P<BETA>[\d\. ]+)"
     r"--transA (?P<TRANS_A>[\w ]+)"
     r"--transB (?P<TRANS_B>[\w ]+)"
     r"--batch_count (?P<BATCH_COUNT>[\d ]+)"
     r"--a_type (?P<A_TYPE>[\w ]+)"
     r"--b_type (?P<B_TYPE>[\w ]+)"
     r"--c_type (?P<C_TYPE>[\w ]+)"
     r"--d_type (?P<D_TYPE>[\w ]+)"
     r"--scale_type (?P<SCALE_TYPE>[\w ]+)"
     r"--bias_type (?P<BIAS_TYPE>[\w ]+)"
     r"--compute_type (?P<COMPUTE_TYPE>[\w ]+)")

+HIPBLASLT_BENCH_RE2 = (
+    r"(?P<FUNCTION>\w+) --api_method c "
+    r"-m (?P<M>[\d ]+)"
+    r"-n (?P<N>[\d ]+)"
+    r"-k (?P<K>[\d ]+)"
+    r"--lda (?P<LDA>[\d ]+)"
+    r"--ldb (?P<LDB>[\d ]+)"
+    r"--ldc (?P<LDC>[\d ]+)"
+    r"--ldd (?P<LDD>[\d ]+)"
+    r"--stride_a (?P<STRIDE_A>[\d ]+)"
+    r"--stride_b (?P<STRIDE_B>[\d ]+)"
+    r"--stride_c (?P<STRIDE_C>[\d ]+)"
+    r"--stride_d (?P<STRIDE_D>[\d ]+)"
+    r"--alpha (?P<ALPHA>[\d\. ]+)"
+    r"--beta (?P<BETA>[\d\. ]+)"
+    r"--transA (?P<TRANS_A>[\w ]+)"
+    r"--transB (?P<TRANS_B>[\w ]+)"
+    r"--batch_count (?P<BATCH_COUNT>[\d ]+)"
+    r"--scaleA (?P<SCALE_A>[\w ]+)"
+    r"--scaleB (?P<SCALE_B>[\w ]+)"
+    r"--bias_vector --bias_source (?P<BIAS_SOURCE>[\w ]+)"
+    r"--a_type (?P<A_TYPE>[\w ]+)"
+    r"--b_type (?P<B_TYPE>[\w ]+)"
+    r"--c_type (?P<C_TYPE>[\w ]+)"
+    r"--d_type (?P<D_TYPE>[\w ]+)"
+    r"--scale_type (?P<SCALE_TYPE>[\w ]+)"
+    r"--bias_type (?P<BIAS_TYPE>[\w ]+)"
+    r"--compute_type (?P<COMPUTE_TYPE>[\w ]+)")
+
 # Function to extract problem sizes from a line
 def extract_problem_size(match):
     return [int(match.group('M').strip()), int(match.group('N').strip()), int(match.group('BATCH_COUNT').strip()), int(match.group('K').strip())]
@@ -149,6 +192,8 @@ def instruction_map(dtype_dict):
         return fp16_instructions
     elif dtype_dict["DataType"] == 'B':
         return bf16_instructions
+    elif dtype_dict["DataType"] == 'F8':
+        return fp8_instructions
     else:
         return None
@@ -161,6 +206,8 @@ def datatype_map(dtype):
         return "XS"
     elif dtype == "bf16_r":
         return "B"
+    elif dtype == "f8_r":
+        return "F8"
     else:
         return None
@@ -178,7 +225,7 @@ def extract_dtype(match):
     ComputeDataType = datatype_map(match.group('COMPUTE_TYPE').strip())
     TransposeA = trans_map(match.group('TRANS_A').strip())
     TransposeB = trans_map(match.group('TRANS_B').strip())
-    if DataType in ["H", "B"]:
+    if DataType in ["H", "B", "F8"]:
         HighPrecisionAccumulate = True
     else:
         HighPrecisionAccumulate = False
@@ -343,9 +390,14 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 # Read problem sizes from the input file
 with open(args.hipblaslt_log, 'r') as f:
     for line in f:
-        match = re.search(
-            HIPBLASLT_BENCH_RE, line
-        )
+        if 'f8_r' in line:
+            match = re.search(
+                HIPBLASLT_BENCH_RE2, line
+            )
+        else:
+            match = re.search(
+                HIPBLASLT_BENCH_RE, line
+            )
         if match:
             if line in unique_gemms:
                 unique_gemms[line] += 1
@@ -369,10 +421,16 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
     n_sum = 0
     batch_sum = 0
     k_sum = 0
+
     for k, v in unique_gemms_subgroup:
-        match = re.search(
-            HIPBLASLT_BENCH_RE, k
-        )
+        if 'f8_r' in k:
+            match = re.search(
+                HIPBLASLT_BENCH_RE2, k
+            )
+        else:
+            match = re.search(
+                HIPBLASLT_BENCH_RE, k
+            )
         if match:
             size = extract_problem_size(match)
@@ -382,51 +440,49 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
             dtype_str = json.dumps(dtype)
             if mfma_instructions is None:
                 continue
+
             mfma_instruction_found = False
-            mfma_instruction = mfma_instructions[0]
-            for _ in range(NUM_STAGES):
-                matmul_instruction_gen = list(find_matmul_instruction(mfma_instruction, size))
-                if args.groups:
-                    mi_groups0, mi_groups1, matmul_instruction_gen = get_groups(matmul_instruction_gen)
-                else:
-                    mi_groups0 = []
-                    mi_groups1 = []
-
-                DIV_MI = 3 # 33.3%
-                MIN_MI = 5 # min 5 solutions
-
-                total_inst = min(len(matmul_instruction_gen) // DIV_MI, MIN_MI) # At least 5 insts and max of 33.3% of insts.
-                for index, matmul_instruction in enumerate(matmul_instruction_gen):
-                    if matmul_instruction is not None:
-                        if dtype_str not in matmul_instructions:
-                            matmul_instructions[dtype_str] = dict()
-                        matmul_instructions[dtype_str][str(matmul_instruction)] = matmul_instruction
+            for mfma_instruction in mfma_instructions:
+                for _ in range(NUM_STAGES):
+                    matmul_instruction_gen = list(find_matmul_instruction(mfma_instruction, size))
+                    if args.groups:
+                        mi_groups0, mi_groups1, matmul_instruction_gen = get_groups(matmul_instruction_gen)
+                    else:
+                        mi_groups0 = []
+                        mi_groups1 = []
+
+                    total_inst = min(len(matmul_instruction_gen) // DIV_MI, MIN_MI) # At least 5 insts and max of 33.3% of insts.
+                    for index, matmul_instruction in enumerate(matmul_instruction_gen):
+                        if matmul_instruction is not None:
+                            if dtype_str not in matmul_instructions:
+                                matmul_instructions[dtype_str] = dict()
+                            matmul_instructions[dtype_str][str(matmul_instruction)] = matmul_instruction
+                        if args.fast and (index > total_inst):
+                            break
+                    total_inst = min(len(mi_groups0) // DIV_MI, MIN_MI)
+                    for index, mi_0 in enumerate(mi_groups0):
+                        if dtype_str not in groups:
+                            groups[dtype_str] = [{},{}]
+                            groups[dtype_str][0]["MatrixInstruction"] = {}
+                            groups[dtype_str][1]["MatrixInstruction"] = {}
+                        groups[dtype_str][0]["MatrixInstruction"][str(mi_0)] = mi_0
                         if args.fast and (index > total_inst):
                             break
-                total_inst = min(len(mi_groups0) // DIV_MI, MIN_MI)
-                for index, mi_0 in enumerate(mi_groups0):
-                    if dtype_str not in groups:
-                        groups[dtype_str] = [{},{}]
-                        groups[dtype_str][0]["MatrixInstruction"] = {}
-                        groups[dtype_str][1]["MatrixInstruction"] = {}
-                    groups[dtype_str][0]["MatrixInstruction"][str(mi_0)] = mi_0
-                    if args.fast and (index > total_inst):
-                        break
-                total_inst = min(len(mi_groups1) // DIV_MI, MIN_MI)
-                for index, mi_1 in enumerate(mi_groups1):
-                    if dtype_str not in groups:
-                        groups[dtype_str] = [{},{}]
-                        groups[dtype_str][0]["MatrixInstruction"] = {}
-                        groups[dtype_str][1]["MatrixInstruction"] = {}
-                    groups[dtype_str][1]["MatrixInstruction"][str(mi_1)] = mi_1
-                    if args.fast and (index > total_inst):
+                    total_inst = min(len(mi_groups1) // DIV_MI, MIN_MI)
+                    for index, mi_1 in enumerate(mi_groups1):
+                        if dtype_str not in groups:
+                            groups[dtype_str] = [{},{}]
+                            groups[dtype_str][0]["MatrixInstruction"] = {}
+                            groups[dtype_str][1]["MatrixInstruction"] = {}
+                        groups[dtype_str][1]["MatrixInstruction"][str(mi_1)] = mi_1
+                        if args.fast and (index > total_inst):
+                            break
+                    if len(matmul_instruction_gen) > 0 or len(mi_groups0) > 0 or len(mi_groups1) > 0:
+                        mfma_instruction_found = True
                         break
-                if len(matmul_instruction_gen) > 0 or len(mi_groups0) > 0 or len(mi_groups1) > 0:
-                    mfma_instruction_found = True
-                    break
-                else:
-                    max_dim = int(np.argmax(size))
-                    size[max_dim] = size[max_dim] // 2
+                    else:
+                        max_dim = int(np.argmax(size))
+                        size[max_dim] = size[max_dim] // 2

             if not mfma_instruction_found:
                 print(f"Can't find mfma instructions for {original_size}, please contact hipblaslt expert")
@@ -494,23 +550,23 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
         mfma_instructions = instruction_map(dtype)
         if mfma_instructions is None:
             continue
         mfma_instruction_found = False
-        mfma_instruction = mfma_instructions[0]
-        for _ in range(NUM_STAGES):
-            matmul_instruction_gen = list(find_matmul_instruction(mfma_instruction, size))
-            total_inst = min(len(matmul_instruction_gen) // 3, 5) # At least 5 insts and max of 33.3% of insts.
-            for index, matmul_instruction in enumerate(matmul_instruction_gen):
-                if matmul_instruction is not None:
-                    if dtype_str not in matmul_instructions:
-                        matmul_instructions[dtype_str] = dict()
-                    matmul_instructions[dtype_str][str(matmul_instruction)] = matmul_instruction
-                if args.fast and (index > total_inst):
-                    break
-            if len(matmul_instruction_gen) > 0:
-                mfma_instruction_found = True
-                break
-            else:
-                max_dim = int(np.argmax(size))
-                size[max_dim] = size[max_dim] // 2
+        for mfma_instruction in mfma_instructions:
+            for _ in range(NUM_STAGES):
+                matmul_instruction_gen = list(find_matmul_instruction(mfma_instruction, size))
+                total_inst = min(len(matmul_instruction_gen) // 3, 5) # At least 5 insts and max of 33.3% of insts.
+                for index, matmul_instruction in enumerate(matmul_instruction_gen):
+                    if matmul_instruction is not None:
+                        if dtype_str not in matmul_instructions:
+                            matmul_instructions[dtype_str] = dict()
+                        matmul_instructions[dtype_str][str(matmul_instruction)] = matmul_instruction
+                    if args.fast and (index > total_inst):
+                        break
+                if len(matmul_instruction_gen) > 0:
+                    mfma_instruction_found = True
+                    break
+                else:
+                    max_dim = int(np.argmax(size))
+                    size[max_dim] = size[max_dim] // 2
     if not mfma_instruction_found:
         print(f"Can't find mfma instructions for {original_size}, please contact hipblaslt expert")
     else:
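Editor's note: everything downstream of this patch hinges on the bench-log regexes above, whose `(?P<NAME>...)` group names had to be reconstructed here from the `match.group(...)` calls. A minimal runnable sketch of the matching flow, with the pattern trimmed to the four fields `extract_problem_size()` actually uses:

```python
import re

# Trimmed stand-in for HIPBLASLT_BENCH_RE; group names reconstructed, not verbatim.
BENCH_RE = (
    r"(?P<FUNCTION>\w+) --api_method c "
    r"-m (?P<M>[\d ]+)"
    r"-n (?P<N>[\d ]+)"
    r"-k (?P<K>[\d ]+)"
)

line = "hipblaslt-bench --api_method c -m 128 -n 32 -k 9216 --lda 9216 "
m = re.search(BENCH_RE, line)
if m:
    # extract_problem_size() builds its [M, N, BATCH, K] list the same way
    print([int(m.group(g).strip()) for g in ("M", "N", "K")])  # [128, 32, 9216]
```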
From 4e4952328d23bd8b00604bf1beb7508a671a8a19 Mon Sep 17 00:00:00 2001
From: fsx950223
Date: Fri, 15 Nov 2024 08:26:07 +0000
Subject: [PATCH 02/23] fix bugs

---
 .../Utilities/tensile_generator/tensile_config_generator.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index 2f1aec0917..8566009449 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -84,7 +84,7 @@
 ArchitectureName = res.stdout.decode("utf-8").strip()
 res = subprocess.run("rocminfo | grep Compute", stdout=subprocess.PIPE, shell=True, env={"ROCR_VISIBLE_DEVICES":"0"})
 match = re.search(CU_RE, res.stdout.decode("utf-8").split('\n')[-2])
-NUM_STAGES = 8
+NUM_STAGES = 32
 DIV_MI = 3 # 33.3%
 MIN_MI = 5 # min 5 solutions
 CU = 0
@@ -246,7 +246,7 @@ def find_matmul_instruction(mfma_instruction, size):
             continue
         for n_tiles in reversed(range(1, CU+1)):
             n_tile_size = size[1] // n_tiles
-            if n_tile_size > 256:
+            if n_tile_size > 256 // m_tile_size:
                 continue
             wave_tile_n = math.ceil(n_tile_size / mfma_instruction[1])
             if wave_tile_n <= 0:
                 continue
@@ -307,7 +307,7 @@
     batch_avg = batch_sum / len(unique_gemms_subgroup)
     k_avg = k_sum / len(unique_gemms_subgroup)

-    return (ENQUEUES_PER_SYNC + args.iters) * m_avg * n_avg * batch_avg * k_avg / 2
+    return (ENQUEUES_PER_SYNC + iters) * m_avg * n_avg * batch_avg * k_avg / 2

 def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, iters, groups):
     MinFlopsPerSync = calculate_min_flops(m_sum, n_sum, batch_sum, k_sum, iters)

From ed12d74006f22f8db3995d8d1870191b26da4f06 Mon Sep 17 00:00:00 2001
From: fsx950223
Date: Fri, 15 Nov 2024 10:04:22 +0000
Subject: [PATCH 03/23] fix arch name for mi308 and mi210

---
 tensilelite/Tensile/Utilities/tensile_generator/README.md   | 5 +----
 .../Utilities/tensile_generator/tensile_config_generator.py | 2 +-
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/README.md b/tensilelite/Tensile/Utilities/tensile_generator/README.md
index 183307eb06..dbf56a39f5 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/README.md
+++ b/tensilelite/Tensile/Utilities/tensile_generator/README.md
@@ -60,15 +60,12 @@ To use the `tensile_config_generator.py` script, follow these steps:
    MI308:
    - Modify yamls under ```/3_LibraryLogic/```.
    ```- gfx942 ->
    - {Architecture: gfx942, CUCount: {20|80}}```
    - For cpx, use the gfx942_20cu folder; for spx, use the gfx942_80cu folder.
    ```
    python3 ./tensilelite/Tensile/Utilities/merge.py --no_eff library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/{gfx942_20cu|gfx942_80cu}/{Equality|GridBased}/ /3_LibraryLogic/ library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/{gfx942_20cu|gfx942_80cu}/{Equality|GridBased}/
    ```
    MI210:
-   - Modify yamls under ```/3_LibraryLogic/```.
-   ```- gfx90a ->
-   - {Architecture: gfx90a, CUCount: 104}```
+
    ```
    python3 ./tensilelite/Tensile/Utilities/merge.py --no_eff library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aldebaran/104CU/{Equality|GridBased}/ /3_LibraryLogic/ library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aldebaran/104CU/{Equality|GridBased}/
    ```

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index 8566009449..973df6961c 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -375,7 +375,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
             data["BenchmarkProblems"][i][0] = dtype
     data["LibraryLogic"]["DeviceNames"] = DeviceNames
     data["LibraryLogic"]["ScheduleName"] = ScheduleName
-    data["LibraryLogic"]["ArchitectureName"] = ArchitectureName
+    data["LibraryLogic"]["ArchitectureName"] = {"Architecture": ArchitectureName, "CUCount": CU}
    data["LibraryLogic"]["LibraryType"] = LibraryType
     # Write the updated YAML file
     yaml_file = os.path.basename(yaml_file)
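Editor's note: the change above makes the generated logic YAML carry a CU count alongside the architecture, which is what the README's manual "Modify yamls" step used to do by hand. A rough sketch of the LibraryLogic block dump_yaml() now emits (the gfx942/80 values are made-up detection results for illustration; requires PyYAML):

```python
import yaml

ArchitectureName, CU = "gfx942", 80  # hypothetical rocminfo detection results
library_logic = {
    "ScheduleName": "aldebaran",
    "ArchitectureName": {"Architecture": ArchitectureName, "CUCount": CU},
}
print(yaml.dump(library_logic))
# ArchitectureName:
#   Architecture: gfx942
#   CUCount: 80
# ScheduleName: aldebaran
```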
]+)" + r"--transA (?P[\w ]+)" + r"--transB (?P[\w ]+)" + r"--batch_count (?P[\d ]+)" + r"--bias_vector --bias_source (?P[\w ]+)" + r"--a_type (?P[\w ]+)" + r"--b_type (?P[\w ]+)" + r"--c_type (?P[\w ]+)" + r"--d_type (?P[\w ]+)" + r"--scale_type (?P[\w ]+)" + r"--bias_type (?P[\w ]+)" + r"--compute_type (?P[\w ]+)") -HIPBLASLT_BENCH_RE2 = ( +HIPBLASLT_BENCH_RE_SAB_BIAS = ( r"(?P\w+) --api_method c " r"-m (?P[\d ]+)" r"-n (?P[\d ]+)" @@ -213,18 +267,33 @@ def datatype_map(dtype): def trans_map(trans): if trans == "T": - return True + return 1 elif trans == "N": - return False + return 0 else: return None +def bias_datatype_map(dtype): + if dtype == "f16_r": + return [datatype_map('f32_r'), datatype_map('f16_r')] + elif dtype == "f32_r": + return [datatype_map('f32_r')] + elif dtype == "xf32_r": + return [datatype_map('f32_r'), datatype_map('xf32_r')] + elif dtype == "bf16_r": + return [datatype_map('f32_r'), datatype_map('bf16_r')] + elif dtype == "f8_r": + return [datatype_map('f8_r')] + else: + return [] + def extract_dtype(match): - DataType = datatype_map(match.group('A_TYPE').strip()) - DestDataType = datatype_map(match.group('C_TYPE').strip()) - ComputeDataType = datatype_map(match.group('COMPUTE_TYPE').strip()) - TransposeA = trans_map(match.group('TRANS_A').strip()) - TransposeB = trans_map(match.group('TRANS_B').strip()) + gdict = match.groupdict() + DataType = datatype_map(gdict.get('A_TYPE', '').strip()) + DestDataType = datatype_map(gdict.get('C_TYPE', '').strip()) + ComputeDataType = datatype_map(gdict.get('COMPUTE_TYPE', '').strip()) + TransposeA = trans_map(gdict.get('TRANS_A', '').strip()) + TransposeB = trans_map(gdict.get('TRANS_B', '').strip()) if DataType in ["H", "B", "F8"]: HighPrecisionAccumulate = True else: @@ -233,7 +302,16 @@ def extract_dtype(match): if ComputeDataType == "XS": ComputeDataType = "S" F32XdlMathOp = 'x' - return {"Batched": True, "DataType": DataType, "DestDataType": DestDataType, "ComputeDataType": ComputeDataType, "TransposeA": TransposeA, "TransposeB": TransposeB, "HighPrecisionAccumulate": HighPrecisionAccumulate, "F32XdlMathOp": F32XdlMathOp, "OperationType": "GEMM", "UseBeta": True} + res = {"Batched": True, "DataType": DataType, "DestDataType": DestDataType, "ComputeDataType": ComputeDataType, "TransposeA": TransposeA, "TransposeB": TransposeB, "HighPrecisionAccumulate": HighPrecisionAccumulate, "F32XdlMathOp": F32XdlMathOp, "OperationType": "GEMM", "UseBeta": True} + + if gdict.get("BIAS_SOURCE"): + res["UseBias"] = 1 + res["BiasSrc"] = gdict.get('BIAS_SOURCE', '').strip().upper() + res["BiasDataTypeList"] = list(bias_datatype_map(gdict.get("BIAS_TYPE", '').strip())) + if gdict.get("SCALE_A") is not None and gdict.get("SCALE_B") is not None: + res["UseScaleAB"] = "Scalar" + res["UseScaleAlphaVec"] = 1 + return res def find_matmul_instruction(mfma_instruction, size): for bm in range(int(math.log(mfma_instruction[3],2))+1): @@ -282,6 +360,25 @@ def get_groups(matmul_instruction_gen): mi_left.append(mi) return mi_groups0, mi_groups1, mi_left +def match_pattern(line): + if 'bias_vector' in line and 'scaleA' in line and 'scaleB' in line: + match = re.search( + HIPBLASLT_BENCH_RE_SAB_BIAS, line + ) + elif 'bias_vector' in line: + match = re.search( + HIPBLASLT_BENCH_RE_BIAS, line + ) + elif 'scaleA' in line and 'scaleB' in line: + match = re.search( + HIPBLASLT_BENCH_RE_SAB, line + ) + else: + match = re.search( + HIPBLASLT_BENCH_RE, line + ) + return match + def extract_range(data): shapes = [] if 'Exact' in data: @@ -326,6 +423,8 @@ def 
dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it if i >= len(data["BenchmarkProblems"]): data["BenchmarkProblems"].append(copy.deepcopy(data["BenchmarkProblems"][0])) data["BenchmarkProblems"][i][1]["BenchmarkFinalParameters"][0]["ProblemSizes"] = gemm_group[dtype_str] + if "BiasDataTypeList" in dtype: + data["BenchmarkProblems"][i][1]["BenchmarkFinalParameters"].append({"BiasTypeArgs": list(dtype["BiasDataTypeList"])}) # Add groupd here if needed group_params = [[]] @@ -390,14 +489,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it # Read problem sizes from the input file with open(args.hipblaslt_log, 'r') as f: for line in f: - if 'f8_r' in line: - match = re.search( - HIPBLASLT_BENCH_RE2, line - ) - else: - match = re.search( - HIPBLASLT_BENCH_RE, line - ) + match = match_pattern(line) if match: if line in unique_gemms: unique_gemms[line] += 1 @@ -423,15 +515,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it k_sum = 0 for k, v in unique_gemms_subgroup: - if 'f8_r' in k: - match = re.search( - HIPBLASLT_BENCH_RE2, k - ) - else: - match = re.search( - HIPBLASLT_BENCH_RE, k - ) - + match = match_pattern(k) if match: size = extract_problem_size(match) original_size = copy.deepcopy(size) @@ -440,7 +524,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it dtype_str = json.dumps(dtype) if mfma_instructions is None: continue - + mfma_instruction_found = False for mfma_instruction in mfma_instructions: for _ in range(NUM_STAGES): From 14a4fc66be909eb24f6e6fb02174e0fed4b7ed46 Mon Sep 17 00:00:00 2001 From: fsx950223 Date: Mon, 18 Nov 2024 07:10:00 +0000 Subject: [PATCH 05/23] add full mi --- .../Utilities/tensile_generator/README.md | 6 +++--- .../tensile_config_generator.py | 19 ++++++++++++------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/tensilelite/Tensile/Utilities/tensile_generator/README.md b/tensilelite/Tensile/Utilities/tensile_generator/README.md index dbf56a39f5..d4b4fdeb84 100644 --- a/tensilelite/Tensile/Utilities/tensile_generator/README.md +++ b/tensilelite/Tensile/Utilities/tensile_generator/README.md @@ -48,7 +48,7 @@ To use the `tensile_config_generator.py` script, follow these steps: 3. Install hipBLASLt and Tensile (change the path to the hipBLASLt repo): ``` - bash ./install.sh -idc -a $(/opt/rocm/llvm/bin/offload-arch) --keep-build-tmp + bash ./install.sh -idc -a $(/opt/rocm/llvm/bin/offload-arch) --cpu_ref_lib=lapack ``` 4. Tune GEMM kernels using the generated YAML files: @@ -65,14 +65,14 @@ To use the `tensile_config_generator.py` script, follow these steps: python3 ./tensilelite/Tensile/Utilities/merge.py --no_eff library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/{gfx942_20cu|gfx942_80cu}/{Equality|GridBased}/ /3_LibraryLogic/ library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/{gfx942_20cu|gfx942_80cu}/{Equality|GridBased}/ ``` MI210: - + ``` python3 ./tensilelite/Tensile/Utilities/merge.py --no_eff library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aldebaran/104CU/{Equality|GridBased}/ /3_LibraryLogic/ library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aldebaran/104CU/{Equality|GridBased}/ ``` 6. 
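Editor's note: the switch to `match.groupdict()` above is what lets one `extract_dtype()` serve all four pattern variants. A toy demonstration of the mechanism (names mirror the patch; the patterns are cut down so it runs standalone):

```python
import re

# A pattern variant without the bias group simply yields no BIAS_SOURCE key,
# so gdict.get() keeps a single code path for every regex variant.
pat_bias = r"--bias_vector --bias_source (?P<BIAS_SOURCE>\w+)"
pat_plain = r"--transA (?P<TRANS_A>\w+)"

gdict = re.search(pat_plain, "--transA T").groupdict()
print(gdict.get("BIAS_SOURCE"))  # None -> the bias branch is skipped

gdict = re.search(pat_bias, "--bias_vector --bias_source d").groupdict()
print(gdict["BIAS_SOURCE"].upper())  # 'D' -> becomes res["BiasSrc"]
```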
From 14a4fc66be909eb24f6e6fb02174e0fed4b7ed46 Mon Sep 17 00:00:00 2001
From: fsx950223
Date: Mon, 18 Nov 2024 07:10:00 +0000
Subject: [PATCH 05/23] add full mi

---
 .../Utilities/tensile_generator/README.md |  6 +++---
 .../tensile_config_generator.py           | 19 ++++++++++++-------
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/README.md b/tensilelite/Tensile/Utilities/tensile_generator/README.md
index dbf56a39f5..d4b4fdeb84 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/README.md
+++ b/tensilelite/Tensile/Utilities/tensile_generator/README.md
@@ -48,7 +48,7 @@ To use the `tensile_config_generator.py` script, follow these steps:

 3. Install hipBLASLt and Tensile (change the path to the hipBLASLt repo):
    ```
-   bash ./install.sh -idc -a $(/opt/rocm/llvm/bin/offload-arch) --keep-build-tmp
+   bash ./install.sh -idc -a $(/opt/rocm/llvm/bin/offload-arch) --cpu_ref_lib=lapack
    ```

 4. Tune GEMM kernels using the generated YAML files:
@@ -65,14 +65,14 @@ To use the `tensile_config_generator.py` script, follow these steps:
    python3 ./tensilelite/Tensile/Utilities/merge.py --no_eff library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/{gfx942_20cu|gfx942_80cu}/{Equality|GridBased}/ /3_LibraryLogic/ library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/{gfx942_20cu|gfx942_80cu}/{Equality|GridBased}/
    ```
    MI210:
-
+
    ```
    python3 ./tensilelite/Tensile/Utilities/merge.py --no_eff library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aldebaran/104CU/{Equality|GridBased}/ /3_LibraryLogic/ library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aldebaran/104CU/{Equality|GridBased}/
    ```

 6. Rebuild hipBLASLt with the merged results:
    ```
-   bash ./install.sh -idc -a $(/opt/rocm/llvm/bin/offload-arch) --keep-build-tmp
+   bash ./install.sh -idc -a $(/opt/rocm/llvm/bin/offload-arch) --cpu_ref_lib=lapack
    ```

 For more detailed information on the script's functionality and advanced usage, please refer to the comments within the `tensile_config_generator.py` file.

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index 244f84dfd8..d9e260d54a 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
 parser.add_argument(
     "--full_mfma", type=bool, default=False,
     help="If enabled, will search for all mfma instructions")

+parser.add_argument(
+    "--full_mi", type=bool, default=False,
+    help="If enabled, will search for all mi instructions")
+
 args = parser.parse_args()

             if mfma_instructions is None:
                 continue
-            mfma_instruction_found = False
+            matmul_instruction_found = False
             for mfma_instruction in mfma_instructions:
                 for _ in range(NUM_STAGES):

                     if len(matmul_instruction_gen) > 0 or len(mi_groups0) > 0 or len(mi_groups1) > 0:
-                        mfma_instruction_found = True
-                        break
+                        matmul_instruction_found = True
+                        if not args.full_mi:
+                            break
                     else:
                         max_dim = int(np.argmax(size))
                         size[max_dim] = size[max_dim] // 2

-            if not mfma_instruction_found:
+            if not matmul_instruction_found:
                 print(f"Can't find mfma instructions for {original_size}, please contact hipblaslt expert")
             else:
                 if dtype_str in gemm_group:

         mfma_instructions = instruction_map(dtype)
         if mfma_instructions is None:
             continue
-        mfma_instruction_found = False
+        matmul_instruction_found = False
         for mfma_instruction in mfma_instructions:
             for _ in range(NUM_STAGES):
                 matmul_instruction_gen = list(find_matmul_instruction(mfma_instruction, size))

                     if args.fast and (index > total_inst):
                         break
                 if len(matmul_instruction_gen) > 0:
-                    mfma_instruction_found = True
+                    matmul_instruction_found = True
                     break
                 else:
                     max_dim = int(np.argmax(size))
                     size[max_dim] = size[max_dim] // 2
-    if not mfma_instruction_found:
+    if not matmul_instruction_found:
         print(f"Can't find mfma instructions for {original_size}, please contact hipblaslt expert")
     else:
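Editor's note: a caveat worth knowing about the `type=bool` flags used throughout this series (`--groups`, `--full_mfma`, `--full_mi`): argparse passes the raw string through `bool()`, and any non-empty string is truthy, so `--full_mfma False` still enables the option. A short sketch of the usual workaround:

```python
import argparse

parser = argparse.ArgumentParser()
# store_true gives a real on/off switch: absent -> False, present -> True.
parser.add_argument("--full_mfma", action="store_true",
                    help="If enabled, will search for all mfma instructions")

print(parser.parse_args([]).full_mfma)               # False
print(parser.parse_args(["--full_mfma"]).full_mfma)  # True
```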
help="If enabled, will search for all mfma instructions") parser.add_argument( - "--full_mi", type=bool, default=False, + "--full_stage", type=bool, default=False, help="If enabled, will search for all mi instructions") +parser.add_argument( + "--num_stages", type=int, default=8, + help="How many times to divide matrix") + args = parser.parse_args() NUM_WARM_UP = 20 @@ -88,7 +92,7 @@ ArchitectureName = res.stdout.decode("utf-8").strip() res = subprocess.run("rocminfo | grep Compute", stdout=subprocess.PIPE, shell=True, env={"ROCR_VISIBLE_DEVICES":"0"}) match = re.search(CU_RE, res.stdout.decode("utf-8").split('\n')[-2]) -NUM_STAGES = 32 +NUM_STAGES = args.num_stages DIV_MI = 3 # 33.3% MIN_MI = 5 # min 5 solutions CU = 0 @@ -124,7 +128,7 @@ bf16_instructions = [[16,16,16,1],[32,32,8,1]] tf32_instructions = [[16,16,8,1]] fp32_instructions = [[16,16,4,1]] - fp8_instructions = [[16,16,32,1]] + fp8_instructions = [[32,32,16,1], [16,16,32,1]] HIPBLASLT_BENCH_RE = ( @@ -328,7 +332,7 @@ def find_matmul_instruction(mfma_instruction, size): continue for n_tiles in reversed(range(1, CU+1)): n_tile_size = size[1] // n_tiles - if n_tile_size > 256 // m_tile_size: + if n_tile_size > 256: continue wave_tile_n = math.ceil(n_tile_size / mfma_instruction[1]) if wave_tile_n <= 0: @@ -567,10 +571,10 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it break if len(matmul_instruction_gen) > 0 or len(mi_groups0) > 0 or len(mi_groups1) > 0: matmul_instruction_found = True - if not args.full_mi: + if not args.full_stage: break else: - max_dim = int(np.argmax(size)) + max_dim = int(np.argmax(size[:2])) size[max_dim] = size[max_dim] // 2 if not matmul_instruction_found: From 3c5fb0a47218242f100b76662c465cd391141878 Mon Sep 17 00:00:00 2001 From: fsx950223 Date: Tue, 19 Nov 2024 06:05:12 +0000 Subject: [PATCH 07/23] fix bugs --- .../tensile_generator/tensile_config_generator.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py index 1789ef49c9..fa2dbb3509 100644 --- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py +++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py @@ -95,6 +95,7 @@ NUM_STAGES = args.num_stages DIV_MI = 3 # 33.3% MIN_MI = 5 # min 5 solutions +NONTEMPORALRATIO = 8 CU = 0 if match: CU = int(match.group('COMPUTE_UNIT').strip()) @@ -283,13 +284,13 @@ def trans_map(trans): def bias_datatype_map(dtype): if dtype == "f16_r": - return [datatype_map('f32_r'), datatype_map('f16_r')] + return [datatype_map('f16_r')] elif dtype == "f32_r": return [datatype_map('f32_r')] elif dtype == "xf32_r": - return [datatype_map('f32_r'), datatype_map('xf32_r')] + return [datatype_map('xf32_r')] elif dtype == "bf16_r": - return [datatype_map('f32_r'), datatype_map('bf16_r')] + return [datatype_map('bf16_r')] elif dtype == "f8_r": return [datatype_map('f8_r')] else: @@ -352,7 +353,6 @@ def find_matmul_instruction(mfma_instruction, size): def get_groups(matmul_instruction_gen): # Extract skinny MTs for Groups - NONTEMPORALRATIO = 8 mi_groups0 = [] mi_groups1 = [] mi_left = [] From 7e4a03547e78cc5ff4683169e53b016aae888e75 Mon Sep 17 00:00:00 2001 From: fsx950223 Date: Tue, 19 Nov 2024 08:21:41 +0000 Subject: [PATCH 08/23] add activation pattern --- .../tensile_config_generator.py | 245 ++++++++++++------ 1 file changed, 159 insertions(+), 86 deletions(-) diff 
From 7e4a03547e78cc5ff4683169e53b016aae888e75 Mon Sep 17 00:00:00 2001
From: fsx950223
Date: Tue, 19 Nov 2024 08:21:41 +0000
Subject: [PATCH 08/23] add activation pattern

---
 .../tensile_config_generator.py | 245 ++++++++++++------
 1 file changed, 159 insertions(+), 86 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index fa2dbb3509..c692eef4b4 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -131,8 +131,7 @@
     fp32_instructions = [[16,16,4,1]]
     fp8_instructions = [[32,32,16,1], [16,16,32,1]]

-
-HIPBLASLT_BENCH_RE = (
+HIPBLASLT_BENCH_BASE = (
     r"(?P<FUNCTION>\w+) --api_method c "
     r"-m (?P<M>[\d ]+)"
     r"-n (?P<N>[\d ]+)"
     r"-k (?P<K>[\d ]+)"
     r"--lda (?P<LDA>[\d ]+)"
     r"--ldb (?P<LDB>[\d ]+)"
     r"--ldc (?P<LDC>[\d ]+)"
     r"--ldd (?P<LDD>[\d ]+)"
     r"--stride_a (?P<STRIDE_A>[\d ]+)"
     r"--stride_b (?P<STRIDE_B>[\d ]+)"
     r"--stride_c (?P<STRIDE_C>[\d ]+)"
     r"--stride_d (?P<STRIDE_D>[\d ]+)"
     r"--alpha (?P<ALPHA>[\d\. ]+)"
     r"--beta (?P<BETA>[\d\. ]+)"
     r"--transA (?P<TRANS_A>[\w ]+)"
     r"--transB (?P<TRANS_B>[\w ]+)"
     r"--batch_count (?P<BATCH_COUNT>[\d ]+)"
-    r"--a_type (?P<A_TYPE>[\w ]+)"
-    r"--b_type (?P<B_TYPE>[\w ]+)"
-    r"--c_type (?P<C_TYPE>[\w ]+)"
-    r"--d_type (?P<D_TYPE>[\w ]+)"
-    r"--scale_type (?P<SCALE_TYPE>[\w ]+)"
-    r"--bias_type (?P<BIAS_TYPE>[\w ]+)"
-    r"--compute_type (?P<COMPUTE_TYPE>[\w ]+)")
+)

+# Optional patterns for scale and bias
+SCALE_PATTERN = r"--scaleA (?P<SCALE_A>[\w ]+)--scaleB (?P<SCALE_B>[\w ]+)"
+BIAS_PATTERN = r"--bias_vector --bias_source (?P<BIAS_SOURCE>[\w ]+)"
+ACTIVATION_PATTERN = r"--activation_type (?P<ACTIVATION_TYPE>[\w ]+)"
+
+# Common ending pattern
+TYPE_PATTERN = (
+    r"--a_type (?P<A_TYPE>[\w ]+)"
+    r"--b_type (?P<B_TYPE>[\w ]+)"
+    r"--c_type (?P<C_TYPE>[\w ]+)"
+    r"--d_type (?P<D_TYPE>[\w ]+)"
+    r"--scale_type (?P<SCALE_TYPE>[\w ]+)"
+    r"--bias_type (?P<BIAS_TYPE>[\w ]+)"
+    r"--compute_type (?P<COMPUTE_TYPE>[\w ]+)"
+)
+
+# Build the combined pattern with optional parts
+def build_pattern(has_scale=False, has_bias=False, has_activation=False):
+    pattern = HIPBLASLT_BENCH_BASE
+    if has_scale:
+        pattern += SCALE_PATTERN
+    if has_bias:
+        pattern += BIAS_PATTERN
+    pattern += TYPE_PATTERN
+    if has_activation:
+        pattern += ACTIVATION_PATTERN
+    return pattern
+
+# Create the four variations
+HIPBLASLT_BENCH_RE = build_pattern()
+HIPBLASLT_BENCH_RE_SAB = build_pattern(has_scale=True)
+HIPBLASLT_BENCH_RE_BIAS = build_pattern(has_bias=True)
+HIPBLASLT_BENCH_RE_ACT = build_pattern(has_activation=True)
+HIPBLASLT_BENCH_RE_SAB_ACT = build_pattern(has_scale=True, has_activation=True)
+HIPBLASLT_BENCH_RE_BIAS_ACT = build_pattern(has_bias=True, has_activation=True)
+HIPBLASLT_BENCH_RE_SAB_BIAS = build_pattern(has_scale=True, has_bias=True)
+HIPBLASLT_BENCH_RE_SAB_BIAS_ACT = build_pattern(has_scale=True, has_bias=True, has_activation=True)
+
+
+# HIPBLASLT_BENCH_RE = (
+#     r"(?P<FUNCTION>\w+) --api_method c "
+#     r"-m (?P<M>[\d ]+)"
+#     r"-n (?P<N>[\d ]+)"
+#     r"-k (?P<K>[\d ]+)"
+#     r"--lda (?P<LDA>[\d ]+)"
+#     r"--ldb (?P<LDB>[\d ]+)"
+#     r"--ldc (?P<LDC>[\d ]+)"
+#     r"--ldd (?P<LDD>[\d ]+)"
+#     r"--stride_a (?P<STRIDE_A>[\d ]+)"
+#     r"--stride_b (?P<STRIDE_B>[\d ]+)"
+#     r"--stride_c (?P<STRIDE_C>[\d ]+)"
+#     r"--stride_d (?P<STRIDE_D>[\d ]+)"
+#     r"--alpha (?P<ALPHA>[\d\. ]+)"
+#     r"--beta (?P<BETA>[\d\. ]+)"
+#     r"--transA (?P<TRANS_A>[\w ]+)"
+#     r"--transB (?P<TRANS_B>[\w ]+)"
+#     r"--batch_count (?P<BATCH_COUNT>[\d ]+)"
+#     r"--a_type (?P<A_TYPE>[\w ]+)"
+#     r"--b_type (?P<B_TYPE>[\w ]+)"
+#     r"--c_type (?P<C_TYPE>[\w ]+)"
+#     r"--d_type (?P<D_TYPE>[\w ]+)"
+#     r"--scale_type (?P<SCALE_TYPE>[\w ]+)"
+#     r"--bias_type (?P<BIAS_TYPE>[\w ]+)"
+#     r"--compute_type (?P<COMPUTE_TYPE>[\w ]+)")
+
+# HIPBLASLT_BENCH_RE_SAB = (
+#     r"(?P<FUNCTION>\w+) --api_method c "
+#     r"-m (?P<M>[\d ]+)"
+#     r"-n (?P<N>[\d ]+)"
+#     r"-k (?P<K>[\d ]+)"
+#     r"--lda (?P<LDA>[\d ]+)"
+#     r"--ldb (?P<LDB>[\d ]+)"
+#     r"--ldc (?P<LDC>[\d ]+)"
+#     r"--ldd (?P<LDD>[\d ]+)"
+#     r"--stride_a (?P<STRIDE_A>[\d ]+)"
+#     r"--stride_b (?P<STRIDE_B>[\d ]+)"
+#     r"--stride_c (?P<STRIDE_C>[\d ]+)"
+#     r"--stride_d (?P<STRIDE_D>[\d ]+)"
+#     r"--alpha (?P<ALPHA>[\d\. ]+)"
+#     r"--beta (?P<BETA>[\d\. ]+)"
+#     r"--transA (?P<TRANS_A>[\w ]+)"
+#     r"--transB (?P<TRANS_B>[\w ]+)"
+#     r"--batch_count (?P<BATCH_COUNT>[\d ]+)"
+#     r"--scaleA (?P<SCALE_A>[\w ]+)"
+#     r"--scaleB (?P<SCALE_B>[\w ]+)"
+#     r"--a_type (?P<A_TYPE>[\w ]+)"
+#     r"--b_type (?P<B_TYPE>[\w ]+)"
+#     r"--c_type (?P<C_TYPE>[\w ]+)"
+#     r"--d_type (?P<D_TYPE>[\w ]+)"
+#     r"--scale_type (?P<SCALE_TYPE>[\w ]+)"
+#     r"--bias_type (?P<BIAS_TYPE>[\w ]+)"
+#     r"--compute_type (?P<COMPUTE_TYPE>[\w ]+)")
+
+# HIPBLASLT_BENCH_RE_BIAS = (
+#     r"(?P<FUNCTION>\w+) --api_method c "
+#     r"-m (?P<M>[\d ]+)"
+#     r"-n (?P<N>[\d ]+)"
+#     r"-k (?P<K>[\d ]+)"
+#     r"--lda (?P<LDA>[\d ]+)"
+#     r"--ldb (?P<LDB>[\d ]+)"
+#     r"--ldc (?P<LDC>[\d ]+)"
+#     r"--ldd (?P<LDD>[\d ]+)"
+#     r"--stride_a (?P<STRIDE_A>[\d ]+)"
+#     r"--stride_b (?P<STRIDE_B>[\d ]+)"
+#     r"--stride_c (?P<STRIDE_C>[\d ]+)"
+#     r"--stride_d (?P<STRIDE_D>[\d ]+)"
+#     r"--alpha (?P<ALPHA>[\d\. ]+)"
+#     r"--beta (?P<BETA>[\d\. ]+)"
+#     r"--transA (?P<TRANS_A>[\w ]+)"
+#     r"--transB (?P<TRANS_B>[\w ]+)"
+#     r"--batch_count (?P<BATCH_COUNT>[\d ]+)"
+#     r"--bias_vector --bias_source (?P<BIAS_SOURCE>[\w ]+)"
+#     r"--a_type (?P<A_TYPE>[\w ]+)"
+#     r"--b_type (?P<B_TYPE>[\w ]+)"
+#     r"--c_type (?P<C_TYPE>[\w ]+)"
+#     r"--d_type (?P<D_TYPE>[\w ]+)"
+#     r"--scale_type (?P<SCALE_TYPE>[\w ]+)"
+#     r"--bias_type (?P<BIAS_TYPE>[\w ]+)"
+#     r"--compute_type (?P<COMPUTE_TYPE>[\w ]+)")
+
+# HIPBLASLT_BENCH_RE_SAB_BIAS = (
+#     r"(?P<FUNCTION>\w+) --api_method c "
+#     r"-m (?P<M>[\d ]+)"
+#     r"-n (?P<N>[\d ]+)"
+#     r"-k (?P<K>[\d ]+)"
+#     r"--lda (?P<LDA>[\d ]+)"
+#     r"--ldb (?P<LDB>[\d ]+)"
+#     r"--ldc (?P<LDC>[\d ]+)"
+#     r"--ldd (?P<LDD>[\d ]+)"
+#     r"--stride_a (?P<STRIDE_A>[\d ]+)"
+#     r"--stride_b (?P<STRIDE_B>[\d ]+)"
+#     r"--stride_c (?P<STRIDE_C>[\d ]+)"
+#     r"--stride_d (?P<STRIDE_D>[\d ]+)"
+#     r"--alpha (?P<ALPHA>[\d\. ]+)"
+#     r"--beta (?P<BETA>[\d\. ]+)"
+#     r"--transA (?P<TRANS_A>[\w ]+)"
+#     r"--transB (?P<TRANS_B>[\w ]+)"
+#     r"--batch_count (?P<BATCH_COUNT>[\d ]+)"
+#     r"--scaleA (?P<SCALE_A>[\w ]+)"
+#     r"--scaleB (?P<SCALE_B>[\w ]+)"
+#     r"--bias_vector --bias_source (?P<BIAS_SOURCE>[\w ]+)"
+#     r"--a_type (?P<A_TYPE>[\w ]+)"
+#     r"--b_type (?P<B_TYPE>[\w ]+)"
+#     r"--c_type (?P<C_TYPE>[\w ]+)"
+#     r"--d_type (?P<D_TYPE>[\w ]+)"
+#     r"--scale_type (?P<SCALE_TYPE>[\w ]+)"
+#     r"--bias_type (?P<BIAS_TYPE>[\w ]+)"
+#     r"--compute_type (?P<COMPUTE_TYPE>[\w ]+)")

 # Function to extract problem sizes from a line
 def extract_problem_size(match):
     return [int(match.group('M').strip()), int(match.group('N').strip()), int(match.group('BATCH_COUNT').strip()), int(match.group('K').strip())]

     if gdict.get("BIAS_SOURCE"):
         res["UseBias"] = 1
         res["BiasSrc"] = gdict.get('BIAS_SOURCE', '').strip().upper()
         res["BiasDataTypeList"] = list(bias_datatype_map(gdict.get("BIAS_TYPE", '').strip()))
+    if gdict.get("ACTIVATION_TYPE"):
+        res["Activation"] = True
+        res["ActivationType"] = "hipblaslt_all"
     if gdict.get("SCALE_A") is not None and gdict.get("SCALE_B") is not None:
         res["UseScaleAB"] = "Scalar"
         res["UseScaleAlphaVec"] = 1
     return res

 def match_pattern(line):
-    if 'bias_vector' in line and 'scaleA' in line and 'scaleB' in line:
+    if 'activation_type' in line and 'bias_vector' in line and 'scaleA' in line and 'scaleB' in line:
+        match = re.search(
+            HIPBLASLT_BENCH_RE_SAB_BIAS_ACT, line
+        )
+    elif 'bias_vector' in line and 'scaleA' in line and 'scaleB' in line:
         match = re.search(
             HIPBLASLT_BENCH_RE_SAB_BIAS, line
         )
+    elif 'activation_type' in line and 'scaleA' in line and 'scaleB' in line:
+        match = re.search(
+            HIPBLASLT_BENCH_RE_SAB_ACT, line
+        )
+    elif 'bias_vector' in line and 'activation_type' in line:
+        match = re.search(
+            HIPBLASLT_BENCH_RE_BIAS_ACT, line
+        )
     elif 'bias_vector' in line:
         match = re.search(
             HIPBLASLT_BENCH_RE_BIAS, line
         )
]+)" -# r"--transA (?P[\w ]+)" -# r"--transB (?P[\w ]+)" -# r"--batch_count (?P[\d ]+)" -# r"--scaleA (?P[\w ]+)" -# r"--scaleB (?P[\w ]+)" -# r"--bias_vector --bias_source (?P[\w ]+)" -# r"--a_type (?P[\w ]+)" -# r"--b_type (?P[\w ]+)" -# r"--c_type (?P[\w ]+)" -# r"--d_type (?P[\w ]+)" -# r"--scale_type (?P[\w ]+)" -# r"--bias_type (?P[\w ]+)" -# r"--compute_type (?P[\w ]+)") - # Function to extract problem sizes from a line def extract_problem_size(match): return [int(match.group('M').strip()), int(match.group('N').strip()), int(match.group('BATCH_COUNT').strip()), int(match.group('K').strip())] @@ -387,6 +277,7 @@ def find_matmul_instruction(mfma_instruction, size): for bm in range(int(math.log(mfma_instruction[3],2))+1): for m_tiles in reversed(range(1, CU+1)): m_tile_size = size[0] // m_tiles + # TODO:fp8 384x384 if m_tile_size > 256: continue wave_tile_m = math.ceil(m_tile_size / mfma_instruction[0]) @@ -409,8 +300,7 @@ def find_matmul_instruction(mfma_instruction, size): if wave_tile_n // (2**l) >= 1 and wave_tile_n // (2**l) <= 32: matmul_instruction[-3] = wave_tile_n // (2**l) matmul_instruction[-1] = 2**l - - yield matmul_instruction + yield copy.deepcopy(matmul_instruction) def get_groups(matmul_instruction_gen): # Extract skinny MTs for Groups From e2c139c4e3c7f552c21528fb6a3343c006032376 Mon Sep 17 00:00:00 2001 From: fsx950223 Date: Thu, 21 Nov 2024 05:19:33 +0000 Subject: [PATCH 10/23] fix merge bug when inData has no soltion --- tensilelite/Tensile/Utilities/merge.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensilelite/Tensile/Utilities/merge.py b/tensilelite/Tensile/Utilities/merge.py index 767365a775..14cad76eae 100644 --- a/tensilelite/Tensile/Utilities/merge.py +++ b/tensilelite/Tensile/Utilities/merge.py @@ -309,6 +309,7 @@ def mergeLogic(oriData, incData, forceMerge, trimSize=True, addSolutionTags=Fals origNumSizes = len(oriData[7]) origNumSolutions = len(oriData[5]) + incData[7] = incData[7] or [] incNumSizes = len(incData[7]) incNumSolutions = len(incData[5]) @@ -320,7 +321,7 @@ def mergeLogic(oriData, incData, forceMerge, trimSize=True, addSolutionTags=Fals incTaggedSizes = addSolutionTagToKeys(incData[7], incData[5]) if addSolutionTags: oriData[7] = origTaggedSizes - incData[7] = incTaggedSizes + incData[7] = incTaggedSizes # Print warning if addSolutionTags=False results in removed sizes else: origSet = {tuple(size) for size, [_, _] in oriData[7]} From de87a414aa5e48869ed70f71e9722fbdd1763cd3 Mon Sep 17 00:00:00 2001 From: fsx950223 Date: Fri, 22 Nov 2024 09:38:25 +0000 Subject: [PATCH 11/23] limit wave size less or equal than 4 --- .../Utilities/tensile_generator/tensile_config_generator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py index e9feea2fd6..3cbda1fe95 100644 --- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py +++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py @@ -129,7 +129,7 @@ bf16_instructions = [[16,16,16,1],[32,32,8,1]] tf32_instructions = [[16,16,8,1]] fp32_instructions = [[16,16,4,1]] - fp8_instructions = [[32,32,16,1], [16,16,32,1]] + fp8_instructions = [[16,16,32,1]] HIPBLASLT_BENCH_BASE = ( r"(?P\w+) --api_method c " @@ -296,7 +296,7 @@ def find_matmul_instruction(mfma_instruction, size): matmul_instruction[-4] = wave_tile_m // (2**k) matmul_instruction[-2] = 2**k - for l in 
reversed(range(3)): + for l in reversed(range(3-k)): if wave_tile_n // (2**l) >= 1 and wave_tile_n // (2**l) <= 32: matmul_instruction[-3] = wave_tile_n // (2**l) matmul_instruction[-1] = 2**l From c4dac5de53b9d40133c246eee7bd8a80c1405f0e Mon Sep 17 00:00:00 2001 From: fsx950223 Date: Mon, 25 Nov 2024 06:28:08 +0000 Subject: [PATCH 12/23] update readme --- tensilelite/Tensile/Utilities/tensile_generator/README.md | 3 +++ .../Utilities/tensile_generator/tensile_config_generator.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tensilelite/Tensile/Utilities/tensile_generator/README.md b/tensilelite/Tensile/Utilities/tensile_generator/README.md index d4b4fdeb84..0dd4df9935 100644 --- a/tensilelite/Tensile/Utilities/tensile_generator/README.md +++ b/tensilelite/Tensile/Utilities/tensile_generator/README.md @@ -35,6 +35,9 @@ To use the `tensile_config_generator.py` script, follow these steps: | `--iters ITERS` | Max tuning iterations | | `--fast BOOL` | If enabled, only tune the matrix instruction with min tile sizes, else, tune full matrix instructions | | `--gridbase_config GRIDBASE_CONFIG` | Path to gridbase config file | + | `--full_mfma BOOL` | If enabled, will search for all mfma instructions | + | `--full_stage BOOL` | If enabled, will search for all stages instructions | + | `--num_stages STAGES` | How many times to divide matrix | Equality tuning example: ``` diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py index 3cbda1fe95..940491f6f8 100644 --- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py +++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py @@ -74,7 +74,7 @@ parser.add_argument( "--full_stage", type=bool, default=False, - help="If enabled, will search for all mi instructions") + help="If enabled, will search for all stages instructions") parser.add_argument( "--num_stages", type=int, default=8, From 7702661f62bcc3292cc06f4270e0d66874948b93 Mon Sep 17 00:00:00 2001 From: fsx950223 Date: Mon, 25 Nov 2024 11:51:47 +0000 Subject: [PATCH 13/23] fix full stage --- .../Utilities/tensile_generator/tensile_config_generator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py index 940491f6f8..e53e1cd12c 100644 --- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py +++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py @@ -536,9 +536,9 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it matmul_instruction_found = True if not args.full_stage: break - else: - max_dim = int(np.argmax(size[:2])) - size[max_dim] = size[max_dim] // 2 + + max_dim = int(np.argmax(size[:2])) + size[max_dim] = size[max_dim] // 2 if not matmul_instruction_found: print(f"Can't find mfma instructions for {original_size}, please contact hipblaslt expert") From aae674040da0687cbaa3e64ca0190216c9608837 Mon Sep 17 00:00:00 2001 From: fsx950223 Date: Tue, 26 Nov 2024 08:41:50 +0000 Subject: [PATCH 14/23] update pattern to match latest hipblaslt log --- .../tensile_config_generator.py | 50 ++++++------------- 1 file changed, 14 insertions(+), 36 deletions(-) diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py 
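Editor's note on why `range(3-k)` enforces the commit subject: the two loops pick wave counts `2**k` and `2**l` along M and N, and capping `l` below `3-k` keeps the product at four waves or fewer. A quick enumeration of the pairs the patched loops can now produce:

```python
# k and l as in find_matmul_instruction(); order of iteration omitted for brevity
pairs = [(2**k, 2**l) for k in range(3) for l in range(3 - k)]
print(pairs)                          # [(1, 1), (1, 2), (1, 4), (2, 1), (2, 2), (4, 1)]
print(max(a * b for a, b in pairs))   # 4 -> wave size <= 4, as the subject says
```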
From c4dac5de53b9d40133c246eee7bd8a80c1405f0e Mon Sep 17 00:00:00 2001
From: fsx950223
Date: Mon, 25 Nov 2024 06:28:08 +0000
Subject: [PATCH 12/23] update readme

---
 tensilelite/Tensile/Utilities/tensile_generator/README.md   | 3 +++
 .../Utilities/tensile_generator/tensile_config_generator.py | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/README.md b/tensilelite/Tensile/Utilities/tensile_generator/README.md
index d4b4fdeb84..0dd4df9935 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/README.md
+++ b/tensilelite/Tensile/Utilities/tensile_generator/README.md
@@ -35,6 +35,9 @@
    | `--iters ITERS` | Max tuning iterations |
    | `--fast BOOL` | If enabled, only tune the matrix instruction with min tile sizes, else, tune full matrix instructions |
    | `--gridbase_config GRIDBASE_CONFIG` | Path to gridbase config file |
+   | `--full_mfma BOOL` | If enabled, will search for all mfma instructions |
+   | `--full_stage BOOL` | If enabled, will search for instructions across all stages |
+   | `--num_stages STAGES` | How many times to divide the matrix |

    Equality tuning example:
    ```

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index 3cbda1fe95..940491f6f8 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -74,7 +74,7 @@
 parser.add_argument(
     "--full_stage", type=bool, default=False,
-    help="If enabled, will search for all mi instructions")
+    help="If enabled, will search for instructions across all stages")

 parser.add_argument(
     "--num_stages", type=int, default=8,
     help="How many times to divide the matrix")

From 7702661f62bcc3292cc06f4270e0d66874948b93 Mon Sep 17 00:00:00 2001
From: fsx950223
Date: Mon, 25 Nov 2024 11:51:47 +0000
Subject: [PATCH 13/23] fix full stage

---
 .../Utilities/tensile_generator/tensile_config_generator.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index 940491f6f8..e53e1cd12c 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -536,9 +536,9 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
                     if len(matmul_instruction_gen) > 0 or len(mi_groups0) > 0 or len(mi_groups1) > 0:
                         matmul_instruction_found = True
                         if not args.full_stage:
                             break
-                    else:
-                        max_dim = int(np.argmax(size[:2]))
-                        size[max_dim] = size[max_dim] // 2
+
+                    max_dim = int(np.argmax(size[:2]))
+                    size[max_dim] = size[max_dim] // 2

             if not matmul_instruction_found:
                 print(f"Can't find mfma instructions for {original_size}, please contact hipblaslt expert")

From aae674040da0687cbaa3e64ca0190216c9608837 Mon Sep 17 00:00:00 2001
From: fsx950223
Date: Tue, 26 Nov 2024 08:41:50 +0000
Subject: [PATCH 14/23] update pattern to match latest hipblaslt log

---
 .../tensile_config_generator.py | 50 ++++++------
 1 file changed, 14 insertions(+), 36 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index e53e1cd12c..4ee42c6e29 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -131,6 +131,7 @@
     fp32_instructions = [[16,16,4,1]]
     fp8_instructions = [[16,16,32,1]]

+
 HIPBLASLT_BENCH_BASE = (
     r"(?P<FUNCTION>\w+) --api_method c "

     r"--transA (?P<TRANS_A>[\w ]+)"
     r"--transB (?P<TRANS_B>[\w ]+)"
     r"--batch_count (?P<BATCH_COUNT>[\d ]+)"
+    r"--scaleA (?P<SCALE_A>[\d ]+)"
+    r"--scaleB (?P<SCALE_B>[\d ]+)"
 )

 # Optional patterns for scale and bias
-SCALE_PATTERN = r"--scaleA (?P<SCALE_A>[\w ]+)--scaleB (?P<SCALE_B>[\w ]+)"
 BIAS_PATTERN = r"--bias_vector --bias_source (?P<BIAS_SOURCE>[\w ]+)"
-ACTIVATION_PATTERN = r"--activation_type (?P<ACTIVATION_TYPE>[\w ]+)"

 # Common ending pattern
 TYPE_PATTERN = (
     r"--scale_type (?P<SCALE_TYPE>[\w ]+)"
     r"--bias_type (?P<BIAS_TYPE>[\w ]+)"
     r"--compute_type (?P<COMPUTE_TYPE>[\w ]+)"
+    r"--algo_method (?P<ALGO_METHOD>[\w ]+)"
+    r"--solution_index (?P<SOLUTION_INDEX>[\d ]+)"
+    r"--activation_type (?P<ACTIVATION_TYPE>[\w ]+)"
 )

 # Build the combined pattern with optional parts
-def build_pattern(has_scale=False, has_bias=False, has_activation=False):
+def build_pattern(has_bias=False):
     pattern = HIPBLASLT_BENCH_BASE
-    if has_scale:
-        pattern += SCALE_PATTERN
     if has_bias:
         pattern += BIAS_PATTERN
     pattern += TYPE_PATTERN
-    if has_activation:
-        pattern += ACTIVATION_PATTERN
     return pattern

 # Create the four variations
 HIPBLASLT_BENCH_RE = build_pattern()
-HIPBLASLT_BENCH_RE_SAB = build_pattern(has_scale=True)
 HIPBLASLT_BENCH_RE_BIAS = build_pattern(has_bias=True)
-HIPBLASLT_BENCH_RE_ACT = build_pattern(has_activation=True)
-HIPBLASLT_BENCH_RE_SAB_ACT = build_pattern(has_scale=True, has_activation=True)
-HIPBLASLT_BENCH_RE_BIAS_ACT = build_pattern(has_bias=True, has_activation=True)
-HIPBLASLT_BENCH_RE_SAB_BIAS = build_pattern(has_scale=True, has_bias=True)
-HIPBLASLT_BENCH_RE_SAB_BIAS_ACT = build_pattern(has_scale=True, has_bias=True, has_activation=True)

     ComputeDataType = datatype_map(gdict.get('COMPUTE_TYPE', '').strip())
     TransposeA = trans_map(gdict.get('TRANS_A', '').strip())
     TransposeB = trans_map(gdict.get('TRANS_B', '').strip())
+    scaleA = gdict.get("SCALE_A").strip()
+    scaleB = gdict.get("SCALE_B").strip()
     if DataType in ["H", "B", "F8"]:
         HighPrecisionAccumulate = True
     else:

     if gdict.get("ACTIVATION_TYPE"):
         res["Activation"] = True
         res["ActivationType"] = "hipblaslt_all"
-    if gdict.get("SCALE_A") is not None and gdict.get("SCALE_B") is not None:
+    if scaleA == "1" and scaleB == "1":
         res["UseScaleAB"] = "Scalar"
         res["UseScaleAlphaVec"] = 1
+    elif scaleA == "2" and scaleB == "2":
+        res["UseScaleAB"] = "Vector"
+        res["UseScaleAlphaVec"] = 1
     return res

 def match_pattern(line):
-    if 'activation_type' in line and 'bias_vector' in line and 'scaleA' in line and 'scaleB' in line:
-        match = re.search(
-            HIPBLASLT_BENCH_RE_SAB_BIAS_ACT, line
-        )
-    elif 'bias_vector' in line and 'scaleA' in line and 'scaleB' in line:
-        match = re.search(
-            HIPBLASLT_BENCH_RE_SAB_BIAS, line
-        )
-    elif 'activation_type' in line and 'scaleA' in line and 'scaleB' in line:
-        match = re.search(
-            HIPBLASLT_BENCH_RE_SAB_ACT, line
-        )
-    elif 'bias_vector' in line and 'activation_type' in line:
-        match = re.search(
-            HIPBLASLT_BENCH_RE_BIAS_ACT, line
-        )
-    elif 'bias_vector' in line:
+    if 'bias_vector' in line:
         match = re.search(
             HIPBLASLT_BENCH_RE_BIAS, line
         )
-    elif 'scaleA' in line and 'scaleB' in line:
-        match = re.search(
-            HIPBLASLT_BENCH_RE_SAB, line
-        )
     else:
         match = re.search(
             HIPBLASLT_BENCH_RE, line
         )
     return match

From 65e85de166d2728496e3e2ddaf5e681a0b51741f Mon Sep 17 00:00:00 2001
From: fsx950223
Date: Tue, 26 Nov 2024 08:52:53 +0000
Subject: [PATCH 15/23] fix activation

---
 .../tensile_generator/tensile_config_generator.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index 4ee42c6e29..db4d6eb47f 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -246,6 +246,8 @@ def extract_dtype(match):
     scaleA = gdict.get("SCALE_A").strip()
     scaleB = gdict.get("SCALE_B").strip()
+    activation_type = gdict.get("ACTIVATION_TYPE").strip()
+    bias_source = gdict.get('BIAS_SOURCE', '').strip().upper()
     if DataType in ["H", "B", "F8"]:
         HighPrecisionAccumulate = True
     else:
@@ -256,11 +258,11 @@ def extract_dtype(match):
     res = {"Batched": True, "DataType": DataType, "DestDataType": DestDataType, "ComputeDataType": ComputeDataType, "TransposeA": TransposeA, "TransposeB": TransposeB, "HighPrecisionAccumulate": HighPrecisionAccumulate, "F32XdlMathOp": F32XdlMathOp, "OperationType": "GEMM", "UseBeta": True}

-    if gdict.get("BIAS_SOURCE"):
+    if bias_source:
         res["UseBias"] = 1
-        res["BiasSrc"] = gdict.get('BIAS_SOURCE', '').strip().upper()
+        res["BiasSrc"] = bias_source
         res["BiasDataTypeList"] = list(bias_datatype_map(gdict.get("BIAS_TYPE", '').strip()))
-    if gdict.get("ACTIVATION_TYPE"):
+    if activation_type != "none":
         res["Activation"] = True
         res["ActivationType"] = "hipblaslt_all"
     if scaleA == "1" and scaleB == "1":
         res["UseScaleAB"] = "Scalar"
         res["UseScaleAlphaVec"] = 1
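Editor's note: after these two patches, hipblaslt-bench logs numeric scale modes, and per the diff 1/1 selects scalar scaling while 2/2 selects vector scaling; other combinations leave scaling unset. A minimal sketch of that mapping in isolation (the original also chained a stray `is not None`, removed above):

```python
def scale_flags(scale_a, scale_b):
    res = {}
    if scale_a == "1" and scale_b == "1":
        res["UseScaleAB"] = "Scalar"
        res["UseScaleAlphaVec"] = 1
    elif scale_a == "2" and scale_b == "2":
        res["UseScaleAB"] = "Vector"
        res["UseScaleAlphaVec"] = 1
    return res

print(scale_flags("1", "1"))  # {'UseScaleAB': 'Scalar', 'UseScaleAlphaVec': 1}
print(scale_flags("0", "0"))  # {} -> scaling left off
```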
From 42caf9e21fd6c76d6c13013492fa81377c32e3da Mon Sep 17 00:00:00 2001
From: fsx950223
Date: Wed, 27 Nov 2024 03:02:49 +0000
Subject: [PATCH 16/23] update log example

---
 .../hipblaslt_gemm_log_example.txt | 34 ++++++++-------
 1 file changed, 14 insertions(+), 20 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/hipblaslt_gemm_log_example.txt b/tensilelite/Tensile/Utilities/tensile_generator/hipblaslt_gemm_log_example.txt
index 23913ee7ad..e19f08f82a 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/hipblaslt_gemm_log_example.txt
+++ b/tensilelite/Tensile/Utilities/tensile_generator/hipblaslt_gemm_log_example.txt
@@ -1,20 +1,14 @@
-hipblaslt-bench --api_method c -m 1 -n 200 -k 24 --lda 24 --ldb 24 --ldc 1 --ldd 1 --stride_a 24 --stride_b 4800 --stride_c 200 --stride_d 200 --alpha 1.000000 --beta 0.000000 --transA T --transB N --batch_count 800 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 1 -n 24 -k 200 --lda 1 --ldb 24 --ldc 1 --ldd 1 --stride_a 200 --stride_b 4800 --stride_c 24 --stride_d 24 --alpha 1.000000 --beta 0.000000 --transA N --transB T --batch_count 800 --a_type f16_r --b_type f16_r --c_type f16_r --d_type f16_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 1024 -n 200 -k 5244 --lda 1024 --ldb 5244 --ldc 1024 --ldd 1024 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 1.000000 --transA N --transB N --batch_count 1 --a_type f16_r --b_type f16_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 128 -n 150 -k 128 --lda 128 --ldb 128 --ldc 128 --ldd 128 --stride_a 16384 --stride_b 19200 --stride_c 19200 --stride_d 19200 --alpha 1.000000 --beta 0.000000 --transA N --transB N --batch_count 2 --a_type f16_r --b_type f16_r --c_type f16_r --d_type f16_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 128 -n 200 -k 256 --lda 128 --ldb 256 --ldc 128 --ldd 128 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 1.000000 --transA N --transB N --batch_count 1 --a_type f16_r --b_type f16_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 150 -n 200 -k 32 --lda 32 --ldb 32 --ldc 150 --ldd 150 --stride_a 4800 --stride_b 6400 --stride_c 30000 --stride_d 30000 --alpha 1.000000 --beta 0.000000 --transA T --transB N --batch_count 8 --a_type f16_r --b_type f16_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 2 -n 200 -k 64 --lda 2 --ldb 64 --ldc 2 --ldd 2 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 1.000000 --transA N --transB N --batch_count 1 --a_type f16_r --b_type f16_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 200 -n 200 -k 24 --lda 24 --ldb 24 --ldc 200 --ldd 200 --stride_a 4800 --stride_b 4800 --stride_c 40000 --stride_d 40000 --alpha 1.000000 --beta 0.000000 --transA T --transB N --batch_count 4 --a_type f16_r --b_type f16_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 24 -n 200 -k 200 --lda 24 --ldb 200 --ldc 24 --ldd 24 --stride_a 4800 --stride_b 40000 --stride_c 4800 --stride_d 4800 --alpha 1.000000 --beta 0.000000 --transA N --transB N --batch_count 4 --a_type f16_r --b_type f16_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 24 -n 200 -k 50 --lda 24 --ldb 50 --ldc 24 --ldd 24 --stride_a 1200 --stride_b 10000 --stride_c 4800 --stride_d 4800 --alpha 1.000000 --beta 0.000000 --transA N --transB N --batch_count 8 --a_type f16_r --b_type f16_r --c_type f16_r --d_type f16_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 256 -n 200 -k 512 --lda 256 --ldb 512 --ldc 256 --ldd 256 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 1.000000 --transA N --transB N --batch_count 1 --a_type f16_r --b_type f16_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 256 -n 200 -k 96 --lda 256 --ldb 96 --ldc 256 --ldd 256 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 0.000000 --transA N --transB N --batch_count 1 --a_type f16_r --b_type f16_r --c_type f16_r --d_type f16_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 32 -n 200 -k 150 --lda 32 --ldb 150 --ldc 32 --ldd 32 --stride_a 4800 --stride_b 30000 --stride_c 6400 --stride_d 6400 --alpha 1.000000 --beta 0.000000 --transA N --transB N --batch_count 8 --a_type f16_r --b_type f16_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 384 -n 200 -k 96 --lda 384 --ldb 96 --ldc 384 --ldd 384 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 0.000000 --transA N --transB N --batch_count 1 --a_type f16_r --b_type f16_r --c_type f16_r --d_type f16_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 50 -n 200 -k 24 --lda 24 --ldb 24 --ldc 50 --ldd 50 --stride_a 1200 --stride_b 4800 --stride_c 10000 --stride_d 10000 --alpha 1.000000 --beta 0.000000 --transA T --transB N --batch_count 8 --a_type f16_r --b_type f16_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 512 -n 200 -k 1024 --lda 512 --ldb 1024 --ldc 512 --ldd 512 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 1.000000 --transA N --transB N --batch_count 1 --a_type f16_r --b_type f16_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 64 -n 200 -k 128 --lda 64 --ldb 128 --ldc 64 --ldd 64 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 1.000000 --transA N --transB N --batch_count 1 --a_type f16_r --b_type f16_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 96 -n 200 -k 96 --lda 96 --ldb 96 --ldc 96 --ldd 96 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 0.000000 --transA N --transB N --batch_count 1 --a_type f16_r --b_type f16_r --c_type f16_r --d_type f16_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 96 -n 50 -k 96 --lda 96 --ldb 96 --ldc 96 --ldd 96 --stride_a 9216 --stride_b 4800 --stride_c 4800 --stride_d 4800 --alpha 1.000000 --beta 0.000000 --transA N --transB N --batch_count 2 --a_type f16_r --b_type f16_r --c_type f16_r --d_type f16_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
-hipblaslt-bench --api_method c -m 96 -n 6000 -k 96 --lda 96 --ldb 96 --ldc 96 --ldd 96 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 0.000000 --transA N --transB N --batch_count 1 --a_type f16_r --b_type f16_r --c_type f16_r --d_type f16_r --scale_type f32_r --bias_type f32_r --compute_type f32_r
+hipblaslt-bench --api_method c -m 10 -n 1000 -k 128 --lda 128 --ldb 128 --ldc 10 --ldd 10 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 1.000000 --transA T --transB N --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 156482 --activation_type none
+hipblaslt-bench --api_method c -m 10 -n 32 -k 128 --lda 128 --ldb 128 --ldc 10 --ldd 10 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 1.000000 --transA T --transB N --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 156117 --activation_type none
+hipblaslt-bench --api_method c -m 10 -n 64 -k 128 --lda 128 --ldb 128 --ldc 10 --ldd 10 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 1.000000 --transA T --transB N --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 156482 --activation_type none
+hipblaslt-bench --api_method c -m 128 -n 10 -k 32 --lda 128 --ldb 10 --ldc 128 --ldd 128 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 0.000000 --transA N --transB T --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 13713 --activation_type none
+hipblaslt-bench --api_method c -m 128 -n 10 -k 64 --lda 128 --ldb 10 --ldc 128 --ldd 128 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 0.000000 --transA N --transB T --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 13713 --activation_type none
+hipblaslt-bench --api_method c -m 128 -n 1000 -k 9216 --lda 9216 --ldb 9216 --ldc 128 --ldd 128 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 1.000000 --transA T --transB N --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 156486 --activation_type none
+hipblaslt-bench --api_method c -m 128 -n 32 -k 10 --lda 128 --ldb 10 --ldc 128 --ldd 128 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 0.000000 --transA N --transB N --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 19256 --activation_type none
+hipblaslt-bench --api_method c -m 128 -n 32 -k 9216 --lda 9216 --ldb 9216 --ldc 128 --ldd 128 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 1.000000 --transA T --transB N --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 156426 --activation_type none
+hipblaslt-bench --api_method c -m 128 -n 64 -k 10 --lda 128 --ldb 10 --ldc 128 --ldd 128 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 0.000000 --transA N --transB N --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 19234 --activation_type none
+hipblaslt-bench --api_method c -m 128 -n 64 -k 9216 --lda 9216 --ldb 9216 --ldc 128 --ldd 128 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 1.000000 --transA T --transB N --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 156490 --activation_type none
+hipblaslt-bench --api_method c -m 9216 -n 128 -k 32 --lda 9216 --ldb 128 --ldc 9216 --ldd 9216 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 0.000000 --transA N --transB T --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 13717 --activation_type none
+hipblaslt-bench --api_method c -m 9216 -n 128 -k 64 --lda 9216 --ldb 128 --ldc 9216 --ldd 9216 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 0.000000 --transA N --transB T --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 13717 --activation_type none
+hipblaslt-bench --api_method c -m 9216 -n 32 -k 128 --lda 9216 --ldb 128 --ldc 9216 --ldd 9216 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 0.000000 --transA N --transB N --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 19272 --activation_type none
+hipblaslt-bench --api_method c -m 9216 -n 64 -k 128 --lda 9216 --ldb 128 --ldc 9216 --ldd 9216 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1.000000 --beta 0.000000 --transA N --transB N --batch_count 1 --scaleA 0 --scaleB 0 --a_type f32_r --b_type f32_r --c_type f32_r --d_type f32_r --scale_type f32_r --bias_type f32_r --compute_type f32_r --algo_method index --solution_index 19260 --activation_type none

From 601db370fc1e8feb2345375782ca538775f438c8 Mon Sep 17 00:00:00 2001
From: fsx950223 
Date: Wed, 27 Nov 2024 06:25:25 +0000
Subject: [PATCH 17/23] apply full stage in gridbase tuning

---
 .../tensile_generator/tensile_config_generator.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index db4d6eb47f..388d1a7073 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -597,12 +597,15 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 matmul_instructions[dtype_str][str(matmul_instruction)] = matmul_instruction
 if args.fast and (index > total_inst):
 break
+
 if len(matmul_instruction_gen) > 0:
 matmul_instruction_found = True
- break
- else:
- max_dim = int(np.argmax(size))
- size[max_dim] = size[max_dim] // 2
+ if not args.full_stage:
+ break
+
+ max_dim = int(np.argmax(size))
+ size[max_dim] = size[max_dim] // 2
+
 if not matmul_instruction_found:
 print(f"Can't find mfma instructions for {original_size}, please contact hipblaslt expert")
 else:

From b16de48c1d3b28e3a549110338cbc45410c13e3d Mon Sep 17 00:00:00 2001
From: fsx950223 
Date: Thu, 28 Nov 2024 02:48:03 +0000
Subject: [PATCH 18/23] fix duplicate problem sizes introduced by solution index

---
 .../tensile_config_generator.py | 137 +++++++++---------
 1 file changed, 70 insertions(+), 67 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index 388d1a7073..bdece6bee5 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -225,15 +225,15 @@ def trans_map(trans):
 
 def bias_datatype_map(dtype):
 if dtype == "f16_r":
- return [datatype_map('f16_r')]
+ return [datatype_map('f32_r'), datatype_map('f16_r')]
 elif dtype == "f32_r":
 return [datatype_map('f32_r')]
 elif dtype == "xf32_r":
 return [datatype_map('xf32_r')]
 elif dtype 
== "bf16_r": - return [datatype_map('bf16_r')] + return [datatype_map('f32_r'), datatype_map('bf16_r')] elif dtype == "f8_r": - return [datatype_map('f8_r')] + return [datatype_map('f32_r'), datatype_map('f8_r')] else: return [] @@ -296,7 +296,7 @@ def find_matmul_instruction(mfma_instruction, size): matmul_instruction[-4] = wave_tile_m // (2**k) matmul_instruction[-2] = 2**k - for l in reversed(range(3-k)): + for l in reversed(range(3)): if wave_tile_n // (2**l) >= 1 and wave_tile_n // (2**l) <= 32: matmul_instruction[-3] = wave_tile_n // (2**l) matmul_instruction[-1] = 2**l @@ -313,7 +313,7 @@ def get_groups(matmul_instruction_gen): ratio = mt[0] / mt[1] if ratio > NONTEMPORALRATIO: mi_groups0.append(mi) - elif ratio < (1/NONTEMPORALRATIO): + elif ratio < (1 / NONTEMPORALRATIO): mi_groups1.append(mi) else: mi_left.append(mi) @@ -442,10 +442,14 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it for line in f: match = match_pattern(line) if match: - if line in unique_gemms: - unique_gemms[line] += 1 + size = extract_problem_size(match) + dtype = extract_dtype(match) + size_str = json.dumps(size) + dtype_str = json.dumps(dtype) + if (size_str, dtype_str) in unique_gemms: + unique_gemms[(size_str, dtype_str)] += 1 else: - unique_gemms[line] = 1 + unique_gemms[(size_str, dtype_str)] = 1 unique_gemms = {k: v for k, v in sorted(unique_gemms.items(), key=lambda item: item[1], reverse=True)[:args.topk]} for k, v in unique_gemms.items(): @@ -466,71 +470,70 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it k_sum = 0 for k, v in unique_gemms_subgroup: - match = match_pattern(k) - if match: - size = extract_problem_size(match) - original_size = copy.deepcopy(size) - dtype = extract_dtype(match) - mfma_instructions = instruction_map(dtype) - dtype_str = json.dumps(dtype) - if mfma_instructions is None: - continue + size_str, dtype_str = k + size = json.loads(size_str) + dtype = json.loads(dtype_str) + original_size = copy.deepcopy(size) + mfma_instructions = instruction_map(dtype) - matmul_instruction_found = False - for mfma_instruction in mfma_instructions: - for _ in range(NUM_STAGES): - matmul_instruction_gen = list(find_matmul_instruction(mfma_instruction, size)) - if args.groups: - mi_groups0, mi_groups1, matmul_instruction_gen = get_groups(matmul_instruction_gen) - else: - mi_groups0 = [] - mi_groups1 = [] - - total_inst = min(len(matmul_instruction_gen) // DIV_MI, MIN_MI) # At least 5 insts and max of 33.3% of insts. 
- for index, matmul_instruction in enumerate(matmul_instruction_gen): - if matmul_instruction is not None: - if dtype_str not in matmul_instructions: - matmul_instructions[dtype_str] = dict() - matmul_instructions[dtype_str][str(matmul_instruction)] = matmul_instruction - if args.fast and (index > total_inst): - break - total_inst = min(len(mi_groups0) // DIV_MI, MIN_MI) - for index, mi_0 in enumerate(mi_groups0): - if dtype_str not in groups: - groups[dtype_str] = [{},{}] - groups[dtype_str][0]["MatrixInstruction"] = {} - groups[dtype_str][1]["MatrixInstruction"] = {} - groups[dtype_str][0]["MatrixInstruction"][str(mi_0)] = mi_0 - if args.fast and (index > total_inst): - break - total_inst = min(len(mi_groups1) // DIV_MI, MIN_MI) - for index, mi_1 in enumerate(mi_groups1): - if dtype_str not in groups: - groups[dtype_str] = [{},{}] - groups[dtype_str][0]["MatrixInstruction"] = {} - groups[dtype_str][1]["MatrixInstruction"] = {} - groups[dtype_str][1]["MatrixInstruction"][str(mi_1)] = mi_1 + if mfma_instructions is None: + continue + + matmul_instruction_found = False + for mfma_instruction in mfma_instructions: + for _ in range(NUM_STAGES): + matmul_instruction_gen = list(find_matmul_instruction(mfma_instruction, size)) + if args.groups: + mi_groups0, mi_groups1, matmul_instruction_gen = get_groups(matmul_instruction_gen) + else: + mi_groups0 = [] + mi_groups1 = [] + + total_inst = min(len(matmul_instruction_gen) // DIV_MI, MIN_MI) # At least 5 insts and max of 33.3% of insts. + for index, matmul_instruction in enumerate(matmul_instruction_gen): + if matmul_instruction is not None: + if dtype_str not in matmul_instructions: + matmul_instructions[dtype_str] = dict() + matmul_instructions[dtype_str][str(matmul_instruction)] = matmul_instruction if args.fast and (index > total_inst): break - if len(matmul_instruction_gen) > 0 or len(mi_groups0) > 0 or len(mi_groups1) > 0: - matmul_instruction_found = True - if not args.full_stage: - break + total_inst = min(len(mi_groups0) // DIV_MI, MIN_MI) + for index, mi_0 in enumerate(mi_groups0): + if dtype_str not in groups: + groups[dtype_str] = [{},{}] + groups[dtype_str][0]["MatrixInstruction"] = {} + groups[dtype_str][1]["MatrixInstruction"] = {} + groups[dtype_str][0]["MatrixInstruction"][str(mi_0)] = mi_0 + if args.fast and (index > total_inst): + break + total_inst = min(len(mi_groups1) // DIV_MI, MIN_MI) + for index, mi_1 in enumerate(mi_groups1): + if dtype_str not in groups: + groups[dtype_str] = [{},{}] + groups[dtype_str][0]["MatrixInstruction"] = {} + groups[dtype_str][1]["MatrixInstruction"] = {} + groups[dtype_str][1]["MatrixInstruction"][str(mi_1)] = mi_1 + if args.fast and (index > total_inst): + break + if len(matmul_instruction_gen) > 0 or len(mi_groups0) > 0 or len(mi_groups1) > 0: + matmul_instruction_found = True + if not args.full_stage: + break - max_dim = int(np.argmax(size[:2])) - size[max_dim] = size[max_dim] // 2 + max_dim = int(np.argmax(size[:2])) + size[max_dim] = size[max_dim] // 2 - if not matmul_instruction_found: - print(f"Can't find mfma instructions for {original_size}, please contact hipblaslt expert") + if not matmul_instruction_found: + print(f"Can't find mfma instructions for {original_size}, please contact hipblaslt expert") + else: + if dtype_str in gemm_group: + gemm_group[dtype_str].append({'Exact': list(original_size)}) else: - if dtype_str in gemm_group: - gemm_group[dtype_str].append({'Exact': list(original_size)}) - else: - gemm_group[dtype_str] = [{'Exact': list(original_size)}] - m_sum += 
original_size[0]
- n_sum += original_size[1]
- batch_sum += original_size[2]
- k_sum += original_size[3]
+ gemm_group[dtype_str] = [{'Exact': list(original_size)}]
+ m_sum += original_size[0]
+ n_sum += original_size[1]
+ batch_sum += original_size[2]
+ k_sum += original_size[3]
 
 dump_yaml(gpu_idx, gemm_group, args.tensile_config, m_sum, n_sum, batch_sum, k_sum, args.iters, groups)

From c339f2d2947d33e6cbe0bb2158d7a8a8237b2b7c Mon Sep 17 00:00:00 2001
From: fsx950223 
Date: Thu, 28 Nov 2024 07:52:44 +0000
Subject: [PATCH 19/23] skip unsupported gemm configs

---
 .../tensile_generator/tensile_config_generator.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index bdece6bee5..9b5022f26c 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -244,6 +244,8 @@ def extract_dtype(match):
 ComputeDataType = datatype_map(gdict.get('COMPUTE_TYPE', '').strip())
 TransposeA = trans_map(gdict.get('TRANS_A', '').strip())
 TransposeB = trans_map(gdict.get('TRANS_B', '').strip())
+ if None in [DataType, DestDataType, ComputeDataType, TransposeA, TransposeB]:
+ return None
 scaleA = gdict.get("SCALE_A").strip()
 scaleB = gdict.get("SCALE_B").strip()
 activation_type = gdict.get("ACTIVATION_TYPE").strip()
@@ -265,10 +267,10 @@ def extract_dtype(match):
 if activation_type != "none":
 res["Activation"] = True
 res["ActivationType"] = "hipblaslt_all"
- if scaleA == "1" and scaleB == "1" is not None:
+ if scaleA == "1" and scaleB == "1":
 res["UseScaleAB"] = "Scalar"
 res["UseScaleAlphaVec"] = 1
- elif scaleA == "2" and scaleB == "2" is not None:
+ elif scaleA == "2" and scaleB == "2":
 res["UseScaleAB"] = "Vector"
 res["UseScaleAlphaVec"] = 1
 return res
@@ -444,6 +446,9 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 if match:
 size = extract_problem_size(match)
 dtype = extract_dtype(match)
+ if dtype is None:
+ print(f"Can't find dtype for {line}, please contact hipblaslt expert")
+ continue
 size_str = json.dumps(size)
 dtype_str = json.dumps(dtype)
 if (size_str, dtype_str) in unique_gemms:

From 34bad15fe701b3cf7c9126aa87a47534a3eb713b Mon Sep 17 00:00:00 2001
From: fsx950223 
Date: Mon, 2 Dec 2024 05:12:32 +0000
Subject: [PATCH 20/23] calculate gsu

---
 .../tensile_config_generator.py | 86 +++++++++++--------
 1 file changed, 49 insertions(+), 37 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index 9b5022f26c..225e8ae128 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -106,15 +106,12 @@
 res = subprocess.run(["cat", "/sys/class/drm/card1/device/current_compute_partition"], stdout=subprocess.PIPE)
 if res.stdout.decode("utf-8").strip() == "CPX":
 XCC = 1
- GSU = [1,2,3,4,5,6,7,8]
 else:
 XCC = 4
- GSU = [1,2,3,4]
 DeviceNames = ["Device 0049", "Device 0050"]
 ScheduleName = "aquavanjaram"
 elif ArchitectureName == 'gfx90a':
 XCC = 1
- GSU = [1,2,3,4]
 DeviceNames = ["Device 0050", "Device 0051", "Device 0052", "Device 0054", "Device 0062", "Device 7400", "Device 740c"]
 ScheduleName = "aldebaran"
 
@@ -237,6 +234,18 @@ def bias_datatype_map(dtype):
 else: 
return [] +def get_high_precision_accumulate(DataType): + if DataType in ["H", "B", "F8"]: + return True + else: + return False + +def adapt_xf32(ComputeDataType): + if ComputeDataType == "XS": + return 'S', 'x' + else: + return ComputeDataType, 0 + def extract_dtype(match): gdict = match.groupdict() DataType = datatype_map(gdict.get('A_TYPE', '').strip()) @@ -250,14 +259,8 @@ def extract_dtype(match): scaleB = gdict.get("SCALE_B").strip() activation_type = gdict.get("ACTIVATION_TYPE").strip() bias_source = gdict.get('BIAS_SOURCE', '').strip().upper() - if DataType in ["H", "B", "F8"]: - HighPrecisionAccumulate = True - else: - HighPrecisionAccumulate = False - F32XdlMathOp = 0 - if ComputeDataType == "XS": - ComputeDataType = "S" - F32XdlMathOp = 'x' + HighPrecisionAccumulate = get_high_precision_accumulate(DataType) + ComputeDataType, F32XdlMathOp = adapt_xf32(ComputeDataType) res = {"Batched": True, "DataType": DataType, "DestDataType": DestDataType, "ComputeDataType": ComputeDataType, "TransposeA": TransposeA, "TransposeB": TransposeB, "HighPrecisionAccumulate": HighPrecisionAccumulate, "F32XdlMathOp": F32XdlMathOp, "OperationType": "GEMM", "UseBeta": True} if bias_source: @@ -359,7 +362,12 @@ def calculate_min_flops(m_sum, n_sum, batch_sum, k_sum, iters): return (ENQUEUES_PER_SYNC + iters) * m_avg * n_avg * batch_avg * k_avg / 2 -def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, iters, groups): +def calculate_gsu(matmul_instruction, size): + mt0 = matmul_instruction[0] * matmul_instruction[5] * matmul_instruction[7] + mt1 = matmul_instruction[1] * matmul_instruction[6] * matmul_instruction[8] + return max(1, CU // (math.ceil(size[0] / mt0) * math.ceil(size[1] / mt1))) + +def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, iters, groups, gsu): MinFlopsPerSync = calculate_min_flops(m_sum, n_sum, batch_sum, k_sum, iters) # Read the YAML file with open(yaml_file, 'r') as f: @@ -423,7 +431,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it if "WorkGroupMappingXCC" in item: item["WorkGroupMappingXCC"] = [XCC] if "GlobalSplitU" in item: - item["GlobalSplitU"] = list(GSU) + item["GlobalSplitU"] = list(gsu) data["BenchmarkProblems"][i][0] = dtype data["LibraryLogic"]["DeviceNames"] = DeviceNames data["LibraryLogic"]["ScheduleName"] = ScheduleName @@ -484,8 +492,10 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it if mfma_instructions is None: continue + gsu = set() matmul_instruction_found = False for mfma_instruction in mfma_instructions: + size = copy.deepcopy(original_size) for _ in range(NUM_STAGES): matmul_instruction_gen = list(find_matmul_instruction(mfma_instruction, size)) if args.groups: @@ -497,6 +507,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it total_inst = min(len(matmul_instruction_gen) // DIV_MI, MIN_MI) # At least 5 insts and max of 33.3% of insts. 
for index, matmul_instruction in enumerate(matmul_instruction_gen): if matmul_instruction is not None: + gsu.add(calculate_gsu(matmul_instruction, size)) if dtype_str not in matmul_instructions: matmul_instructions[dtype_str] = dict() matmul_instructions[dtype_str][str(matmul_instruction)] = matmul_instruction @@ -504,22 +515,26 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it break total_inst = min(len(mi_groups0) // DIV_MI, MIN_MI) for index, mi_0 in enumerate(mi_groups0): - if dtype_str not in groups: - groups[dtype_str] = [{},{}] - groups[dtype_str][0]["MatrixInstruction"] = {} - groups[dtype_str][1]["MatrixInstruction"] = {} - groups[dtype_str][0]["MatrixInstruction"][str(mi_0)] = mi_0 - if args.fast and (index > total_inst): - break + if mi_0 is not None: + gsu.add(calculate_gsu(mi_0, size)) + if dtype_str not in groups: + groups[dtype_str] = [{},{}] + groups[dtype_str][0]["MatrixInstruction"] = {} + groups[dtype_str][1]["MatrixInstruction"] = {} + groups[dtype_str][0]["MatrixInstruction"][str(mi_0)] = mi_0 + if args.fast and (index > total_inst): + break total_inst = min(len(mi_groups1) // DIV_MI, MIN_MI) for index, mi_1 in enumerate(mi_groups1): - if dtype_str not in groups: - groups[dtype_str] = [{},{}] - groups[dtype_str][0]["MatrixInstruction"] = {} - groups[dtype_str][1]["MatrixInstruction"] = {} - groups[dtype_str][1]["MatrixInstruction"][str(mi_1)] = mi_1 - if args.fast and (index > total_inst): - break + if mi_1 is not None: + gsu.add(calculate_gsu(mi_1, size)) + if dtype_str not in groups: + groups[dtype_str] = [{},{}] + groups[dtype_str][0]["MatrixInstruction"] = {} + groups[dtype_str][1]["MatrixInstruction"] = {} + groups[dtype_str][1]["MatrixInstruction"][str(mi_1)] = mi_1 + if args.fast and (index > total_inst): + break if len(matmul_instruction_gen) > 0 or len(mi_groups0) > 0 or len(mi_groups1) > 0: matmul_instruction_found = True if not args.full_stage: @@ -540,7 +555,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it batch_sum += original_size[2] k_sum += original_size[3] - dump_yaml(gpu_idx, gemm_group, args.tensile_config, m_sum, n_sum, batch_sum, k_sum, args.iters, groups) + dump_yaml(gpu_idx, gemm_group, args.tensile_config, m_sum, n_sum, batch_sum, k_sum, args.iters, groups, gsu) elif args.gridbase_config and args.hipblaslt_log is None: LibraryType = "GridBased" @@ -559,14 +574,8 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it ComputeDataType = datatype_map(data['ComputeDataType'].strip()) TransposeA = trans_map(data['TransposeA']) TransposeB = trans_map(data['TransposeB']) - if DataType in ["H", "B"]: - HighPrecisionAccumulate = True - else: - HighPrecisionAccumulate = False - F32XdlMathOp = 0 - if ComputeDataType == "XS": - ComputeDataType = "S" - F32XdlMathOp = 'x' + HighPrecisionAccumulate = get_high_precision_accumulate(DataType) + ComputeDataType, F32XdlMathOp = adapt_xf32(ComputeDataType) dtype = {"Batched": True, "DataType": DataType, "DestDataType": DestDataType, "ComputeDataType": ComputeDataType, "TransposeA": TransposeA, "TransposeB": TransposeB, "HighPrecisionAccumulate": HighPrecisionAccumulate, "F32XdlMathOp": F32XdlMathOp, "OperationType": "GEMM", "UseBeta": True, "UseBias": 1, "Activation": True, "ActivationType": "hipblaslt_all", "UseScaleAlphaVec": 1} dtype_str = json.dumps(dtype) for m in m_shapes: @@ -588,6 +597,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it size = list(size) original_size = 
copy.deepcopy(size)
 dtype_str = k[0]
+ gsu = set()
 dtype = json.loads(dtype_str)
 mfma_instructions = instruction_map(dtype)
 
@@ -595,11 +605,13 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 continue
 matmul_instruction_found = False
 for mfma_instruction in mfma_instructions:
+ size = copy.deepcopy(original_size)
 for _ in range(NUM_STAGES):
 matmul_instruction_gen = list(find_matmul_instruction(mfma_instruction, size))
 total_inst = min(len(matmul_instruction_gen) // 3, 5) # At least 5 insts and max of 33.3% of insts.
 for index, matmul_instruction in enumerate(matmul_instruction_gen):
 if matmul_instruction is not None:
+ gsu.add(calculate_gsu(matmul_instruction, size))
 if dtype_str not in matmul_instructions:
 matmul_instructions[dtype_str] = dict()
 matmul_instructions[dtype_str][str(matmul_instruction)] = matmul_instruction
@@ -626,4 +638,4 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 batch_sum += original_size[2]
 k_sum += original_size[3]
 
- dump_yaml(gpu_idx, gemm_group, args.tensile_config, m_sum, n_sum, batch_sum, k_sum, args.iters, {})
+ dump_yaml(gpu_idx, gemm_group, args.tensile_config, m_sum, n_sum, batch_sum, k_sum, args.iters, {}, gsu)

From 5e54bb390c7a79fcf8b3b985d08092aa40720380 Mon Sep 17 00:00:00 2001
From: fsx950223 
Date: Mon, 2 Dec 2024 06:53:56 +0000
Subject: [PATCH 21/23] fix gsu bug: pass per-dtype gsu_group instead of a single gsu set

---
 .../tensile_config_generator.py | 22 ++++++++++---------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index 225e8ae128..a588b736b3 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -367,7 +367,7 @@ def calculate_gsu(matmul_instruction, size):
 mt1 = matmul_instruction[1] * matmul_instruction[6] * matmul_instruction[8]
 return max(1, CU // (math.ceil(size[0] / mt0) * math.ceil(size[1] / mt1)))
 
-def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, iters, groups, gsu):
+def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, iters, groups, gsu_group):
 MinFlopsPerSync = calculate_min_flops(m_sum, n_sum, batch_sum, k_sum, iters)
 # Read the YAML file
 with open(yaml_file, 'r') as f:
@@ -431,7 +431,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 if "WorkGroupMappingXCC" in item:
 item["WorkGroupMappingXCC"] = [XCC]
 if "GlobalSplitU" in item:
- item["GlobalSplitU"] = list(gsu)
+ item["GlobalSplitU"] = gsu_group[dtype_str]
 data["BenchmarkProblems"][i][0] = dtype
 data["LibraryLogic"]["DeviceNames"] = DeviceNames
 data["LibraryLogic"]["ScheduleName"] = ScheduleName
@@ -472,6 +472,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 for gpu_idx, unique_gemms_subgroup in enumerate(unique_gemms_subgroups):
 gemm_group = {}
+ gsu_group = {}
 matmul_instructions = {}
 groups = {}
 if unique_gemms_subgroup is None:
@@ -484,9 +485,8 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 for k, v in unique_gemms_subgroup:
 size_str, dtype_str = k
- size = json.loads(size_str)
+ original_size = json.loads(size_str)
 dtype = json.loads(dtype_str)
- original_size = copy.deepcopy(size)
 mfma_instructions = instruction_map(dtype)
 if mfma_instructions is None:
 continue
@@ -550,12 +550,13 @@ def dump_yaml(gpu_idx, gemm_group, 
yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 gemm_group[dtype_str].append({'Exact': list(original_size)})
 else:
 gemm_group[dtype_str] = [{'Exact': list(original_size)}]
+ gsu_group[dtype_str] = list(gsu)
 m_sum += original_size[0]
 n_sum += original_size[1]
 batch_sum += original_size[2]
 k_sum += original_size[3]
 
- dump_yaml(gpu_idx, gemm_group, args.tensile_config, m_sum, n_sum, batch_sum, k_sum, args.iters, groups, gsu)
+ dump_yaml(gpu_idx, gemm_group, args.tensile_config, m_sum, n_sum, batch_sum, k_sum, args.iters, groups, gsu_group)
 
 elif args.gridbase_config and args.hipblaslt_log is None:
 LibraryType = "GridBased"
@@ -589,20 +590,20 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 for gpu_idx, unique_gemms_subgroup in enumerate(unique_gemms_subgroups):
 gemm_group = {}
 matmul_instructions = {}
+ gsu_group = {}
 m_sum = 0
 n_sum = 0
 batch_sum = 0
 k_sum = 0
 for k, size in unique_gemms_subgroup:
- size = list(size)
- original_size = copy.deepcopy(size)
+ original_size = list(size)
 dtype_str = k[0]
- gsu = set()
- dtype = json.loads(dtype_str)
+ dtype = json.loads(dtype_str)
 mfma_instructions = instruction_map(dtype)
 if mfma_instructions is None:
 continue
+
+ gsu = set()
 matmul_instruction_found = False
 for mfma_instruction in mfma_instructions:
 size = copy.deepcopy(original_size)
@@ -633,9 +634,10 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 gemm_group[dtype_str].append({'Exact': list(original_size)})
 else:
 gemm_group[dtype_str] = [{'Exact': list(original_size)}]
+ gsu_group[dtype_str] = list(gsu)
 m_sum += original_size[0]
 n_sum += original_size[1]
 batch_sum += original_size[2]
 k_sum += original_size[3]
 
- dump_yaml(gpu_idx, gemm_group, args.tensile_config, m_sum, n_sum, batch_sum, k_sum, args.iters, {}, gsu)
+ dump_yaml(gpu_idx, gemm_group, args.tensile_config, m_sum, n_sum, batch_sum, k_sum, args.iters, {}, gsu_group)

From adba89546e8f30568dcf47363b01a4a03a3ab490 Mon Sep 17 00:00:00 2001
From: fsx950223 
Date: Mon, 2 Dec 2024 08:40:17 +0000
Subject: [PATCH 22/23] fix gsu bug: accumulate GlobalSplitU per dtype across problem sizes

---
 .../tensile_config_generator.py | 20 ++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
index a588b736b3..816a0300c0 100644
--- a/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
+++ b/tensilelite/Tensile/Utilities/tensile_generator/tensile_config_generator.py
@@ -431,7 +431,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 if "WorkGroupMappingXCC" in item:
 item["WorkGroupMappingXCC"] = [XCC]
 if "GlobalSplitU" in item:
- item["GlobalSplitU"] = gsu_group[dtype_str]
+ item["GlobalSplitU"] = list(gsu_group[dtype_str])
 data["BenchmarkProblems"][i][0] = dtype
 data["LibraryLogic"]["DeviceNames"] = DeviceNames
 data["LibraryLogic"]["ScheduleName"] = ScheduleName
@@ -492,7 +492,9 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 if mfma_instructions is None:
 continue
 
- gsu = set()
+ if dtype_str not in gsu_group:
+ gsu_group[dtype_str] = set()
+
 matmul_instruction_found = False
 for mfma_instruction in mfma_instructions:
 size = copy.deepcopy(original_size)
@@ -507,7 +509,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
 total_inst = min(len(matmul_instruction_gen) // DIV_MI, MIN_MI) # At least 5 insts and max of 33.3% of insts. 
for index, matmul_instruction in enumerate(matmul_instruction_gen): if matmul_instruction is not None: - gsu.add(calculate_gsu(matmul_instruction, size)) + gsu_group[dtype_str].add(calculate_gsu(matmul_instruction, size)) if dtype_str not in matmul_instructions: matmul_instructions[dtype_str] = dict() matmul_instructions[dtype_str][str(matmul_instruction)] = matmul_instruction @@ -516,7 +518,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it total_inst = min(len(mi_groups0) // DIV_MI, MIN_MI) for index, mi_0 in enumerate(mi_groups0): if mi_0 is not None: - gsu.add(calculate_gsu(mi_0, size)) + gsu_group[dtype_str].add(calculate_gsu(mi_0, size)) if dtype_str not in groups: groups[dtype_str] = [{},{}] groups[dtype_str][0]["MatrixInstruction"] = {} @@ -527,7 +529,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it total_inst = min(len(mi_groups1) // DIV_MI, MIN_MI) for index, mi_1 in enumerate(mi_groups1): if mi_1 is not None: - gsu.add(calculate_gsu(mi_1, size)) + gsu_group[dtype_str].add(calculate_gsu(mi_1, size)) if dtype_str not in groups: groups[dtype_str] = [{},{}] groups[dtype_str][0]["MatrixInstruction"] = {} @@ -550,7 +552,6 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it gemm_group[dtype_str].append({'Exact': list(original_size)}) else: gemm_group[dtype_str] = [{'Exact': list(original_size)}] - gsu_group[dtype_str] = list(gsu) m_sum += original_size[0] n_sum += original_size[1] batch_sum += original_size[2] @@ -603,7 +604,8 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it if mfma_instructions is None: continue - gsu = set() + if dtype_str not in gsu_group: + gsu_group[dtype_str] = set() matmul_instruction_found = False for mfma_instruction in mfma_instructions: size = copy.deepcopy(original_size) @@ -612,7 +614,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it total_inst = min(len(matmul_instruction_gen) // 3, 5) # At least 5 insts and max of 33.3% of insts. 
for index, matmul_instruction in enumerate(matmul_instruction_gen): if matmul_instruction is not None: - gsu.add(calculate_gsu(matmul_instruction, size)) + gsu_group[dtype_str].add(calculate_gsu(matmul_instruction, size)) if dtype_str not in matmul_instructions: matmul_instructions[dtype_str] = dict() matmul_instructions[dtype_str][str(matmul_instruction)] = matmul_instruction @@ -634,7 +636,7 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it gemm_group[dtype_str].append({'Exact': list(original_size)}) else: gemm_group[dtype_str] = [{'Exact': list(original_size)}] - gsu_group[dtype_str] = list(gsu) + m_sum += original_size[0] n_sum += original_size[1] batch_sum += original_size[2] From 82b8eb0b8ab3b137c7cdbdb3df6bfddec6880f5b Mon Sep 17 00:00:00 2001 From: fsx950223 Date: Mon, 2 Dec 2024 09:11:08 +0000 Subject: [PATCH 23/23] skip slow kernels --- .../Tensile/Utilities/tensile_generator/tuning_template.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/tensilelite/Tensile/Utilities/tensile_generator/tuning_template.yaml b/tensilelite/Tensile/Utilities/tensile_generator/tuning_template.yaml index bfefb87aae..77fbd932b6 100644 --- a/tensilelite/Tensile/Utilities/tensile_generator/tuning_template.yaml +++ b/tensilelite/Tensile/Utilities/tensile_generator/tuning_template.yaml @@ -272,6 +272,7 @@ BenchmarkProblems: InitialSolutionParameters: null JoinParameters: null GlobalParameters: + SkipSlowSolutionRatio: 0.5 BoundsCheck: false DataInitTypeAB: 12 DataInitTypeAlpha: 1
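
Note on the GlobalSplitU heuristic introduced in PATCH 20: calculate_gsu sizes GlobalSplitU from how much of the GPU a problem's output tiles can occupy. Reading the code, mt0 and mt1 act as the macro-tile extents in M and N (instruction extent times the per-dimension multipliers at indices 5/7 and 6/8 of the 9-element matmul instruction); when the M x N grid of macro-tiles needs fewer work-groups than there are CUs, the spare CUs are put to work by splitting K. Below is a minimal, self-contained sketch of that behavior — the CU count and the 9-element instruction are illustrative assumptions, not values from a tuned config:

import math

CU = 304  # assumed CU count; the script reads the real value from rocminfo

def calculate_gsu(matmul_instruction, size):
    # Macro-tile extents, exactly as in PATCH 20: indices 0/5/7 give the
    # M extent, indices 1/6/8 the N extent.
    mt0 = matmul_instruction[0] * matmul_instruction[5] * matmul_instruction[7]
    mt1 = matmul_instruction[1] * matmul_instruction[6] * matmul_instruction[8]
    # Work-groups needed to tile the M x N output (size is [M, N, batch, K]);
    # leftover CUs motivate splitting K, so GlobalSplitU >= 1.
    tiles = math.ceil(size[0] / mt0) * math.ceil(size[1] / mt1)
    return max(1, CU // tiles)

# Hypothetical instruction with a 128 x 64 macro-tile (16*4*2 by 16*4*1):
mi = [16, 16, 16, 1, 1, 4, 4, 2, 1]
print(calculate_gsu(mi, [128, 64, 1, 8192]))    # 1 tile    -> 304 // 1   = 304
print(calculate_gsu(mi, [1024, 1024, 1, 512]))  # 128 tiles -> 304 // 128 = 2

Because PATCH 22 accumulates these values into a per-dtype set, each dtype's tuning run benchmarks only the GlobalSplitU values its own problem sizes can actually exploit, rather than the fixed GSU = [1,2,3,4,...] lists removed in PATCH 20.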