Rename quant_mode enumerators from S4_*/INT4_* to I4_* (s4=>i4)

DDEle · DDEle · commit 96d29667ba2f · 2024-07-10T08:26:45.000Z
diff --git a/include/common/core/common_types.hpp b/include/common/core/common_types.hpp
@@ -28,9 +28,9 @@ enum class grf_mode : uint8_t { normal = 0, double_grf = 1 };
 enum class mem_layout : uint8_t { row_major = 0, col_major = 1 };
 
 enum class quant_mode : uint8_t {
-  S4_ASYM = 0,
-  S4_FULLRANGE_NO_ZP = 1,
-  INT4_ASYM_FP_ZERO = 2
+  I4_ASYM = 0,
+  I4_FULLRANGE_NO_ZP = 1,
+  I4_ASYM_FP_ZERO = 2
 };
 
 struct quant_info {
diff --git a/include/experimental/group/gemm/impl/int4_dequantize_xe.hpp b/include/experimental/group/gemm/impl/int4_dequantize_xe.hpp
@@ -102,7 +102,7 @@ class gemm_t<
           std::is_same<remove_const_t<dtype_b>, remove_const_t<int4x8>>::value,
       "this is for 4bit matB ");
   static_assert(
-      quant_info_.quant_mode == quant_mode::INT4_ASYM_FP_ZERO
+      quant_info_.quant_mode == quant_mode::I4_ASYM_FP_ZERO
           ? std::is_same_v<
                 remove_const_t<dtype_zero_pt>,
                 remove_const_t<dtype_a>>
@@ -291,7 +291,7 @@ class gemm_t<
 
   // compress int4 along N dimensions
   using zero_pt_tile_desc_t = std::conditional_t<
-      quant_info_.quant_mode != quant_mode::INT4_ASYM_FP_ZERO,
+      quant_info_.quant_mode != quant_mode::I4_ASYM_FP_ZERO,
       subgroup::tile_desc_t<
           (tile_size_x_b + pack_ratio - 1) / pack_ratio,
           tile_size_y_zero_pt,
@@ -535,7 +535,7 @@ class gemm_t<
       subgroup::tile_prefetch<cache_hint::cached, cache_hint::cached>(
           scale_prefetch_payload);
       if constexpr (
-          compute_policy::quant_mode != quant_mode::S4_FULLRANGE_NO_ZP) {
+          compute_policy::quant_mode != quant_mode::I4_FULLRANGE_NO_ZP) {
         // TODO 1D prefetch need pack to U32/U64
         subgroup::tile_prefetch<cache_hint::cached, cache_hint::cached>(
             zero_pt_prefetch_payload);
@@ -549,7 +549,7 @@ class gemm_t<
         scale_prefetch_payload.template update_tdesc<update_dir_b>(
             scale_t::tile_size_y);
         if constexpr (
-            compute_policy::quant_mode != quant_mode::S4_FULLRANGE_NO_ZP) {
+            compute_policy::quant_mode != quant_mode::I4_FULLRANGE_NO_ZP) {
           zero_pt_prefetch_payload
               .template update_tdesc<tdesc_update_dir::y_dir>(
                   zero_pt_t::tile_size_y);
@@ -579,7 +579,7 @@ class gemm_t<
       subgroup::tile_load<cache_hint::cached, cache_hint::cached>(
           scale, scale_payload);
       if constexpr (
-          compute_policy::quant_mode != quant_mode::S4_FULLRANGE_NO_ZP) {
+          compute_policy::quant_mode != quant_mode::I4_FULLRANGE_NO_ZP) {
         subgroup::tile_load<cache_hint::cached, cache_hint::cached>(
             zero_pt, zero_pt_payload);
       }
@@ -594,7 +594,7 @@ class gemm_t<
         subgroup::tile_prefetch<cache_hint::cached, cache_hint::cached>(
             scale_prefetch_payload);
         if constexpr (
-            compute_policy::quant_mode != quant_mode::S4_FULLRANGE_NO_ZP) {
+            compute_policy::quant_mode != quant_mode::I4_FULLRANGE_NO_ZP) {
           // TODO 1D prefetch need pack to U32/U64
           subgroup::tile_prefetch<cache_hint::cached, cache_hint::cached>(
               zero_pt_prefetch_payload);
@@ -608,7 +608,7 @@ class gemm_t<
         scale_payload.template update_tdesc<update_dir_b>(scale_t::tile_size_y);
       }
       if constexpr (
-          compute_policy::quant_mode != quant_mode::S4_FULLRANGE_NO_ZP) {
+          compute_policy::quant_mode != quant_mode::I4_FULLRANGE_NO_ZP) {
         if (tile_k_idx % zero_pt_addr_update_freq == 0) {
           zero_pt_payload.template update_tdesc<tdesc_update_dir::y_dir>(
               zero_pt_t::tile_size_y);
@@ -623,7 +623,7 @@ class gemm_t<
           scale_prefetch_payload.template update_tdesc<tdesc_update_dir::y_dir>(
               scale_t::tile_size_y);
           if constexpr (
-              compute_policy::quant_mode != quant_mode::S4_FULLRANGE_NO_ZP) {
+              compute_policy::quant_mode != quant_mode::I4_FULLRANGE_NO_ZP) {
             zero_pt_prefetch_payload
                 .template update_tdesc<tdesc_update_dir::y_dir>(
                     zero_pt_t::tile_size_y);
diff --git a/include/experimental/kernel/gemm/impl/int4_dequantize_kslicing_xe.hpp b/include/experimental/kernel/gemm/impl/int4_dequantize_kslicing_xe.hpp
@@ -159,7 +159,7 @@ class gemm_universal_t<
   /// @brief GEMM arguments.
   /// This is the interface for users to pass the application-related runtime
   /// variables.
-  template <quant_mode quant_mode = quant_mode::S4_FULLRANGE_NO_ZP>
+  template <quant_mode quant_mode = quant_mode::I4_FULLRANGE_NO_ZP>
   struct arguments_t {
     /// @brief Is the size of the m dimension of the matrix multiplication (m x
     /// k x n).
@@ -295,7 +295,7 @@ class gemm_universal_t<
     }
   };
   template <>
-  struct arguments_t<quant_mode::S4_FULLRANGE_NO_ZP> {
+  struct arguments_t<quant_mode::I4_FULLRANGE_NO_ZP> {
     /// @brief Is the size of the m dimension of the matrix multiplication (m x
     /// k x n).
     uint32_t matrix_m;
@@ -570,7 +570,7 @@ class gemm_universal_t<
     // check for int4x2
     implementable &=
         ((args.matB_ld % pack_ratio == 0) && (args.matrix_n % pack_ratio == 0));
-    if constexpr (gemm_t::compute_policy::quant_mode == quant_mode::S4_ASYM) {
+    if constexpr (gemm_t::compute_policy::quant_mode == quant_mode::I4_ASYM) {
       implementable &= (args.zero_pt_ld % pack_ratio == 0);
     }
 
@@ -622,7 +622,7 @@ class gemm_universal_t<
     int start_y_scale = start_k / dequant_s;
 
     int start_x_zero_pt =
-        gemm_t::compute_policy::quant_mode == quant_mode::INT4_ASYM_FP_ZERO
+        gemm_t::compute_policy::quant_mode == quant_mode::I4_ASYM_FP_ZERO
         ? start_n
         : start_n / pack_ratio;
     int start_y_zero_pt = start_k / dequant_s;
@@ -671,15 +671,15 @@ class gemm_universal_t<
     uint32_t inner_loop_count = (wg_tile_k + k_stride - 1) / k_stride;
     gemm_args_t gemm_args;
     if constexpr (
-        gemm_t::compute_policy::quant_mode == quant_mode::S4_FULLRANGE_NO_ZP) {
+        gemm_t::compute_policy::quant_mode == quant_mode::I4_FULLRANGE_NO_ZP) {
       gemm_args = gemm_args_t(
           mem_desc_a,
           mem_desc_b,
           inner_loop_start,
           inner_loop_count,
           mem_desc_scale);
     } else if constexpr (
-        gemm_t::compute_policy::quant_mode == quant_mode::S4_ASYM) {
+        gemm_t::compute_policy::quant_mode == quant_mode::I4_ASYM) {
       mem_desc_zero_pt_t mem_desc_zero_pt(
           args.zero_pt_base,
           {(args.matrix_n + pack_ratio - 1) / pack_ratio,
@@ -694,7 +694,7 @@ class gemm_universal_t<
           mem_desc_scale,
           mem_desc_zero_pt);
     } else if constexpr (
-        gemm_t::compute_policy::quant_mode == quant_mode::INT4_ASYM_FP_ZERO) {
+        gemm_t::compute_policy::quant_mode == quant_mode::I4_ASYM_FP_ZERO) {
       mem_desc_zero_pt_t mem_desc_zero_pt(
           args.zero_pt_base,
           {args.matrix_n,
diff --git a/include/subgroup/tile/impl/tile_op_functor.hpp b/include/subgroup/tile/impl/tile_op_functor.hpp
@@ -130,7 +130,7 @@ struct dequant_int4_weight_t {
                 (offset_y_in_tile) / dequant_s * scale_t::block_size_x +
                 offset_x_in_tile;
 
-            if constexpr (quant_mode == quant_mode::S4_ASYM) {
+            if constexpr (quant_mode == quant_mode::I4_ASYM) {
               uint32_t zero_pt_idx =
                   offset_y_in_tile / dequant_s * zero_pt_t::block_size_x +
                   offset_x_in_tile / pack_ratio;
@@ -150,16 +150,16 @@ struct dequant_int4_weight_t {
                   cvt_blk_i8.xetla_select<step, 1>(jj * block_size_y_b + ii) -
                   zero_pt_i8;
             } else if constexpr (
-                quant_mode == quant_mode::S4_FULLRANGE_NO_ZP ||
-                quant_mode == quant_mode::INT4_ASYM_FP_ZERO) {
+                quant_mode == quant_mode::I4_FULLRANGE_NO_ZP ||
+                quant_mode == quant_mode::I4_ASYM_FP_ZERO) {
               cvt_blk_i8.xetla_select<step, 1>(jj * block_size_y_b + ii) =
                   cvt_blk_i8.xetla_select<step, 1>(jj * block_size_y_b + ii) -
                   int8_t(8);
             }
             dst_blk.xetla_select<step, 1>(jj * block_size_y_b + ii) =
                 cvt_blk_i8.xetla_select<step, 1>(jj * block_size_y_b + ii) *
                 scale.reg[scale_idx];
-            if constexpr (quant_mode == quant_mode::INT4_ASYM_FP_ZERO) {
+            if constexpr (quant_mode == quant_mode::I4_ASYM_FP_ZERO) {
               uint32_t zero_pt_idx =
                   offset_y_in_tile / dequant_s * zero_pt_t::block_size_x +
                   offset_x_in_tile;
diff --git a/tests/integration/gemm/int4_dequantization/main.cpp b/tests/integration/gemm/int4_dequantization/main.cpp
@@ -229,8 +229,9 @@ void dequantize_gemm_run(uint32_t iter) {
       compute_attr_t<data_type_acc_in, data_type_acc_in, data_type_acc>;
   using perf_tuning_knob = xetla::group::
       perf_tuning_knob_t<sg_tile_k, prefetch_distance, periodic_sync_interval>;
-  
-  static constexpr quant_info quant_info{quant_mode::S4_ASYM, Test::dequant_s, layout_b};
+
+  static constexpr quant_info quant_info{
+      quant_mode::I4_ASYM, Test::dequant_s, layout_b};
 
   using compute_policy = xetla::group::compute_policy_int4_dequantize<
       compute_attr,
diff --git a/tests/integration/gemm/int4_dequantization_bias/main_client.cpp b/tests/integration/gemm/int4_dequantization_bias/main_client.cpp
@@ -622,7 +622,7 @@ void dequantize_gemm_run(int iter) {
       perf_tuning_knob_t<sg_tile_k, prefetch_distance, periodic_sync_interval>;
 
   static constexpr quant_info quant_info{
-      quant_mode::S4_FULLRANGE_NO_ZP, Test::dequant_s, layout_b};
+      quant_mode::I4_FULLRANGE_NO_ZP, Test::dequant_s, layout_b};
 
   using compute_policy = xetla::group::compute_policy_int4_dequantize<
       compute_attr,
@@ -1043,4 +1043,4 @@ REGISTER_TYPED_TEST_SUITE_P(dequantize_gemm_act_shuf_test, esimd);
 INSTANTIATE_TYPED_TEST_SUITE_P(
     dequantize_gemm_act_shuf_test_suite,
     dequantize_gemm_act_shuf_test,
-    tests);
+    tests);
diff --git a/tests/integration/gemm/int4_dequantization_bias/main_xe.cpp b/tests/integration/gemm/int4_dequantization_bias/main_xe.cpp
@@ -388,7 +388,7 @@ void dequantize_gemm_run(int iter) {
   using perf_tuning_knob = xetla::group::
       perf_tuning_knob_t<sg_tile_k, prefetch_distance, periodic_sync_interval>;
   static constexpr quant_info quant_info{
-      quant_mode::S4_FULLRANGE_NO_ZP, Test::dequant_s, layout_b};
+      quant_mode::I4_FULLRANGE_NO_ZP, Test::dequant_s, layout_b};
 
   using compute_policy = xetla::group::compute_policy_int4_dequantize<
       compute_attr,
diff --git a/tests/integration/gemv/int4/main.cpp b/tests/integration/gemv/int4/main.cpp
@@ -39,9 +39,9 @@ class test_col_major_1 {
   static constexpr size_t sg_n = 1;
   static constexpr size_t sg_k = 512 / sg_m;
   static constexpr size_t dequant_s = 128;
-  // static constexpr quant_mode quant_mode = quant_mode::S4_ASYM;
-  // static constexpr quant_mode quant_mode = quant_mode::S4_FULLRANGE_NO_ZP;
-  static constexpr quant_mode quant_mode = quant_mode::INT4_ASYM_FP_ZERO;
+  // static constexpr quant_mode quant_mode = quant_mode::I4_ASYM;
+  // static constexpr quant_mode quant_mode = quant_mode::I4_FULLRANGE_NO_ZP;
+  static constexpr quant_mode quant_mode = quant_mode::I4_ASYM_FP_ZERO;
 
   static constexpr size_t local_kslicing = 1;
   static constexpr size_t global_kslicing = 1;
@@ -121,7 +121,7 @@ int gemm_result_validate(
 }
 
 template <
-    quant_mode quant_mode = quant_mode::S4_FULLRANGE_NO_ZP,
+    quant_mode quant_mode = quant_mode::I4_FULLRANGE_NO_ZP,
     typename data_type_acc_in = fp16,
     typename data_type_b,
     typename data_type_scale,
@@ -133,15 +133,15 @@ std::vector<fp16> convert_int4(
   std::vector<fp16> dequant_fp16(sizeof(data_type_b) * 2);
 
   int8_t zero_pt_i8;
-  if constexpr (quant_mode != quant_mode::INT4_ASYM_FP_ZERO)
+  if constexpr (quant_mode != quant_mode::I4_ASYM_FP_ZERO)
     zero_pt_i8 = zero_pt & 0xf;
   for (uint32_t i = 0; i < dequant_fp16.size(); i++) {
     int8_t dequant_8bit = data_b & 0xf;
-    if constexpr (quant_mode == quant_mode::S4_FULLRANGE_NO_ZP) {
+    if constexpr (quant_mode == quant_mode::I4_FULLRANGE_NO_ZP) {
       dequant_fp16[i] = scale * (dequant_8bit - 8);
-    } else if constexpr (quant_mode == quant_mode::S4_ASYM) {
+    } else if constexpr (quant_mode == quant_mode::I4_ASYM) {
       dequant_fp16[i] = scale * (dequant_8bit - zero_pt_i8);
-    } else if constexpr (quant_mode == quant_mode::INT4_ASYM_FP_ZERO) {
+    } else if constexpr (quant_mode == quant_mode::I4_ASYM_FP_ZERO) {
       dequant_fp16[i] = scale * (dequant_8bit - 8) + zero_pt;
     } else {
       assert(0);
@@ -154,7 +154,7 @@ std::vector<fp16> convert_int4(
 template <
     size_t dequant_s,
     mem_layout layout_b = mem_layout::col_major,
-    quant_mode quant_mode = quant_mode::S4_FULLRANGE_NO_ZP,
+    quant_mode quant_mode = quant_mode::I4_FULLRANGE_NO_ZP,
     typename data_type_acc_in = fp16,
     typename data_type_b,
     typename data_type_scale,
@@ -176,13 +176,13 @@ std::vector<data_type_acc_in> dequantize_weight(
     for (uint32_t j = 0; j < width; j += step) {
       int start_b_in = i * width + j;
       int start_scale_in = start_b_in / step;
-      int start_zero_pt_in = quant_mode == quant_mode::INT4_ASYM_FP_ZERO
+      int start_zero_pt_in = quant_mode == quant_mode::I4_ASYM_FP_ZERO
           ? (j / step) * matrix_n + i
           : (j / step) * (matrix_n / pack_radio) + i / pack_radio;
       int start_out =
           layout_b == mem_layout::row_major ? 0 : i * matrix_k + j * pack_radio;
       data_type_zero_pt zp_value = zero_pt[start_zero_pt_in];
-      if constexpr (quant_mode != quant_mode::INT4_ASYM_FP_ZERO)
+      if constexpr (quant_mode != quant_mode::I4_ASYM_FP_ZERO)
         zp_value = zp_value >> (4 * (i % pack_radio));
       for (uint32_t jj = 0; jj < step; jj++) {
         std::vector<fp16> dequant_fp16 = convert_int4<quant_mode>(
@@ -225,7 +225,7 @@ void dequantize_gemv_run(int iter) {
   using data_type_b = typename Test::data_type_b;
   using data_type_c = typename Test::data_type_c;
   using data_type_zero_pt = std::conditional_t<
-      Test::quant_mode == quant_mode::INT4_ASYM_FP_ZERO,
+      Test::quant_mode == quant_mode::I4_ASYM_FP_ZERO,
       data_type_c,
       data_type_b>;
   using data_type_scale = fp16;
@@ -246,7 +246,7 @@ void dequantize_gemv_run(int iter) {
   constexpr size_t size_zero_pt_k = matrix_k / dequant_s;
   constexpr size_t size_zero_pt_n = matrix_n;
   constexpr size_t size_zero_pt =
-      Test::quant_mode != quant_mode::INT4_ASYM_FP_ZERO
+      Test::quant_mode != quant_mode::I4_ASYM_FP_ZERO
       ? size_zero_pt_k * size_zero_pt_n / 2
       : size_zero_pt_k * size_zero_pt_n;
 
@@ -490,7 +490,7 @@ void dequantize_gemv_run(int iter) {
        // It accepts the base pointer to matrix D, and its dimensions
        {bias_d, bias_add_shape}});
   typename gemm_op_t::template arguments_t<compute_policy::quant_mode> gemm_arg;
-  if constexpr (compute_policy::quant_mode == quant_mode::S4_FULLRANGE_NO_ZP) {
+  if constexpr (compute_policy::quant_mode == quant_mode::I4_FULLRANGE_NO_ZP) {
     gemm_arg =
         typename gemm_op_t::template arguments_t<compute_policy::quant_mode>(
             matrix_m,
@@ -508,8 +508,8 @@ void dequantize_gemv_run(int iter) {
             Cnt_d,
             epilogue_args);
   } else if constexpr (
-      compute_policy::quant_mode == quant_mode::S4_ASYM ||
-      compute_policy::quant_mode == quant_mode::INT4_ASYM_FP_ZERO) {
+      compute_policy::quant_mode == quant_mode::I4_ASYM ||
+      compute_policy::quant_mode == quant_mode::I4_ASYM_FP_ZERO) {
     gemm_arg =
         typename gemm_op_t::template arguments_t<compute_policy::quant_mode>(
             matrix_m,