Skip to content
This repository was archived by the owner on Aug 30, 2024. It is now read-only.

XeTLA Use ESIMD 2D Load APIs #324

Draft
wants to merge 19 commits into
base: xetla
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions examples/09_gate_recurrent_unit/kernel_func.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ struct fused_config_t {
{start_x_b, start_y_b}); \
gemm_args.init(mem_desc_a, mem_desc_b, inner_loop_count_##id); \
op(g, matAcc_##acc_id, gemm_args); \
SW_BARRIER();
sw_barrier();

#define MATC_STORE(ptr_c) \
mem_desc_c.init( \
Expand Down Expand Up @@ -229,7 +229,7 @@ struct gru_layer {
int start_n = (j)*wg_tile_n;
CONFIG_SETTING(batch_size, -1, hidden_size);
matAcc_0.init(0);
SW_BARRIER();
sw_barrier();

// calculate reset gate: r_t = \sigmoid(X_t x W_ir + h_{t - 1} x W_hr)
// acc0 = X_t x W_ir
Expand Down Expand Up @@ -278,19 +278,19 @@ struct gru_layer {
matAcc_0.reg = matAcc_0.reg * (1 - matAcc_1.reg) +
matAcc_1.reg *
xetla_cvt<Act_T, T, matAcc_t::tile_elems>(mat_hidden.reg);
SW_BARRIER();
sw_barrier();

if (seq_id == seq_len - 1) {
MATC_STORE(args->layer_output);
SW_BARRIER();
sw_barrier();
__esimd_barrier();
}
MATC_STORE(args->cell_out_ptr + seq_id * io_size);
SW_BARRIER();
sw_barrier();
__esimd_barrier();

MATC_STORE(args->one_cell_ptr + (seq_id % 2) * io_size);
SW_BARRIER();
sw_barrier();
__esimd_barrier();
}
args->hx_ptr = args->one_cell_ptr + (seq_id % 2) * io_size;
Expand Down Expand Up @@ -386,7 +386,7 @@ struct kernel_xcoder_gru_fusion {
args.W_hz_ptr = (W_hz_ptr);
args.W_in_ptr = (W_in_ptr);
args.W_hn_ptr = (W_hn_ptr);
SW_BARRIER();
sw_barrier();
fused_op::call(item, &args);
ping = (ping + 1) % 2;
pong = (pong + 1) % 2;
Expand All @@ -411,7 +411,7 @@ struct kernel_xcoder_gru_fusion {
? hidden_out_ptr
: (ping_pong_buffer + ping * one_layer_size);
args.layer_ptr = ((ping_pong_buffer + pong * one_layer_size));
SW_BARRIER();
sw_barrier();
fused_op::call(item, &args);
ping = (ping + 1) % 2;
pong = (pong + 1) % 2;
Expand Down
27 changes: 26 additions & 1 deletion include/common/core/arch_config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,16 +35,41 @@ template <>
struct load_store_attr_t<msg_type::block_2d, gpu_arch::XeHpc> {
/// HW limitation checks https://gfxspecs.intel.com/Predator/Home/Index/55490
static constexpr bool has_hw_block_2d = true;
// If Transposed and Transformed are both set to false
// BlockHeight must not exceed 32.
static constexpr uint32_t max_load_height_in_elem = 32;

// BlockWidth * NBlocks must not exceed 64 for bytes, 32 for words, 16 for
// dwords, and 8 for qwords.
static constexpr uint32_t max_load_width_in_bytes = 64;

// If Transposed is true then
// BlockWidth must be 1,2,4 for qwords and be in range [1..8] for dwords.
static constexpr uint32_t max_trans_load_width_in_bytes = 32;

// BlockHeight must be 8 for qwords and be in range [1..32] for dwords.
static constexpr uint32_t max_trans_load_height_in_elem = 32;

// If Transformed is true
// BlockWidth must be in range [4..16] for bytes and [2..16] for words.
static constexpr uint32_t max_vnni_load_width_in_elems = 16;

// BlockHeight must be in range [4..32] for bytes and [2..32] for words.
static constexpr uint32_t min_vnni_load_height_in_bytes = 4;

// BlockHeight must not exceed 8.
static constexpr uint32_t max_store_height_in_elem = 8;

// BlockWidth must not exceed 64 for bytes, 32 for words, 16 for dwords, and 8
// for qwords.
static constexpr uint32_t max_store_width_in_bytes = 64;

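// Largest single 2D load implied by the two limits listed here
// (32 rows * 64 bytes per row = 2048 bytes):
// BlockHeight must not exceed 32.
// BlockWidth * NBlocks must not exceed 64 for bytes, 32 for words, 16 for
// dwords, and 8 for qwords.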
static constexpr uint32_t max_load_size_in_bytes = 2048;

// BlockWidth * BlockHeight * sizeof(T) must not exceed 512.
static constexpr uint32_t max_store_size_in_bytes = 512;

static constexpr uint32_t special_prefetch_width_in_bytes = 64;
Expand Down Expand Up @@ -97,7 +122,7 @@ struct load_store_attr_t<msg_type::block_1d, arch_tag> {
static constexpr uint32_t max_aligned_load_vec_len = 256;
static constexpr uint32_t max_store_vec_len = 256;
static constexpr uint32_t max_aligned_store_vec_len = 256;
static constexpr uint32_t max_prefetch_vec_len = 32;
static constexpr uint32_t max_prefetch_vec_len = 256;
static constexpr uint32_t max_channel_num = 16;
};

Expand Down
10 changes: 10 additions & 0 deletions include/common/core/barrier.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,16 @@ namespace gpu::xetla {
/// @addtogroup xetla_core_barrier
/// @{

/// @brief Insert a software scheduling barrier, for better code control.
///
/// When `__INTEL_LLVM_COMPILER` is 20250000 or newer this function is a
/// no-op. Otherwise (including when the macro is undefined, which the
/// preprocessor evaluates as 0) it issues an ESIMD fence with
/// `fence_mask::sw_barrier`.
///
/// @note Declared `inline` because it is defined in a header: without it,
/// including this header from more than one translation unit yields
/// multiple-definition link errors.
inline void sw_barrier() {
#if __INTEL_LLVM_COMPILER < 20250000
  __ESIMD_NS::fence<__ESIMD_NS::fence_mask::sw_barrier>();
#endif
}

/// @brief Initialize the number of named barrier index for a kernel.
/// Available only on PVC. Only need to initialize once at the beginning.
///
Expand Down
Loading
Loading