Skip to content

[ET-VK][Ops] quantize_per_tensor.tensor variant #12208

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: gh/ahmtox/36/base
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,14 @@ ${layout_declare_tensor(B, "w", "t_out", OUT_DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "buffer")}

$if MODE == "per_tensor":
$if SHAPE == "tensor":
${layout_declare_tensor(B, "r", "t_scale", "float", "buffer")}
${layout_declare_tensor(B, "r", "t_zero_point", "int", "buffer")}

layout(push_constant) uniform restrict Block {
float scale;
int zero_point;
$if SHAPE == "scalar":
float scale;
int zero_point;
int quant_min;
int quant_max;
};
Expand Down Expand Up @@ -142,7 +147,10 @@ void quantize_per_tensor() {
const int in_bufi = tidx_to_bufi(out_tidx, t_in_strides);

IN_T value = t_in[in_bufi];
OUT_T qvalue = quantize_val(value, scale, zero_point);
$if SHAPE == "scalar":
OUT_T qvalue = quantize_val(value, scale, zero_point);
$if SHAPE == "tensor":
OUT_T qvalue = quantize_val(value, t_scale[0], t_zero_point[0]);

t_out[out_bufi] = qvalue;
}
Expand Down
4 changes: 4 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ quantize_buffer:
IN_DTYPE: float
OUT_DTYPE: int32
MODE: per_tensor
SHAPE: tensor
generate_variant_forall:
IN_DTYPE:
- VALUE: half
Expand All @@ -15,6 +16,9 @@ quantize_buffer:
shader_variants:
- NAME: quantize_per_tensor_buffer
MODE: per_tensor
SHAPE: scalar
- NAME: quantize_per_tensor_tensor_buffer
MODE: per_tensor
- NAME: quantize_per_token_buffer
MODE: per_token
- NAME: quantize_per_channel_buffer
Expand Down
15 changes: 12 additions & 3 deletions backends/vulkan/runtime/graph/ops/glsl/quantize_texture.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#define IVEC4_T ${texel_load_type(OUT_DTYPE, "texture3d")}

#define ${MODE}
#define ${SHAPE}

${define_active_storage_type("texture3d")}
${define_required_extensions(IN_DTYPE)}
Expand All @@ -32,9 +33,14 @@ ${layout_declare_tensor(B, "w", "t_out", OUT_DTYPE, "texture3d")}
${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "texture3d")}

$if MODE == "per_tensor":
$if SHAPE == "tensor":
${layout_declare_tensor(B, "r", "t_scale", "float", "buffer")}
${layout_declare_tensor(B, "r", "t_zero_point", "int", "buffer")}

layout(push_constant) uniform restrict Block {
float scale;
int zero_point;
$if SHAPE == "scalar":
float scale;
int zero_point;
int quant_min;
int quant_max;
};
Expand Down Expand Up @@ -146,7 +152,10 @@ void quantize_per_tensor() {

[[unroll]] for (int i = 0; i < 4; ++i) {
IN_T value = IN_T(intex[i]);
OUT_T qvalue = quantize_val(value, scale, zero_point);
$if SHAPE == "scalar":
OUT_T qvalue = quantize_val(value, scale, zero_point);
$if SHAPE == "tensor":
OUT_T qvalue = quantize_val(value, t_scale[0], t_zero_point[0]);
outtex[i] = qvalue;
}
write_texel(t_out, pos, outtex);
Expand Down
4 changes: 4 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/quantize_texture.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ quantize_texture:
IN_DTYPE: float
OUT_DTYPE: int32
MODE: per_tensor
SHAPE: tensor
generate_variant_forall:
IN_DTYPE:
- VALUE: half
Expand All @@ -15,6 +16,9 @@ quantize_texture:
shader_variants:
- NAME: quantize_per_tensor_texture3d
MODE: per_tensor
SHAPE: scalar
- NAME: quantize_per_tensor_tensor_texture3d
MODE: per_tensor
- NAME: quantize_per_token_texture3d
MODE: per_token
- NAME: quantize_per_channel_texture3d
Expand Down
36 changes: 29 additions & 7 deletions backends/vulkan/runtime/graph/ops/impl/Quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,13 +80,23 @@ void add_quantize_per_tensor_node(
const ValueRef& quant_min,
const ValueRef& quant_max,
const ValueRef& output) {
const bool is_tensor_scale_zp =
graph.val_is_tensor(scale) && graph.val_is_tensor(zero_point);

std::string kernel_name("quantize_per_tensor");
if (is_tensor_scale_zp) {
kernel_name += "_tensor";
}
add_storage_type_suffix(kernel_name, graph.storage_type_of(input));
add_dtype_suffix(kernel_name, graph.dtype_of(input));
add_dtype_suffix(kernel_name, graph.dtype_of(output));

float scale_val = static_cast<float>(graph.get_double(scale));
int zero_point_val = static_cast<int>(graph.get_int(zero_point));
float scale_val = 1.0;
int zero_point_val = 0;
if (!is_tensor_scale_zp) {
scale_val = static_cast<float>(graph.get_double(scale));
zero_point_val = static_cast<int>(graph.get_int(zero_point));
}
int quant_min_val = static_cast<int>(graph.get_int(quant_min));
int quant_max_val = static_cast<int>(graph.get_int(quant_max));

Expand All @@ -100,15 +110,17 @@ void add_quantize_per_tensor_node(
graph.strides_ubo(input),
graph.sizes_ubo(output),
graph.strides_ubo(output)};
} else {
param_ubos = {
graph.logical_limits_ubo(input), graph.logical_limits_ubo(output)};
}

if (is_tensor_scale_zp) {
push_constants = {
PushConstantDataInfo(&scale_val, sizeof(float)),
PushConstantDataInfo(&zero_point_val, sizeof(int)),
PushConstantDataInfo(&quant_min_val, sizeof(int)),
PushConstantDataInfo(&quant_max_val, sizeof(int)),
};
} else {
param_ubos = {
graph.logical_limits_ubo(input), graph.logical_limits_ubo(output)};
push_constants = {
PushConstantDataInfo(&scale_val, sizeof(float)),
PushConstantDataInfo(&zero_point_val, sizeof(int)),
Expand All @@ -122,13 +134,20 @@ void add_quantize_per_tensor_node(
graph.hashed_layout_of(input),
};

std::vector<ArgGroup> inputs_and_outputs = {
{output, vkapi::kWrite}, {input, vkapi::kRead}};
if (is_tensor_scale_zp) {
inputs_and_outputs.emplace_back(
ArgGroup{{scale, zero_point}, vkapi::kRead});
}

graph.execute_nodes().emplace_back(new DynamicDispatchNode(
graph,
VK_KERNEL_FROM_STR(kernel_name),
default_pick_global_wg_size,
default_pick_local_wg_size,
// Inputs and Outputs
{{output, vkapi::kWrite}, {input, vkapi::kRead}},
inputs_and_outputs,
// Shader param buffers
param_ubos,
// Push Constants
Expand Down Expand Up @@ -489,6 +508,9 @@ REGISTER_OPERATORS {
VK_REGISTER_OP(
quantized_decomposed.quantize_per_tensor.default,
quantize_per_tensor_impl);
VK_REGISTER_OP(
quantized_decomposed.quantize_per_tensor.tensor,
quantize_per_tensor_impl);
VK_REGISTER_OP(
quantized_decomposed.quantize_per_token.default, quantize_per_token_impl);
VK_REGISTER_OP(
Expand Down
Loading
Loading