[QNN EP] [DRAFT] Support Conv float weight/bias. #22906

Draft · wants to merge 4 commits into main
@@ -25,6 +25,11 @@ constexpr bool Is4BitIntType(int32_t data_type) {
(data_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT4);
}

constexpr bool IsFloatType(int32_t data_type) {
return (data_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT) ||
(data_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16);
}

// adjust for an optional input/output that has an entry but does not exist
int NumActualValues(const Node& node, bool input) {
const auto& defs = input ? node.InputDefs() : node.OutputDefs();
@@ -336,38 +341,68 @@ bool ConvNodeGroupSelector::Check(const GraphViewer& graph_viewer,
const Node& node,
const std::vector<const Node*>& dq_nodes,
const std::vector<const Node*>& q_nodes) const {
if (!CheckQDQNodes(graph_viewer, node, dq_nodes, q_nodes)) {
auto is_const_float = [&graph_viewer](const NodeArg* input_def) {
const ONNX_NAMESPACE::TensorProto* initializer = graph_viewer.GetConstantInitializer(input_def->Name());
return (initializer != nullptr) && IsFloatType(initializer->data_type());
};

const auto& node_inputs = node.InputDefs();
const bool is_input_const_float = is_const_float(node_inputs[0]);
const bool is_weight_const_float = is_const_float(node_inputs[1]);
const bool has_bias = node_inputs.size() > 2 && node_inputs[2]->Exists();
const bool is_bias_const_float = has_bias && is_const_float(node_inputs[2]);

if (is_input_const_float) {
return false;
}

if (!allow_float_weight_and_bias_ && (is_weight_const_float || is_bias_const_float)) {
return false;
}

// Check that if an input is not a float initializer, it must come from a DQ.
const int expected_num_dqs = (1 + static_cast<int>(!is_weight_const_float) +
static_cast<int>(has_bias && !is_bias_const_float));

if (!CheckQDQNodes(graph_viewer, node, dq_nodes, q_nodes, expected_num_dqs)) {
return false;
}

// Input and output types need to be the same.
int32_t dt_input = dq_nodes[0]->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type();
int32_t dt_weight = dq_nodes[1]->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type();
int32_t dt_output = q_nodes[0]->OutputDefs()[0]->TypeAsProto()->tensor_type().elem_type();
if (dt_input != dt_output) {
return false;
}

if (!allow_4bit_weight_ && Is4BitIntType(dt_weight)) {
if (!allow_16bit_ && Is16BitIntType(dt_input)) {
return false;
}

if (dt_input == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT8) {
if (!int8_allowed_ || dt_weight != dt_input) {
// Check quantized weight type.
if (!is_weight_const_float) {
int32_t dt_weight = dq_nodes[1]->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type();
if (!allow_4bit_weight_ && Is4BitIntType(dt_weight)) {
return false;
}
}

if (dq_nodes.size() == 3) { // has bias
int32_t dt_bias = dq_nodes[2]->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type();
if (dt_bias != ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32) {
if (dt_input == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT8) {
if (!int8_allowed_ || dt_weight != dt_input) {
return false;
}
}

if (!allow_16bit_ && Is16BitIntType(dt_weight)) {
return false;
}
}

// 16-bit int types must be explicitly allowed.
if (!allow_16bit_ && (Is16BitIntType(dt_input) || Is16BitIntType(dt_weight))) {
return false;
// Check quantized bias (if any)
if (has_bias && !is_bias_const_float) {
int32_t dt_bias = dq_nodes.back()->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type();
if (dt_bias != ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32) {
return false;
}
}

return true;
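Note on the selector change above: the number of DQ nodes the group selector now expects depends on which Conv inputs are constant float initializers, since a const-float weight or bias no longer needs its own DQ. The standalone sketch below is not part of the PR (ExpectedNumDqs is a name invented for illustration); it just reproduces the expected_num_dqs arithmetic for a few input layouts.

// Illustration only: mirrors the expected_num_dqs computation in ConvNodeGroupSelector::Check.
#include <cassert>

int ExpectedNumDqs(bool is_weight_const_float, bool has_bias, bool is_bias_const_float) {
  // Input[0] always needs a DQ; the selector rejects a const-float input[0] outright.
  return 1 + static_cast<int>(!is_weight_const_float) +
         static_cast<int>(has_bias && !is_bias_const_float);
}

int main() {
  assert(ExpectedNumDqs(false, true, false) == 3);   // fully quantized QDQ Conv with bias
  assert(ExpectedNumDqs(true, true, true) == 1);     // float weight and float bias: only input[0] has a DQ
  assert(ExpectedNumDqs(true, true, false) == 2);    // float weight, quantized bias
  assert(ExpectedNumDqs(false, false, false) == 2);  // quantized weight, no bias
  return 0;
}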
@@ -145,8 +145,12 @@ class SplitNodeGroupSelector : public NodeGroupSelector {
class ConvNodeGroupSelector : public NodeGroupSelector {
public:
// default to 'true'
ConvNodeGroupSelector(bool int8_allowed = true, bool allow_16bit = true, bool allow_4bit_weight = true)
: int8_allowed_(int8_allowed), allow_16bit_(allow_16bit), allow_4bit_weight_(allow_4bit_weight) {}
ConvNodeGroupSelector(bool int8_allowed = true, bool allow_16bit = true, bool allow_4bit_weight = true,
bool allow_float_weight_and_bias = true)
: int8_allowed_(int8_allowed),
allow_16bit_(allow_16bit),
allow_4bit_weight_(allow_4bit_weight),
allow_float_weight_and_bias_(allow_float_weight_and_bias) {}

private:
bool Check(const GraphViewer& graph_viewer, const Node& node,
@@ -156,6 +160,7 @@ class ConvNodeGroupSelector : public NodeGroupSelector {
bool int8_allowed_;
bool allow_16bit_;
bool allow_4bit_weight_;
bool allow_float_weight_and_bias_; // EP will have to quantize the weights if necessary.
};

class WhereNodeGroupSelector : public NodeGroupSelector {
@@ -360,7 +365,8 @@ class ConvSelector : public BaseSelector {
public:
ConvSelector(bool int8_allowed = false, bool allow_16bit = false, bool allow_4bit_weight = false,
gsl::span<const char*> compatible_providers = {})
: BaseSelector(std::make_unique<ConvNodeGroupSelector>(int8_allowed, allow_16bit, allow_4bit_weight),
: BaseSelector(std::make_unique<ConvNodeGroupSelector>(int8_allowed, allow_16bit, allow_4bit_weight,
/*allow_float_weight_and_bias*/ false),
compatible_providers) {}

void UpdateBuilder(NodesToOptimizeIndicesBuilder&) const override;
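The header change above adds an allow_float_weight_and_bias flag (defaulting to true) to ConvNodeGroupSelector, while the ConvSelector wrapper shown above pins it to false; per the member comment, only a caller that can quantize the float initializers itself (the EP) should opt in. A minimal usage sketch, assuming only the constructor signature shown above:

// Sketch only: enabling float Conv weight/bias in the group selector.
// Whether the QNN EP constructs the selector exactly like this is not shown in this diff.
ConvNodeGroupSelector selector(/*int8_allowed*/ true,
                               /*allow_16bit*/ true,
                               /*allow_4bit_weight*/ true,
                               /*allow_float_weight_and_bias*/ true);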
@@ -80,6 +80,40 @@ Status BaseOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
return Status::OK();
}

Status BaseOpBuilder::GetBiasQuantParams(const QnnQuantParamsWrapper& input0_qparams,
const QnnQuantParamsWrapper& weight_qparams,
/*out*/ std::vector<float>& bias_scales,
/*out*/ std::vector<int32_t>& bias_offsets,
const logging::Logger& logger) const {
ORT_UNUSED_PARAMETER(logger);
// For now, only handle the case where input0 is per-tensor quantized and input1 is either per-tensor
// or per-channel quantized.
ORT_RETURN_IF_NOT(input0_qparams.IsPerTensor(/*include_bw*/ true) && weight_qparams.IsQuantized(),
"QNN EP currently only supports computing bias quantization params for per-tensor ",
"input[0] and per-tensor/per-channel input[1]");

// Bias's quantization scale(s) should be the product of the other inputs' quantization scales.
// Input[0] is expected to have one scale (per-tensor).
// If input[1] is per-channel (many scales), then the bias also needs to be per-channel.
std::vector<float> input0_quant_scales;
std::vector<float> weight_quant_scales;
ORT_RETURN_IF_ERROR(input0_qparams.GetScales(input0_quant_scales));
ORT_RETURN_IF_ERROR(weight_qparams.GetScales(weight_quant_scales));

const size_t num_bias_scales_offsets = weight_quant_scales.size();
assert(input0_quant_scales.size() == 1); // Expected for per-tensor.
ORT_RETURN_IF_NOT(num_bias_scales_offsets >= input0_quant_scales.size(),
"Input[1] should have >= 1 quantization scale values");

bias_offsets = std::vector<int32_t>(num_bias_scales_offsets, 0); // Bias's zero-points should be all zeros.
bias_scales.resize(num_bias_scales_offsets);
for (size_t i = 0; i < num_bias_scales_offsets; i++) {
bias_scales[i] = input0_quant_scales[0] * weight_quant_scales[i];
}

return Status::OK();
}

Status BaseOpBuilder::AddZeroBiasInput(QnnModelWrapper& qnn_model_wrapper,
const QnnQuantParamsWrapper& input0_qparams,
const QnnQuantParamsWrapper& input1_qparams,
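To make the arithmetic in GetBiasQuantParams concrete: input[0] contributes a single per-tensor scale, the weight contributes one scale per output channel (or one overall), and each bias scale is their product with a zero offset. The standalone sketch below uses invented values purely for illustration.

// Illustration only: bias_scale[i] = input0_scale * weight_scale[i], all offsets 0.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const float input0_scale = 0.02f;                                // hypothetical per-tensor scale for input[0]
  const std::vector<float> weight_scales = {0.10f, 0.05f, 0.20f};  // hypothetical per-channel weight scales

  std::vector<float> bias_scales(weight_scales.size());
  std::vector<int32_t> bias_offsets(weight_scales.size(), 0);      // bias zero-points are all zero
  for (size_t i = 0; i < weight_scales.size(); ++i) {
    bias_scales[i] = input0_scale * weight_scales[i];              // 0.002, 0.001, 0.004
  }

  for (size_t i = 0; i < bias_scales.size(); ++i) {
    std::printf("channel %zu: bias scale=%g offset=%d\n", i, bias_scales[i], bias_offsets[i]);
  }
  return 0;
}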
@@ -95,6 +95,12 @@ class BaseOpBuilder : public IOpBuilder {
const logging::Logger& logger,
std::vector<std::string>& input_names) const ORT_MUST_USE_RESULT;

Status GetBiasQuantParams(const QnnQuantParamsWrapper& input0_qparams,
const QnnQuantParamsWrapper& weight_qparams,
/*out*/ std::vector<float>& bias_scales,
/*out*/ std::vector<int32_t>& bias_offsets,
const logging::Logger& logger) const ORT_MUST_USE_RESULT;

Status AddZeroBiasInput(QnnModelWrapper& qnn_model_wrapper,
const QnnQuantParamsWrapper& input0_qparams,
const QnnQuantParamsWrapper& input1_qparams,
149 changes: 146 additions & 3 deletions onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc
@@ -190,6 +190,8 @@ Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrapper,

assert(num_inputs >= 2); // Checked by IsOpSupported.

QnnQuantParamsWrapper weight_qparams;

//
// Input 0
//
@@ -231,6 +233,26 @@ Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrapper,
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Unexpected convolution op type: ", node_unit.OpType().c_str());
}

// Quantize float32 weight to int8_t (per-tensor, symmetric) if necessary.
if (!input_info.quant_param.IsQuantized()) {
ORT_RETURN_IF(input_info.initializer_tensor->data_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT,
"QNN EP only supports unquantized float32 weights");

Qnn_DataType_t quant_type = QNN_DATATYPE_SFIXED_POINT_8; // int8_t quantization of input[1] works with input[0] of all types.
std::array<float, 1> weight_scales = {0.0f};
std::array<int32_t, 1> weight_offsets = {0};
gsl::span<float> flt_weight = ReinterpretAsSpan<float, uint8_t>(unpacked_tensor);
ORT_RETURN_IF_ERROR(qnn::utils::GetDataQuantParams(flt_weight, actual_shape, weight_scales, weight_offsets,
quant_type, /*symmetric*/ true, /*axis*/ std::nullopt));

std::vector<uint8_t> quant_weight(flt_weight.size());
ORT_RETURN_IF_ERROR(qnn::utils::QuantizeData(flt_weight, actual_shape, weight_scales, weight_offsets,
quant_weight, quant_type));
unpacked_tensor = std::move(quant_weight);
input_info.qnn_data_type = quant_type;
input_info.quant_param = QnnQuantParamsWrapper(weight_scales[0], weight_offsets[0]);
}

// Transpose quantization parameter's axis if this is using per-channel quantization.
if (input_info.quant_param.IsPerChannel()) {
std::vector<size_t> perm;
@@ -279,6 +301,7 @@ Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrapper,
}
}

weight_qparams = input_info.quant_param.Copy(); // Store a copy of weight quantization params in case we need to quantize float bias.
Qnn_TensorType_t tensor_type = qnn_model_wrapper.GetTensorType(actual_name);
QnnTensorWrapper input_tensorwrapper(actual_name, tensor_type, input_info.qnn_data_type,
std::move(input_info.quant_param),
@@ -289,9 +312,58 @@ Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrapper,
//
// Input 2: bias
//
const bool has_bias_input = num_inputs == 3;
const bool has_bias_input = num_inputs == 3 && inputs[2].node_arg.Exists();
if (has_bias_input) {
ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[2], logger, input_names));
TensorInfo bias_info = {};
ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[2], bias_info));

if (!bias_info.quant_param.IsQuantized() && bias_info.is_initializer) {
// Quantize float bias with bias_scale = input0_scale * weight_scale, bias_offset = 0. If weight is per-channel,
// then the bias will be quantized per-channel (axis 0) as well.
ORT_RETURN_IF(bias_info.initializer_tensor->data_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT,
"QNN EP only supports unquantized float32 bias");

TensorInfo input0_info = {};
ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[0], input0_info));

std::vector<float> bias_scales;
std::vector<int32_t> bias_offsets;
ORT_RETURN_IF_ERROR(GetBiasQuantParams(input0_info.quant_param, weight_qparams,
bias_scales, bias_offsets, logger));

size_t num_bias_elems = qnn::utils::ShapeSizeCalc(bias_info.shape, 0, bias_info.shape.size());
std::vector<uint8_t> bias_quant_bytes(num_bias_elems * sizeof(int32_t), 0);

Qnn_DataType_t bias_quant_type = QNN_DATATYPE_SFIXED_POINT_32;
std::vector<uint8_t> flt_bias_bytes;
ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*bias_info.initializer_tensor, flt_bias_bytes));
gsl::span<float> flt_bias = ReinterpretAsSpan<float, uint8_t>(flt_bias_bytes);
assert(flt_bias.size() == num_bias_elems);

std::optional<int64_t> quant_axis;
if (weight_qparams.IsPerChannel()) {
quant_axis = 0;
}
ORT_RETURN_IF_ERROR(qnn::utils::QuantizeData(flt_bias, bias_info.shape, bias_scales, bias_offsets, bias_quant_bytes,
bias_quant_type, /*axis*/ quant_axis));
QnnQuantParamsWrapper bias_qparams;

if (quant_axis.has_value()) {
bias_qparams = QnnQuantParamsWrapper(bias_scales, bias_offsets, /*axis*/ static_cast<int32_t>(*quant_axis),
/*is_int4*/ false);
} else {
bias_qparams = QnnQuantParamsWrapper(bias_scales[0], bias_offsets[0]);
}

const std::string& bias_name = inputs[2].node_arg.Name();
auto bias_tensor_wrapper = QnnTensorWrapper(bias_name, QNN_TENSOR_TYPE_STATIC, bias_quant_type,
std::move(bias_qparams), std::move(bias_info.shape), std::move(bias_quant_bytes));

qnn_model_wrapper.AddTensorWrapper(std::move(bias_tensor_wrapper));
input_names.push_back(bias_name);
} else {
ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[2], logger, input_names));
}
}

#if QNN_API_VERSION_MAJOR == 2 && (QNN_API_VERSION_MINOR >= 16 && QNN_API_VERSION_MINOR <= 18)
@@ -325,6 +397,7 @@ Status ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrapper,
const size_t num_inputs = inputs.size();
OnnxConvType conv_type = {};
ORT_RETURN_IF_ERROR(GetOnnxConvType(node_unit.OpType(), conv_type));
QnnQuantParamsWrapper weight_qparams;

assert(num_inputs >= 2); // Checked by IsOpSupported.

@@ -460,6 +533,26 @@ Status ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrapper,
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Unexpected convolution op type: ", node_unit.OpType().c_str());
}

// Quantize float32 weight to int8_t (per-tensor, symmetric) if necessary.
if (!input_info.quant_param.IsQuantized()) {
ORT_RETURN_IF(input_info.initializer_tensor->data_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT,
"QNN EP only supports unquantized float32 weights");

Qnn_DataType_t quant_type = QNN_DATATYPE_SFIXED_POINT_8; // int8_t quantization of input[1] works with input[0] of all types.
std::array<float, 1> weight_scales = {0.0f};
std::array<int32_t, 1> weight_offsets = {0};
gsl::span<float> flt_weight = ReinterpretAsSpan<float, uint8_t>(unpacked_tensor);
ORT_RETURN_IF_ERROR(qnn::utils::GetDataQuantParams(flt_weight, final_shape, weight_scales, weight_offsets,
quant_type, /*symmetric*/ true, /*axis*/ std::nullopt));

std::vector<uint8_t> quant_weight(flt_weight.size());
ORT_RETURN_IF_ERROR(qnn::utils::QuantizeData(flt_weight, final_shape, weight_scales, weight_offsets,
quant_weight, quant_type));
unpacked_tensor = std::move(quant_weight);
input_info.qnn_data_type = quant_type;
input_info.quant_param = QnnQuantParamsWrapper(weight_scales[0], weight_offsets[0]);
}

// Transpose quantization parameter's axis if this is using per-channel quantization.
if (input_info.quant_param.IsPerChannel()) {
const std::vector<size_t>& perm = conv_type == OnnxConvType::kConv ? nchw2hwcn_perm : cnhw2hwcn_perm;
@@ -507,6 +600,7 @@ Status ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrapper,
}
}

weight_qparams = input_info.quant_param.Copy(); // Store a copy of weight quantization params in case we need to quantize float bias.
Qnn_TensorType_t tensor_type = qnn_model_wrapper.GetTensorType(conv_weight_input_name);
QnnTensorWrapper input_tensorwrapper(conv_weight_input_name, tensor_type, input_info.qnn_data_type,
std::move(input_info.quant_param), std::move(final_shape),
@@ -518,7 +612,56 @@ Status ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrapper,
// Input 2: bias
//
if (num_inputs == 3) {
ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[2], logger, input_names));
TensorInfo bias_info = {};
ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[2], bias_info));

if (!bias_info.quant_param.IsQuantized() && bias_info.is_initializer) {
// Quantize float bias with bias_scale = input0_scale * weight_scale, bias_offset = 0. If weight is per-channel,
// then the bias will be quantized per-channel (axis 0) as well.
ORT_RETURN_IF(bias_info.initializer_tensor->data_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT,
"QNN EP only supports unquantized float32 bias");

TensorInfo input0_info = {};
ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[0], input0_info));

std::vector<float> bias_scales;
std::vector<int32_t> bias_offsets;
ORT_RETURN_IF_ERROR(GetBiasQuantParams(input0_info.quant_param, weight_qparams,
bias_scales, bias_offsets, logger));

size_t num_bias_elems = qnn::utils::ShapeSizeCalc(bias_info.shape, 0, bias_info.shape.size());
std::vector<uint8_t> bias_quant_bytes(num_bias_elems * sizeof(int32_t), 0);

Qnn_DataType_t bias_quant_type = QNN_DATATYPE_SFIXED_POINT_32;
std::vector<uint8_t> flt_bias_bytes;
ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*bias_info.initializer_tensor, flt_bias_bytes));
gsl::span<float> flt_bias = ReinterpretAsSpan<float, uint8_t>(flt_bias_bytes);
assert(flt_bias.size() == num_bias_elems);

std::optional<int64_t> quant_axis;
if (weight_qparams.IsPerChannel()) {
quant_axis = 0;
}
ORT_RETURN_IF_ERROR(qnn::utils::QuantizeData(flt_bias, bias_info.shape, bias_scales, bias_offsets, bias_quant_bytes,
bias_quant_type, /*axis*/ quant_axis));
QnnQuantParamsWrapper bias_qparams;

if (quant_axis.has_value()) {
bias_qparams = QnnQuantParamsWrapper(bias_scales, bias_offsets, /*axis*/ static_cast<int32_t>(*quant_axis),
/*is_int4*/ false);
} else {
bias_qparams = QnnQuantParamsWrapper(bias_scales[0], bias_offsets[0]);
}

const std::string& bias_name = inputs[2].node_arg.Name();
auto bias_tensor_wrapper = QnnTensorWrapper(bias_name, QNN_TENSOR_TYPE_STATIC, bias_quant_type,
std::move(bias_qparams), std::move(bias_info.shape), std::move(bias_quant_bytes));

qnn_model_wrapper.AddTensorWrapper(std::move(bias_tensor_wrapper));
input_names.push_back(bias_name);
} else {
ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[2], logger, input_names));
}
}

return Status::OK();
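The weight and bias hunks above (duplicated for the Conv2D/3D and Conv1D paths) follow one scheme: a float32 weight is quantized to symmetric per-tensor int8 via qnn::utils::GetDataQuantParams and qnn::utils::QuantizeData, and a float32 bias is quantized to int32 with scale input0_scale * weight_scale (per-channel along axis 0 when the weight is per-channel) and zero offset. The exact behavior of those utility functions is not shown in this diff; the sketch below is only a conventional reading of what the comments describe, not the PR's implementation.

// Standalone sketch, assuming the conventional symmetric quantization scheme described above.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Per-tensor symmetric int8: zero-point 0, scale maps the largest |value| onto +/-127.
float ComputeSymmetricInt8Scale(const std::vector<float>& data) {
  float max_abs = 0.0f;
  for (float v : data) max_abs = std::max(max_abs, std::fabs(v));
  return max_abs > 0.0f ? max_abs / 127.0f : 1.0f;
}

std::vector<int8_t> QuantizeSymmetricInt8(const std::vector<float>& data, float scale) {
  std::vector<int8_t> out(data.size());
  for (size_t i = 0; i < data.size(); ++i) {
    out[i] = static_cast<int8_t>(std::clamp(std::round(data[i] / scale), -127.0f, 127.0f));
  }
  return out;
}

// Bias to int32 with bias_scale = input_scale * weight_scale and zero offset (per-tensor case).
std::vector<int32_t> QuantizeBiasInt32(const std::vector<float>& bias,
                                       float input_scale, float weight_scale) {
  const float bias_scale = input_scale * weight_scale;
  std::vector<int32_t> out(bias.size());
  for (size_t i = 0; i < bias.size(); ++i) {
    out[i] = static_cast<int32_t>(std::lround(bias[i] / bias_scale));
  }
  return out;
}

int main() {
  const std::vector<float> weight = {0.5f, -1.27f, 0.02f};
  const float w_scale = ComputeSymmetricInt8Scale(weight);                       // ~0.01
  const std::vector<int8_t> q_weight = QuantizeSymmetricInt8(weight, w_scale);   // ~{50, -127, 2}
  const std::vector<int32_t> q_bias =
      QuantizeBiasInt32({0.25f}, /*input_scale*/ 0.02f, w_scale);                // ~{1250}
  (void)q_weight;
  (void)q_bias;
  return 0;
}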