[GPU] Added functional test and review fixes
Lyamin-Roman committed Dec 10, 2024 · 1 parent c33e8a3 · commit be95170
Showing 3 changed files with 247 additions and 30 deletions.
@@ -22,13 +22,16 @@ LoRAHorizontalFusion::LoRAHorizontalFusion() {
#define check(node) if (!node) return false;

const auto& add = std::dynamic_pointer_cast<ov::op::v1::Add>(node); check(add)
- const auto& matmul2 = std::dynamic_pointer_cast<ov::op::v0::MatMul>(add->get_input_node_shared_ptr(1)); check(matmul2)
+ const auto& matmul2 = std::dynamic_pointer_cast<ov::op::v0::MatMul>(add->get_input_node_shared_ptr(0)) ?
+                       std::dynamic_pointer_cast<ov::op::v0::MatMul>(add->get_input_node_shared_ptr(0)) :
+                       std::dynamic_pointer_cast<ov::op::v0::MatMul>(add->get_input_node_shared_ptr(1)); check(matmul2)
const auto& multiply = std::dynamic_pointer_cast<ov::op::v1::Multiply>(matmul2->get_input_node_shared_ptr(0)); check(multiply)
const auto& variable_b = std::dynamic_pointer_cast<ov::op::util::ReadValueBase>(matmul2->get_input_node_shared_ptr(1)); check(variable_b)
const auto& matmul1 = std::dynamic_pointer_cast<ov::op::v0::MatMul>(multiply->get_input_node_shared_ptr(0)); check(matmul1)
const auto& variable_alpha = std::dynamic_pointer_cast<ov::op::util::ReadValueBase>(multiply->get_input_node_shared_ptr(1)); check(variable_alpha)
const auto& variable_a = std::dynamic_pointer_cast<ov::op::util::ReadValueBase>(matmul1->get_input_node_shared_ptr(1)); check(variable_a)

#undef check
return true;
};
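The pattern check above walks the LoRA branch upward from its final Add. Since ov::op::v1::Add is commutative, the adapter's second MatMul can arrive on either input of that Add, which is what the new ternary handles: try input 0 first, fall back to input 1. A minimal standalone sketch of the same logic, assuming the same headers as the pass (the helper name get_lora_matmul is hypothetical, not part of this commit):

std::shared_ptr<ov::op::v0::MatMul> get_lora_matmul(const std::shared_ptr<ov::Node>& add) {
    // The LoRA MatMul may be wired to either input of the commutative Add:
    // prefer input 0 and fall back to input 1.
    auto matmul = std::dynamic_pointer_cast<ov::op::v0::MatMul>(add->get_input_node_shared_ptr(0));
    return matmul ? matmul : std::dynamic_pointer_cast<ov::op::v0::MatMul>(add->get_input_node_shared_ptr(1));
}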

@@ -64,7 +67,10 @@ LoRAHorizontalFusion::LoRAHorizontalFusion() {

for (const auto& add : split->get_users()) {
add_nodes.emplace_back(add);
- matmul2_nodes.emplace_back(add->get_input_node_shared_ptr(1));
+
+ bool first_input_matmul = std::dynamic_pointer_cast<ov::op::v0::MatMul>(add->get_input_node_shared_ptr(0)) != nullptr;
+ matmul2_nodes.emplace_back(first_input_matmul ? add->get_input_node_shared_ptr(0)
+                                               : add->get_input_node_shared_ptr(1));
}
for (const auto& matmul2 : matmul2_nodes) {
multiply_nodes.emplace_back(matmul2->get_input_node_shared_ptr(0));
@@ -31,15 +31,16 @@ struct ShapeParams {
int weights_group_size;
};

using FullyConnectedHorizontalFusionParams = std::tuple<ShapeParams, // input shapes
ov::element::Type, // weights type
ov::element::Type, // activations type
bool, // transpose on weights
bool, // decompression subtract
bool, // reshape on decompression constants
bool, // per-tensor zero-point
bool, // has bias
- uint64_t // dynamic_quantization_group_size
+ uint64_t, // dynamic_quantization_group_size
+ uint64_t // LoRA rank
>;


@@ -56,6 +57,7 @@ class FullyConnectedHorizontalFusion : public testing::WithParamInterface<FullyC
bool per_tensor_zp;
bool has_bias;
uint64_t dyn_quan_group_size;
+ uint64_t lora_rank;

std::tie(shape_params,
weights_precision,
@@ -65,7 +67,8 @@ class FullyConnectedHorizontalFusion : public testing::WithParamInterface<FullyC
reshape_on_decompression,
per_tensor_zp,
has_bias,
- dyn_quan_group_size) = obj.param;
+ dyn_quan_group_size,
+ lora_rank) = obj.param;

std::ostringstream result;
result << "data_shape=";
@@ -84,9 +87,10 @@ class FullyConnectedHorizontalFusion : public testing::WithParamInterface<FullyC
result << "transpose_weights=" << transpose << "_";
result << "decompression_subtract=" << decompression_sub << "_";
result << "reshape_on_decompression=" << reshape_on_decompression << "_";
result << "per_tensor_zp=" << per_tensor_zp;
result << "has_bias=" << has_bias;
result << "dyn_quan_group_size=" << dyn_quan_group_size;
result << "per_tensor_zp=" << per_tensor_zp << "_";
result << "has_bias=" << has_bias << "_";
result << "dyn_quan_group_size=" << dyn_quan_group_size<< "_";
result << "lora_rank=" << lora_rank;

return result.str();
}
@@ -203,17 +207,36 @@ class FullyConnectedHorizontalFusion : public testing::WithParamInterface<FullyC
}
return last_node;
}
- std::shared_ptr<ov::Model> init_subgraph(const ov::PartialShape& data_shape,
- const std::vector<ov::Shape>& weights_shapes,
+
+ std::shared_ptr<ov::Node> init_lora_subgraph(const ov::ParameterVector& params,
+ std::shared_ptr<ov::op::v0::MatMul> connect_node,
+ size_t idx) {
+ size_t var_offset = 1 + idx * 3;
+ auto read_value_a = std::make_shared<ov::op::v3::ReadValue>(params.at(var_offset), "var_a_" + std::to_string(idx));
+ auto read_value_alpha = std::make_shared<ov::op::v3::ReadValue>(params.at(var_offset + 1), "var_alpha_" + std::to_string(idx));
+ auto read_value_b = std::make_shared<ov::op::v3::ReadValue>(params.at(var_offset + 2), "var_b_" + std::to_string(idx));
+ auto matmul1 = std::make_shared<ov::op::v0::MatMul>(params.at(0), read_value_a, false, true);
+ auto multiply = std::make_shared<ov::op::v1::Multiply>(matmul1, read_value_alpha);
+ auto matmul2 = std::make_shared<ov::op::v0::MatMul>(multiply, read_value_b, false, true);
+ auto add = std::make_shared<ov::op::v1::Add>(connect_node, matmul2);
+ return add;
+ }
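For reference, the branch that init_lora_subgraph builds around each fully connected node is the usual low-rank LoRA update. With activations x = params[0], state inputs A (shape [lora_rank, K]), alpha (shape [1, lora_rank]) and B (shape [N, lora_rank]) read through ReadValue, and both MatMuls created with transpose_b = true, the branch computes (a sketch, in LaTeX):

    y = f(x) + \bigl( (x A^{\top}) \odot \alpha \bigr) B^{\top}

where f(x) is the output of the base fully connected layer (connect_node) and \odot is an elementwise multiply broadcast over rows. The parameter layout assumed by var_offset = 1 + idx * 3 is: params[0] holds the activations, followed by one (A, alpha, B) triple per branch.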

+ std::shared_ptr<ov::Model> init_subgraph(const std::vector<ov::Shape>& weights_shapes,
const int group_size,
const ov::element::Type data_precision,
const ov::element::Type weights_precision,
const bool transpose_weights,
const bool add_subtract,
const bool reshape_on_decompression,
const bool per_tensor_zp,
- const bool has_bias) {
- ov::ParameterVector params{std::make_shared<ov::op::v0::Parameter>(data_precision, data_shape)};
+ const bool has_bias,
+ const uint64_t lora_rank) {
+ ov::ParameterVector params;
+ for (const auto& shape : inputDynamicShapes) {
+ params.push_back(std::make_shared<ov::op::v0::Parameter>(data_precision, shape));
+ }

const auto weight1 = init_compressed_weights_subgraph(weights_shapes[0],
group_size,
data_precision,
@@ -246,10 +269,20 @@ class FullyConnectedHorizontalFusion : public testing::WithParamInterface<FullyC
matmul2->set_friendly_name("fully_connected_2");
auto matmul3 = std::make_shared<ov::op::v0::MatMul>(params[0], weight3, false, transpose_weights);
matmul3->set_friendly_name("fully_connected_3");

+ std::shared_ptr<ov::Node> matmul1_result = matmul1;
+ std::shared_ptr<ov::Node> matmul2_result = matmul2;
+ std::shared_ptr<ov::Node> matmul3_result = matmul3;
+ if (lora_rank != 0) {
+ matmul1_result = init_lora_subgraph(params, matmul1, 0);
+ matmul2_result = init_lora_subgraph(params, matmul2, 1);
+ matmul3_result = init_lora_subgraph(params, matmul3, 2);
+ }

if (!has_bias) {
- auto matmul4 = std::make_shared<ov::op::v0::MatMul>(matmul1, matmul2, true, false);
+ auto matmul4 = std::make_shared<ov::op::v0::MatMul>(matmul1_result, matmul2_result, true, false);
matmul4->set_friendly_name("gemm1");
- auto matmul5 = std::make_shared<ov::op::v0::MatMul>(matmul4, matmul3, true, true);
+ auto matmul5 = std::make_shared<ov::op::v0::MatMul>(matmul4, matmul3_result, true, true);
matmul5->set_friendly_name("gemm2");
matmul5->set_friendly_name("gemm2");
return std::make_shared<ov::Model>(ov::NodeVector{matmul5}, params, "FCHorizontalFusion");
} else {
@@ -261,17 +294,17 @@ class FullyConnectedHorizontalFusion : public testing::WithParamInterface<FullyC
auto bias1_shape = ov::Shape{1, weights_shapes[0].back()};
auto bias1_tensor = ov::test::utils::create_and_fill_tensor(data_precision, bias1_shape, in_data);
auto bias1_const = std::make_shared<ov::op::v0::Constant>(bias1_tensor);
- auto bias_add1 = std::make_shared<ov::op::v1::Add>(matmul1, bias1_const);
+ auto bias_add1 = std::make_shared<ov::op::v1::Add>(matmul1_result, bias1_const);
bias_add1->set_friendly_name("add1");
auto bias2_shape = ov::Shape{1, weights_shapes[1].back()};
auto bias2_tensor = ov::test::utils::create_and_fill_tensor(data_precision, bias2_shape, in_data);
auto bias2_const = std::make_shared<ov::op::v0::Constant>(bias2_tensor);
- auto bias_add2 = std::make_shared<ov::op::v1::Add>(matmul2, bias2_const);
+ auto bias_add2 = std::make_shared<ov::op::v1::Add>(matmul2_result, bias2_const);
bias_add2->set_friendly_name("add2");
auto bias3_shape = ov::Shape{1, weights_shapes[2].back()};
auto bias3_tensor = ov::test::utils::create_and_fill_tensor(data_precision, bias3_shape, in_data);
auto bias3_const = std::make_shared<ov::op::v0::Constant>(bias3_tensor);
- auto bias_add3 = std::make_shared<ov::op::v1::Add>(matmul3, bias3_const);
+ auto bias_add3 = std::make_shared<ov::op::v1::Add>(matmul3_result, bias3_const);
bias_add3->set_friendly_name("add3");

auto matmul4 = std::make_shared<ov::op::v0::MatMul>(bias_add1, bias_add2, true, false);
@@ -294,6 +327,7 @@ class FullyConnectedHorizontalFusion : public testing::WithParamInterface<FullyC
bool per_tensor_zp;
bool has_bias;
uint64_t dyn_quan_group_size;
+ uint64_t lora_rank;

std::tie(shape_params,
weights_precision,
@@ -303,21 +337,35 @@ class FullyConnectedHorizontalFusion : public testing::WithParamInterface<FullyC
reshape_on_decompression,
per_tensor_zp,
has_bias,
- dyn_quan_group_size) = GetParam();
+ dyn_quan_group_size,
+ lora_rank) = GetParam();

+ std::vector<InputShape> input_shapes = {shape_params.data_shape};
+
+ if (lora_rank != 0) {
+ for (size_t i = 0; i < shape_params.weights_shapes.size(); ++i) {
+ // variable_A
+ input_shapes.push_back({{-1, *shape_params.data_shape.first.rbegin()}, {{lora_rank, shape_params.data_shape.second.front().back()}}});
+ // variable_alpha
+ input_shapes.push_back({{1, -1}, {{1, lora_rank}}});
+ // variable_B
+ input_shapes.push_back({{ov::Dimension(shape_params.weights_shapes[i].back()), -1}, {{shape_params.weights_shapes[i].back(), lora_rank}}});
+ }
+ }

- init_input_shapes({shape_params.data_shape, {{}, shape_params.weights_shapes}});
+ init_input_shapes(input_shapes);
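To make the extra inputs concrete: for the static data shape {1, 4, 16} used by the suite below (so K = 16) and lora_rank = 16, each of the three LoRA branches contributes three additional model inputs. A hypothetical walk-through, with N standing for weights_shapes[i].back():

// Per LoRA branch i, assuming K = 16 and lora_rank = 16:
// variable_A:     dynamic {-1, 16}, static {16, 16}  -> consumed as MatMul(x, A, false, true)
// variable_alpha: dynamic {1, -1},  static {1, 16}   -> broadcast multiply over rows
// variable_B:     dynamic {N, -1},  static {N, 16}   -> consumed as MatMul(.., B, false, true)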

inType = outType = activations_precision;
- function = init_subgraph(inputDynamicShapes[0],
- shape_params.weights_shapes,
+ function = init_subgraph(shape_params.weights_shapes,
shape_params.weights_group_size,
activations_precision,
weights_precision,
transpose_weights,
decompression_sub,
reshape_on_decompression,
per_tensor_zp,
- has_bias);
+ has_bias,
+ lora_rank);

if (activations_precision == ov::element::f16) {
abs_threshold = 1.0f;
@@ -333,9 +381,15 @@ class FullyConnectedHorizontalFusion : public testing::WithParamInterface<FullyC
for (size_t i = 0; i < model_inputs.size(); ++i) {
const auto& model_input = model_inputs[i];
ov::test::utils::InputGenerateData in_data;
- in_data.start_from = -1;
- in_data.range = 2;
- in_data.resolution = 10000;
+ if (i == 0) {
+ in_data.start_from = -1;
+ in_data.range = 2;
+ in_data.resolution = 10000;
+ } else {
+ in_data.start_from = -0.5;
+ in_data.range = 1;
+ in_data.resolution = 30000;
+ }
ov::Tensor tensor = ov::test::utils::create_and_fill_tensor(model_input.get_element_type(),
target_input_static_shapes[i],
in_data);
@@ -346,11 +400,17 @@ class FullyConnectedHorizontalFusion : public testing::WithParamInterface<FullyC
void check_results() {
const auto& test_param = GetParam();
ov::element::Type weights_precision = std::get<1>(test_param);
+ uint64_t lora_rank = std::get<9>(test_param);
+ bool is_lora_fused = false;
for (const auto& n : compiledModel.get_runtime_model()->get_ordered_ops()) {
if (n->get_friendly_name() == "Compressed_weights") {
ASSERT_EQ(n->get_output_element_type(0), weights_precision);
}
+ if (n->get_friendly_name().find("fused_3_MatMuls") != std::string::npos) {
+ is_lora_fused = true;
+ }
}
+ OPENVINO_ASSERT(lora_rank == 0 || is_lora_fused, "[GPU] LoRA fusion failed");
}
};

@@ -376,6 +436,8 @@ const std::vector<ShapeParams> input_shapes = {
{{{-1, -1, -1}, {{1, 4, 16}}}, weights4},
};

+ const std::vector<uint64_t> lora_rank = {0, 16}; // 0 means w/o LoRA

INSTANTIATE_TEST_SUITE_P(smoke_FCHorizontalFusion_no_bias,
FullyConnectedHorizontalFusion,
::testing::Combine(::testing::ValuesIn(input_shapes),
@@ -386,7 +448,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_FCHorizontalFusion_no_bias,
::testing::Values(true),
::testing::ValuesIn(per_tensor_zp),
::testing::Values(false),
- ::testing::Values(0) /* no dyn_quan */),
+ ::testing::Values(0) /* no dyn_quan */,
+ ::testing::ValuesIn(lora_rank)),
FullyConnectedHorizontalFusion::get_test_case_name);

INSTANTIATE_TEST_SUITE_P(smoke_FCHorizontalFusion_with_bias,
@@ -399,7 +462,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_FCHorizontalFusion_with_bias,
::testing::Values(true),
::testing::Values(true),
::testing::Values(true),
- ::testing::Values(0) /* no dyn_quan */),
+ ::testing::Values(0) /* no dyn_quan */,
+ ::testing::ValuesIn(lora_rank)),
FullyConnectedHorizontalFusion::get_test_case_name);

std::vector<ov::Shape> dyn_quan_weights = {{1, 128, 32}, {1, 128, 4}, {1, 128, 32}};
@@ -417,7 +481,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_FCHorizontalFusion_no_bias_dyn_quan,
::testing::Values(true),
::testing::Values(true),
::testing::Values(false),
- ::testing::Values(UINT64_MAX) /* dyn_quan */),
+ ::testing::Values(UINT64_MAX) /* dyn_quan */,
+ ::testing::ValuesIn(lora_rank)),
FullyConnectedHorizontalFusion::get_test_case_name);

