-
Notifications
You must be signed in to change notification settings - Fork 2.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[GPU] Skip redundant gather in stateful model (#21681)
* Skip redundant gather in stateful model * Fix a memory reuse issue for nodes skipped at runtime: if a node is not marked as can_be_optimized at build time, the memory dependency is not properly applied, which can cause incorrect memory reuse.
- Loading branch information
Showing
9 changed files
with
267 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
40 changes: 40 additions & 0 deletions
40
src/plugins/intel_gpu/src/graph/graph_optimizer/dynamic_shape_gather_opts.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
// Copyright (C) 2023 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// | ||
|
||
#include "pass_manager.h" | ||
#include "gather_inst.h" | ||
#include "program_helpers.h" | ||
|
||
using namespace cldnn; | ||
|
||
void dynamic_shape_gather_opts::run(program& p) {
    auto itr = p.get_processing_order().begin();
    // Mark gather nodes that might be skipped at runtime as can_be_optimized.
    // If not set, memory dependencies are not applied to nodes skipped at
    // runtime, which can lead to incorrect memory reuse.
    while (itr != p.get_processing_order().end()) {
        auto& node = *itr++;
        if (!node->is_type<gather>())
            continue;
        auto& gather_node = node->as<gather>();
        // Check pattern: a gather can only be a runtime no-op when it has no
        // fused primitives, does not change the data type, and its indices come
        // from a runtime input (constant indices are handled by static passes).
        auto impl_params = gather_node.get_kernel_impl_params();
        if (gather_node.has_fused_primitives() ||
            (impl_params->get_input_layout(0).data_type != impl_params->get_output_layout().data_type) ||
            gather_node.get_dependency(1).is_constant() || gather_node.get_dependency(1).is_type<data>())
            continue;

        // Only scalar / rank-1 index tensors can represent an identity selection.
        auto idx_rank = impl_params->get_input_layout(1).get_partial_shape().size();
        if (idx_rank > 1) {
            continue;
        }

        auto axis = impl_params->typed_desc<gather>()->axis;
        // Candidate when either dimension is dynamic (decided at runtime) or the
        // gathered axis length already matches the number of indices.
        if (impl_params->get_input_layout(0).get_partial_shape()[axis] == -1
            || impl_params->get_input_layout(1).get_partial_shape()[0] == -1
            || impl_params->get_input_layout(0).get_partial_shape()[axis] == impl_params->get_input_layout(1).get_partial_shape()[0]) {
            // May be skipped
            gather_node.can_be_optimized(true);
            GPU_DEBUG_TRACE_DETAIL << "[dynamic_shape_gather_opts] : " << gather_node.id() << " can_be_optimized" << std::endl;
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
129 changes: 129 additions & 0 deletions
129
src/plugins/intel_gpu/tests/unit/dynamic_execution/stateful_model.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
// Copyright (C) 2023 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// | ||
|
||
#include "test_utils.h" | ||
|
||
#include <intel_gpu/primitives/input_layout.hpp> | ||
#include <intel_gpu/primitives/reorder.hpp> | ||
#include <intel_gpu/primitives/data.hpp> | ||
#include <intel_gpu/primitives/gather.hpp> | ||
#include <intel_gpu/primitives/concatenation.hpp> | ||
|
||
#include "program_wrapper.h" | ||
|
||
#include <cmath> | ||
#include <algorithm> | ||
|
||
using namespace cldnn; | ||
using namespace ::tests; | ||
|
||
namespace stateful_model_tests { | ||
TEST(stateful_model, skip_gather_at_runtime) {
    auto& engine = get_test_engine();

    // Fully dynamic shapes: the gather's skip decision must be deferred to runtime.
    auto kv_layout       = layout{ov::PartialShape{-1, 32, -1, 128}, data_types::f32, format::bfyx};
    auto present_layout  = layout{ov::PartialShape{-1, 32, -1, 128}, data_types::f32, format::bfyx};
    auto beam_idx_layout = layout{ov::PartialShape{-1}, data_types::i32, format::bfyx};

    topology topology(input_layout("kv_cache", kv_layout),
                      input_layout("beam_idx", beam_idx_layout),
                      input_layout("present", present_layout),
                      gather("gather",
                             input_info("kv_cache"),
                             input_info("beam_idx"),
                             0,                                     // axis
                             kv_layout.get_partial_shape().size(),  // input rank
                             ov::Shape{},                           // output shape
                             0,                                     // batch_dim
                             true),                                 // support_neg_ind
                      concatenation("concat", {input_info("gather"), input_info("present")}, 0),
                      reorder("reorder", input_info("concat"), format::bfyx, data_types::f32)); /*output padding*/

    ExecutionConfig config = get_test_default_config(engine);
    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));

    network network(engine, topology, config);
    auto gather_inst = network.get_primitive("gather");
    // The optimization pass should have marked the gather at build time.
    ASSERT_EQ(gather_inst->get_node().can_be_optimized(), true);
    ASSERT_EQ(gather_inst->can_be_optimized(), true);

    const int kv_len = 24;
    const int batch = 1;
    auto kv_cache_mem = engine.allocate_memory({{kv_len, 32, batch, 128}, data_types::f32, format::bfyx});
    auto present_mem  = engine.allocate_memory({{1, 32, batch, 128}, data_types::f32, format::bfyx});
    auto beam_idx_mem = engine.allocate_memory({{kv_len}, data_types::i32, format::bfyx});

    // Identity beam indices (0..kv_len-1) make the gather a no-op.
    std::vector<float> kv_input_data(kv_cache_mem->get_layout().count());
    std::vector<float> present_input_data(present_mem->get_layout().count());
    std::vector<int32_t> beam_idx_input_data(beam_idx_mem->get_layout().count());
    std::iota(kv_input_data.begin(), kv_input_data.end(), 0.f);
    std::iota(present_input_data.begin(), present_input_data.end(), 0.f);
    std::iota(beam_idx_input_data.begin(), beam_idx_input_data.end(), 0);
    set_values(kv_cache_mem, kv_input_data);
    set_values(present_mem, present_input_data);
    set_values(beam_idx_mem, beam_idx_input_data);

    network.set_input_data("kv_cache", kv_cache_mem);
    network.set_input_data("present", present_mem);
    network.set_input_data("beam_idx", beam_idx_mem);
    network.execute();
    ASSERT_EQ(gather_inst->can_be_optimized(), true);

    // A skipped gather forwards its input, so the output must equal the kv input.
    auto gather_output_mem = network.get_output_memory("gather");
    cldnn::mem_lock<float, mem_lock_type::read> gather_output_ptr(gather_output_mem, get_test_stream());
    for (size_t idx = 0; idx < gather_output_mem->get_layout().count(); ++idx) {
        ASSERT_EQ(gather_output_ptr[idx], kv_input_data[idx]);
    }
}
|
||
TEST(stateful_model, not_skip_gather_at_runtime) {
    auto& engine = get_test_engine();

    // Fully dynamic shapes: the gather's skip decision must be deferred to runtime.
    auto kv_layout       = layout{ov::PartialShape{-1, 32, -1, 128}, data_types::f32, format::bfyx};
    auto present_layout  = layout{ov::PartialShape{-1, 32, -1, 128}, data_types::f32, format::bfyx};
    auto beam_idx_layout = layout{ov::PartialShape{-1}, data_types::i32, format::bfyx};

    topology topology(input_layout("kv_cache", kv_layout),
                      input_layout("beam_idx", beam_idx_layout),
                      input_layout("present", present_layout),
                      gather("gather",
                             input_info("kv_cache"),
                             input_info("beam_idx"),
                             0,                                     // axis
                             kv_layout.get_partial_shape().size(),  // input rank
                             ov::Shape{},                           // output shape
                             0,                                     // batch_dim
                             true),                                 // support_neg_ind
                      concatenation("concat", {input_info("gather"), input_info("present")}, 0),
                      reorder("reorder", input_info("concat"), format::bfyx, data_types::f32)); /*output padding*/

    ExecutionConfig config = get_test_default_config(engine);
    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));

    network network(engine, topology, config);
    auto gather_inst = network.get_primitive("gather");
    // The optimization pass should have marked the gather at build time.
    ASSERT_EQ(gather_inst->get_node().can_be_optimized(), true);
    ASSERT_EQ(gather_inst->can_be_optimized(), true);

    const int kv_len = 24;
    const int batch = 1;
    auto kv_cache_mem = engine.allocate_memory({{kv_len, 32, batch, 128}, data_types::f32, format::bfyx});
    auto present_mem  = engine.allocate_memory({{1, 32, batch, 128}, data_types::f32, format::bfyx});
    auto beam_idx_mem = engine.allocate_memory({{kv_len}, data_types::i32, format::bfyx});

    std::vector<float> kv_input_data(kv_cache_mem->get_layout().count());
    std::vector<float> present_input_data(present_mem->get_layout().count());
    std::vector<int32_t> beam_idx_input_data(beam_idx_mem->get_layout().count());
    std::iota(kv_input_data.begin(), kv_input_data.end(), 0.f);
    std::iota(present_input_data.begin(), present_input_data.end(), 0.f);
    std::iota(beam_idx_input_data.begin(), beam_idx_input_data.end(), 0);
    // Break the identity mapping so the gather actually reorders rows
    // and therefore must NOT be skipped at runtime.
    std::swap(beam_idx_input_data[0], beam_idx_input_data[1]);
    set_values(kv_cache_mem, kv_input_data);
    set_values(present_mem, present_input_data);
    set_values(beam_idx_mem, beam_idx_input_data);

    network.set_input_data("kv_cache", kv_cache_mem);
    network.set_input_data("present", present_mem);
    network.set_input_data("beam_idx", beam_idx_mem);
    network.execute();
    ASSERT_EQ(gather_inst->can_be_optimized(), false);
}
} // stateful_model_tests |