Skip to content

Commit

Permalink
[GPU] Optimization for gemm & fc in iGPU. (openvinotoolkit#19780)
Browse files Browse the repository at this point in the history
* Optimization for gemm & fc in iGPU.
FC: fake alignment to 16 is better on iGPU.
Gemm: permute + gemm_tiled_opt is better than transposed_input + gemm_ref kernel for shapes unaligned to 16. Note that this is a temporary optimization and will be removed once the final solution (i.e., support for unaligned transposed input shapes in the gemm_tiled_opt kernel) is available.

* Fix unittest

* Fix for model_cache

* Fix unittest
  • Loading branch information
yeonbok authored and alvoron committed Nov 6, 2023
1 parent 501db33 commit 472a4ca
Show file tree
Hide file tree
Showing 8 changed files with 79 additions and 32 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ struct kernel_impl_params {

bool has_runtime_layouts = false;
const program *prog;
cldnn::device_type dev_type;
stream::ptr strm;
std::shared_ptr<const primitive> desc;
size_t unique_id;
Expand Down Expand Up @@ -63,9 +64,11 @@ struct kernel_impl_params {
std::vector<size_t> output_size;
std::vector<size_t> img_size;

kernel_impl_params() : prog(nullptr), strm(nullptr), desc(nullptr), unique_id(0) {}
kernel_impl_params() : prog(nullptr), dev_type(cldnn::device_type::integrated_gpu), strm(nullptr), desc(nullptr), unique_id(0) {
}

kernel_impl_params(program& _prog,
cldnn::device_type _dev_type,
stream::ptr _strm,
std::shared_ptr<const primitive> _desc,
size_t _uid,
Expand All @@ -74,6 +77,7 @@ struct kernel_impl_params {
const std::vector<cldnn::fused_primitive_desc>& _fused_descs)
: has_runtime_layouts(true)
, prog(&_prog)
, dev_type(_dev_type)
, strm(std::move(_strm))
, desc(std::move(_desc))
, unique_id(_uid)
Expand Down Expand Up @@ -135,7 +139,7 @@ struct kernel_impl_params {
return std::static_pointer_cast<const PType>(desc)->type == PType::type_id();
}

virtual primitive_type_id type() const { return desc->type; }
virtual primitive_type_id type() const { return desc->type; }

void save(BinaryOutputBuffer& ob) const;
void load(BinaryInputBuffer& ib);
Expand Down
5 changes: 3 additions & 2 deletions src/plugins/intel_gpu/src/graph/fully_connected.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,8 +187,9 @@ kernel_impl_params fully_connected_inst::get_fake_aligned_params(kernel_impl_par
return std::move(orig_impl_param);
}

input_shape[input_row_idx] = align_to(input_shape[input_row_idx], 8);
output_shape[output_row_idx] = align_to(output_shape[output_row_idx], 8);
size_t fake_align_base = (orig_impl_param.dev_type == cldnn::device_type::integrated_gpu) ? 16 : 8;
input_shape[input_row_idx] = align_to(input_shape[input_row_idx], fake_align_base);
output_shape[output_row_idx] = align_to(output_shape[output_row_idx], fake_align_base);

updated_param.input_layouts[0] = layout(ov::PartialShape(input_shape),
orig_input_layout.data_type,
Expand Down
3 changes: 2 additions & 1 deletion src/plugins/intel_gpu/src/graph/include/program_node.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,8 @@ struct program_node {
}

virtual std::unique_ptr<kernel_impl_params> get_kernel_impl_params(const std::vector<layout>& in_layouts, const std::vector<layout>& out_layouts) const {
auto params = std::unique_ptr<kernel_impl_params>(new kernel_impl_params(get_program(), get_program().get_stream_ptr(), get_primitive(),
auto params = std::unique_ptr<kernel_impl_params>(new kernel_impl_params(get_program(), get_program().get_engine().get_device_info().dev_type,
get_program().get_stream_ptr(), get_primitive(),
get_unique_id(), in_layouts, out_layouts, get_fused_primitives()));
params->memory_deps = get_const_memory_deps();
params->_can_be_optimized = this->optimized;
Expand Down
5 changes: 5 additions & 0 deletions src/plugins/intel_gpu/src/graph/kernel_impl_params.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "intel_gpu/graph/serialization/layout_serializer.hpp"
#include "intel_gpu/graph/serialization/string_serializer.hpp"
#include "intel_gpu/graph/serialization/vector_serializer.hpp"
#include "intel_gpu/runtime/device_info.hpp"

#include <string>
#include <vector>
Expand Down Expand Up @@ -71,6 +72,7 @@ bool kernel_impl_params::operator==(const kernel_impl_params& rhs) const {

void kernel_impl_params::save(BinaryOutputBuffer& ob) const {
ob << desc;
ob << static_cast<uint64_t>(dev_type);
ob << has_runtime_layouts;
ob << unique_id;
ob << input_layouts;
Expand Down Expand Up @@ -135,6 +137,9 @@ void kernel_impl_params::save(BinaryOutputBuffer& ob) const {
void kernel_impl_params::load(BinaryInputBuffer& ib) {
prog = nullptr;
ib >> desc;
size_t dev_type_id = 0;
ib >> dev_type_id;
dev_type = static_cast<cldnn::device_type>(dev_type_id);
ib >> has_runtime_layouts;
ib >> unique_id;
ib >> input_layouts;
Expand Down
14 changes: 12 additions & 2 deletions src/plugins/intel_gpu/src/plugin/ops/matmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,18 @@ static void CreateMatMulOp(ProgramBuilder& p, const std::shared_ptr<ov::op::v0::
return false;

// dynamic shapes and 1D tensors are not transposed
if (shapes[0].is_dynamic() || shapes[1].is_dynamic() ||
shapes[0].size() < 2 || shapes[1].size() < 2)
if (shapes[0].is_dynamic() || shapes[1].is_dynamic()) {
// Currently, cldnn optimized gemm kernel (gemm_tiled_opt) does not support transposed input with shape unaligned for 16.
// If the shape is not aligned for 16, gemm_ref_kernel will be selected,
// but the perf is worse than permute + gemm_tiled_opt.
// So we'll use this permute + gemm_tiled_opt strategy as a temporal solution,
// until we have an essential solution, i.e., fixing the gemm_tiled_opt kernel to support unaligned shape.
if (p.get_engine().get_device_info().dev_type == cldnn::device_type::integrated_gpu)
return true;
else
return false;
}
if (shapes[0].size() < 2 || shapes[1].size() < 2)
return false;

// don't transpose inputs if they're aligned to 16
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,11 @@ struct fc_fake_align_params {
layout input_layout;
layout weight_layout;
data_types data_type;
layout expected_input_layout;
layout expected_output_layout;
layout expected_input_layout_igpu;
layout expected_output_layout_igpu;
layout expected_input_layout_dgpu;
layout expected_output_layout_dgpu;

};

class fully_connected_fake_align_test : public testing::TestWithParam<fc_fake_align_params> {};
Expand Down Expand Up @@ -54,8 +57,13 @@ TEST_P(fully_connected_fake_align_test, fake_alignment) {
EXPECT_THROW(fully_connected_inst::get_fake_aligned_params(*impl_param), std::exception);
} else {
auto updated_param = fully_connected_inst::get_fake_aligned_params(*impl_param);
ASSERT_EQ(updated_param.get_input_layout(), p.expected_input_layout);
ASSERT_EQ(updated_param.get_output_layout(), p.expected_output_layout);
if (!engine.get_device_info().supports_immad) {
ASSERT_EQ(updated_param.get_input_layout(), p.expected_input_layout_igpu);
ASSERT_EQ(updated_param.get_output_layout(), p.expected_output_layout_igpu);
} else {
ASSERT_EQ(updated_param.get_input_layout(), p.expected_input_layout_dgpu);
ASSERT_EQ(updated_param.get_output_layout(), p.expected_output_layout_dgpu);
}
}
}

Expand All @@ -65,29 +73,38 @@ INSTANTIATE_TEST_SUITE_P(smoke, fully_connected_fake_align_test,
layout{ov::PartialShape{0, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // input_layout
layout{ov::PartialShape{1000, 1024}, data_types::i8, format::bfyx}, // weight layout
data_types::f16,
layout{ov::PartialShape{0, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // fake_aligned input layout
layout{ov::PartialShape{0, 1000}, data_types::f16, format::bfyx} // fake_aligned output layout
layout{ov::PartialShape{0, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // fake_aligned input layout_igpu
layout{ov::PartialShape{0, 1000}, data_types::f16, format::bfyx}, // fake_aligned output layout_igpu
layout{ov::PartialShape{0, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // fake_aligned input layout_dgpu
layout{ov::PartialShape{0, 1000}, data_types::f16, format::bfyx} // fake_aligned output layout_dgpu
},
{
layout{ov::PartialShape{11, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // input_layout
layout{ov::PartialShape{1000, 1024}, data_types::i8, format::bfyx}, // weight layout
data_types::f16,
layout{ov::PartialShape{16, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // fake_aligned input layout
layout{ov::PartialShape{16, 1000}, data_types::f16, format::bfyx} // fake_aligned output layout
layout{ov::PartialShape{16, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // fake_aligned input layout_igpu
layout{ov::PartialShape{16, 1000}, data_types::f16, format::bfyx}, // fake_aligned output layout_igpu
layout{ov::PartialShape{16, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // fake_aligned input layout_dgpu
layout{ov::PartialShape{16, 1000}, data_types::f16, format::bfyx} // fake_aligned output layout_dgpu

},
{
layout{ov::PartialShape{133, 511}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // input_layout
layout{ov::PartialShape{800, 511}, data_types::i8, format::bfyx}, // weight layout
data_types::f16,
layout{ov::PartialShape{136, 511}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // fake_aligned input layout
layout{ov::PartialShape{136, 800}, data_types::f16, format::bfyx} // fake_aligned output layout
layout{ov::PartialShape{144, 511}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // fake_aligned input layout_igpu
layout{ov::PartialShape{144, 800}, data_types::f16, format::bfyx}, // fake_aligned output layout_igpu
layout{ov::PartialShape{136, 511}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // fake_aligned input layout_dgpu
layout{ov::PartialShape{136, 800}, data_types::f16, format::bfyx} // fake_aligned output layout_dgpu
},
{
layout{ov::PartialShape::dynamic(2), data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // input_layout
layout{ov::PartialShape{1000, 1024}, data_types::i8, format::bfyx}, // weight layout
data_types::f16,
layout{ov::PartialShape{-1, -1}, data_types::i8, format::bfyx}, // fake_aligned input layout // dummy
layout{ov::PartialShape{-1, -1}, data_types::f16, format::bfyx} // fake_aligned output layout // dummy
layout{ov::PartialShape{-1, -1}, data_types::i8, format::bfyx}, // fake_aligned input layout_igpu // dummy
layout{ov::PartialShape{-1, -1}, data_types::f16, format::bfyx}, // fake_aligned output layout_igpu // dummy
layout{ov::PartialShape{-1, -1}, data_types::i8, format::bfyx}, // fake_aligned input layout_dgpu // dummy
layout{ov::PartialShape{-1, -1}, data_types::f16, format::bfyx} // fake_aligned output layout_dgpu // dummy
},

}));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1874,6 +1874,7 @@ TEST(fully_connected_onednn, impl_replacement_with_cldnn) {

const int32_t input_f = 3, input_b = 1, weight_b = 4;

auto fake_alignment_size = engine.get_device_info().supports_immad ? 8 : 16;
auto input_dyn_layout = layout{ ov::PartialShape{ ov::Dimension(1, 10), input_f }, data_types::f32,format::bfyx };
auto input_data = engine.allocate_memory(layout{ ov::PartialShape{ input_b, input_f }, data_types::f32,format::bfyx });
auto weights_data = engine.allocate_memory({ ov::PartialShape{ weight_b, input_f }, data_types::f32,format::bfyx });
Expand Down Expand Up @@ -1909,7 +1910,7 @@ TEST(fully_connected_onednn, impl_replacement_with_cldnn) {
auto output_prim_mem = outputs.begin()->second.get_memory();

auto out_l = network.get_output_layout(outputs.begin()->first);
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, 8)); // fake_alignment
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, fake_alignment_size)); // fake_alignment
ASSERT_EQ(out_l.batch(), input_b);
ASSERT_EQ(out_l.feature(), weight_b);
ASSERT_EQ(out_l.spatial(0), 1);
Expand Down Expand Up @@ -2045,6 +2046,7 @@ TEST(fully_connected_gpu, dynamic) {

const int32_t input_f = 3, input_b = 1, weight_b = 4;

auto fake_alignment_size = engine.get_device_info().supports_immad ? 8 : 16;
auto input_dyn_layout = layout{ ov::PartialShape{ ov::Dimension(1, 10), input_f }, data_types::f32,format::bfyx };
auto input_data = engine.allocate_memory(layout{ ov::PartialShape{ input_b, input_f }, data_types::f32,format::bfyx });
auto weights_data = engine.allocate_memory({ ov::PartialShape{ weight_b, input_f }, data_types::f32,format::bfyx });
Expand All @@ -2071,7 +2073,7 @@ TEST(fully_connected_gpu, dynamic) {
auto output_prim_mem = outputs.begin()->second.get_memory();

auto out_l = network.get_output_layout(outputs.begin()->first);
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, 8)); // fake_alignment
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, fake_alignment_size)); // fake_alignment
ASSERT_EQ(out_l.batch(), input_b);
ASSERT_EQ(out_l.feature(), weight_b);
ASSERT_EQ(out_l.spatial(0), 1);
Expand Down Expand Up @@ -2199,7 +2201,7 @@ TEST(fully_connected_gpu, dynamic_multi_inference_same_shape) {
auto input_data1 = engine.allocate_memory(input_actual_layout);
auto input_data2 = engine.allocate_memory(input_actual_layout);
auto weights_data = engine.allocate_memory({ ov::PartialShape{ weight_b, input_f }, data_types::f32,format::bfyx });

auto fake_alignment_size = engine.get_device_info().supports_immad ? 8 : 16;
set_values(input_data1, { 0.5f, -2.0f, -0.5f });
set_values(input_data2, { -0.5f, 2.0f, 0.5f });
set_values(weights_data, { 1.5f, 1.0f, 0.5f,
Expand Down Expand Up @@ -2228,7 +2230,7 @@ TEST(fully_connected_gpu, dynamic_multi_inference_same_shape) {
auto output_prim_mem = outputs.begin()->second.get_memory();

auto out_l = network.get_output_layout(outputs.begin()->first);
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, 8)); // fake_alignment
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, fake_alignment_size)); // fake_alignment
ASSERT_EQ(out_l.batch(), input_b);
ASSERT_EQ(out_l.feature(), weight_b);
ASSERT_EQ(out_l.spatial(0), 1);
Expand All @@ -2252,7 +2254,7 @@ TEST(fully_connected_gpu, dynamic_multi_inference_same_shape) {
auto output_prim_mem = outputs.begin()->second.get_memory();

auto out_l = network.get_output_layout(outputs.begin()->first);
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, 8)); // fake_alignment
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, fake_alignment_size)); // fake_alignment
ASSERT_EQ(out_l.batch(), input_b);
ASSERT_EQ(out_l.feature(), weight_b);
ASSERT_EQ(out_l.spatial(0), 1);
Expand All @@ -2272,6 +2274,7 @@ TEST(fully_connected_gpu, dynamic_multi_inference_different_shape) {

const int32_t input_f = 3, weight_b = 4;

auto fake_alignment_size = engine.get_device_info().supports_immad ? 8 : 16;
auto input_dyn_layout = layout{ ov::PartialShape{ ov::Dimension(1, 10), input_f }, data_types::f32,format::bfyx };
auto input_actual_layout1 = layout{ ov::PartialShape{ 2, input_f }, data_types::f32,format::bfyx};
auto input_actual_layout2 = layout{ ov::PartialShape{ 1, input_f }, data_types::f32,format::bfyx};
Expand Down Expand Up @@ -2311,7 +2314,7 @@ TEST(fully_connected_gpu, dynamic_multi_inference_different_shape) {
auto output_prim_mem = outputs.begin()->second.get_memory();

auto out_l = network.get_output_layout(outputs.begin()->first);
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(2, 8)); // fake_alignment
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(2, fake_alignment_size)); // fake_alignment
ASSERT_EQ(out_l.batch(), 2);
ASSERT_EQ(out_l.feature(), weight_b);
ASSERT_EQ(out_l.spatial(0), 1);
Expand Down Expand Up @@ -2340,7 +2343,7 @@ TEST(fully_connected_gpu, dynamic_multi_inference_different_shape) {
auto output_prim_mem = outputs.begin()->second.get_memory();

auto out_l = network.get_output_layout(outputs.begin()->first);
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(1, 8)); // fake_alignment
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(1, fake_alignment_size)); // fake_alignment
ASSERT_EQ(out_l.batch(), 1);
ASSERT_EQ(out_l.feature(), weight_b);
ASSERT_EQ(out_l.spatial(0), 1);
Expand All @@ -2360,6 +2363,7 @@ TEST(fully_connected_gpu, dynamic_multi_inference_multiple_shapes) {

const int32_t input_f = 3, weight_b = 4;

auto fake_alignment_size = engine.get_device_info().supports_immad ? 8 : 16;
auto input_dyn_layout = layout{ ov::PartialShape{ ov::Dimension(1, 10), input_f }, data_types::f32,format::bfyx };
auto input_actual_layout1 = layout{ ov::PartialShape{ 2, input_f }, data_types::f32,format::bfyx};
auto input_actual_layout2 = layout{ ov::PartialShape{ 1, input_f }, data_types::f32,format::bfyx};
Expand Down Expand Up @@ -2398,7 +2402,7 @@ TEST(fully_connected_gpu, dynamic_multi_inference_multiple_shapes) {
auto output_prim_mem = outputs.begin()->second.get_memory();

auto out_l = network.get_output_layout(outputs.begin()->first);
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(2, 8)); // fake_alignment
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(2, fake_alignment_size)); // fake_alignment
ASSERT_EQ(out_l.batch(), 2); // fake_alignment
ASSERT_EQ(out_l.feature(), weight_b);
ASSERT_EQ(out_l.spatial(0), 1);
Expand Down Expand Up @@ -2427,7 +2431,7 @@ TEST(fully_connected_gpu, dynamic_multi_inference_multiple_shapes) {
auto output_prim_mem = outputs.begin()->second.get_memory();

auto out_l = network.get_output_layout(outputs.begin()->first);
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(1, 8)); // fake_alignment
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(1, fake_alignment_size)); // fake_alignment
ASSERT_EQ(out_l.batch(), 1); // fake_alignment
ASSERT_EQ(out_l.feature(), weight_b);
ASSERT_EQ(out_l.spatial(0), 1);
Expand Down Expand Up @@ -2661,6 +2665,7 @@ TEST(fully_connected_gpu, has_cached_weights_reorder) {

const int32_t input_f = 3, input_b = 1, weight_b = 4;

auto fake_alignment_size = engine.get_device_info().supports_immad ? 8 : 16;
auto input_dyn_layout = layout{ ov::PartialShape{ ov::Dimension(1, 10), input_f }, data_types::f32,format::bfyx };
auto input_data = engine.allocate_memory(layout{ ov::PartialShape{ input_b, input_f }, data_types::f32,format::bfyx });
auto weights_data = engine.allocate_memory({ ov::PartialShape{ weight_b, input_f }, data_types::f32,format::bfyx });
Expand Down Expand Up @@ -2701,7 +2706,7 @@ TEST(fully_connected_gpu, has_cached_weights_reorder) {
ASSERT_TRUE(reorder_impl == nullptr);

auto out_l = network.get_output_layout(outputs.begin()->first);
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, 8)); // fake_alignment
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, fake_alignment_size)); // fake_alignment
ASSERT_EQ(out_l.batch(), input_b);
ASSERT_EQ(out_l.feature(), weight_b);
ASSERT_EQ(out_l.spatial(0), 1);
Expand Down
Loading

0 comments on commit 472a4ca

Please sign in to comment.