From 21092ad11193ecf7bfec9abc75f0ee844c1a9c5d Mon Sep 17 00:00:00 2001 From: Wang Wangwang Date: Tue, 18 Feb 2025 13:38:07 +0800 Subject: [PATCH 01/20] Update access modifiers for CPU impls (#28286) ### Details: - *Add buffer_ptr() API in gpu_buffer struct* - *Mark the output in reorder cpu implementation as read_write to enable the result (whose type is reorder) to be used with cl_mem* ![image](https://github.com/user-attachments/assets/de724d96-4941-4ceb-80e5-78c83963297d) When using the HETERO:GPU.0,GPU.1 pipeline-parallel mode to split the llama_v2 model, the `__module.model.layers.0.self_attn/prim::ListConstruct_3` node is marked as part of the shape_of subgraph, and its result is passed to the next device; thus this node and its result node can only get CPU implementations. In the inference stage, when preparing outputs for Result_44438, the B580 dGPU will only use cl_mem in https://github.com/openvinotoolkit/openvino/blob/d757efd7fb3415a3dbda10941b3dae0ace0ac16e/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp#L548, and the reorder cpu impl cannot set the result to cl_mem. So, we have two solutions: 1. Use this PR to mark the reorder output_lock in the cpu impl as read_write, so it can use cl_mem 2. 
In [PR28476](https://github.com/openvinotoolkit/openvino/pull/28476) Skip mark_node for result node with reorder type ### Tickets: - *CVS-158971* --------- Co-authored-by: River Li --- .../include/intel_gpu/runtime/device.hpp | 1 + .../src/graph/impls/cpu/broadcast.cpp | 2 +- .../intel_gpu/src/graph/impls/cpu/concat.cpp | 2 +- .../src/graph/impls/cpu/fake_convert.cpp | 2 +- .../intel_gpu/src/graph/impls/cpu/gather.cpp | 2 +- .../intel_gpu/src/graph/impls/cpu/range.cpp | 2 +- .../intel_gpu/src/graph/impls/cpu/reorder.cpp | 2 +- .../src/graph/impls/cpu/scatter_update.cpp | 2 +- .../intel_gpu/src/graph/impls/cpu/select.cpp | 2 +- .../intel_gpu/src/graph/impls/cpu/tile.cpp | 2 +- .../intel_gpu/src/runtime/ocl/ocl_device.cpp | 4 ++ .../intel_gpu/src/runtime/ocl/ocl_device.hpp | 2 + .../intel_gpu/src/runtime/ocl/ocl_memory.hpp | 3 + .../tests/unit/module_tests/device_test.cpp | 3 + .../test_cases/activation_simple_gpu_test.cpp | 60 +++++++++++++++++ .../test_cases/concatenation_gpu_test.cpp | 65 +++++++++++++++++++ .../tests/unit/test_cases/crop_gpu_test.cpp | 53 +++++++++++++++ .../unit/test_cases/eltwise_gpu_test.cpp | 21 ++++-- .../tests/unit/test_cases/gather_gpu_test.cpp | 40 ++++++++++++ .../unit/test_cases/reorder_gpu_test.cpp | 36 ++++++++++ .../unit/test_cases/shape_of_gpu_test.cpp | 19 ++++-- .../test_cases/strided_slice_gpu_test.cpp | 14 ++-- .../tests/unit/test_cases/tile_gpu_test.cpp | 40 ++++++++++++ .../tests/unit/test_utils/test_utils.cpp | 23 +++++++ .../tests/unit/test_utils/test_utils.h | 6 ++ 25 files changed, 383 insertions(+), 25 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/device.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/device.hpp index c393a0174b8c2e..9711cdfa1ea386 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/device.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/device.hpp @@ -25,6 +25,7 @@ struct device { float get_gops(cldnn::data_types dt) const; bool 
use_unified_shared_memory() const; + virtual void set_mem_caps(memory_capabilities memory_capabilities) = 0; virtual ~device() = default; }; diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/broadcast.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/broadcast.cpp index 772fe2751adae0..e86c69a61b3d85 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/broadcast.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/broadcast.cpp @@ -99,7 +99,7 @@ struct broadcast_impl : public typed_primitive_impl { auto output_mem_ptr = instance.output_memory_ptr(); - cldnn::mem_lock output_lock(output_mem_ptr, stream); + cldnn::mem_lock output_lock(output_mem_ptr, stream); output_host_tensors.push_back(make_tensor(params->output_layouts[0], output_lock.data())); OPENVINO_ASSERT(op->evaluate(output_host_tensors, input_host_tensors), diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/concat.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/concat.cpp index ad1f4979141f56..34608a8aed6a3d 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/concat.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/concat.cpp @@ -80,7 +80,7 @@ struct concatenation_impl : public typed_primitive_impl { auto output_mem_ptr = instance.output_memory_ptr(); - cldnn::mem_lock output_lock(output_mem_ptr, stream); + cldnn::mem_lock output_lock(output_mem_ptr, stream); output_host_tensors.push_back(make_tensor(params->output_layouts[0], output_lock.data())); diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/fake_convert.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/fake_convert.cpp index 47a9f1ef02db99..ad6f35608a0959 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/fake_convert.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/fake_convert.cpp @@ -74,7 +74,7 @@ struct fake_convert_impl : public typed_primitive_impl { auto output_mem_ptr = instance.output_memory_ptr(); - cldnn::mem_lock output_lock(output_mem_ptr, stream); + cldnn::mem_lock output_lock(output_mem_ptr, stream); for (size_t 
i = 0; i < input_mem_ptrs.size(); i++) input_host_tensors.push_back(make_tensor(params->input_layouts[i], input_mem_ptrs[i]->lock(stream, mem_lock_type::read))); diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/gather.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/gather.cpp index e46a9892d64cb5..09a2baa122be64 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/gather.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/gather.cpp @@ -82,7 +82,7 @@ struct gather_impl : public typed_primitive_impl { auto output_mem_ptr = instance.output_memory_ptr(); - cldnn::mem_lock output_lock(output_mem_ptr, stream); + cldnn::mem_lock output_lock(output_mem_ptr, stream); for (size_t i = 0; i < input_mem_ptrs.size(); i++) input_host_tensors.push_back(make_tensor(params->input_layouts[i], input_mem_ptrs[i]->lock(stream, mem_lock_type::read))); diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/range.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/range.cpp index c3ec254a48576b..f3c3fbcf07bada 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/range.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/range.cpp @@ -55,7 +55,7 @@ struct range_impl : public typed_primitive_impl { auto output_mem_ptr = instance.output_memory_ptr(); - cldnn::mem_lock output_lock(output_mem_ptr, stream); + cldnn::mem_lock output_lock(output_mem_ptr, stream); for (size_t i = 0; i < input_mem_ptrs.size(); i++) input_host_tensors.push_back(make_tensor(params->input_layouts[i], input_mem_ptrs[i]->lock(stream, mem_lock_type::read))); diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/reorder.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/reorder.cpp index abcd6569acb9b7..6fc63f8c6eeddb 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/reorder.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/reorder.cpp @@ -56,7 +56,7 @@ struct reorder_impl : public typed_primitive_impl { auto output_mem_ptr = instance.output_memory_ptr(); cldnn::mem_lock input_lock(input_mem_ptr, stream); - 
cldnn::mem_lock output_lock(output_mem_ptr, stream); + cldnn::mem_lock output_lock(output_mem_ptr, stream); input_host_tensors.push_back(make_tensor(params->input_layouts[0], input_lock.data())); output_host_tensors.push_back(make_tensor(params->output_layouts[0], output_lock.data())); diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/scatter_update.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/scatter_update.cpp index b27fe10bf64ecc..695c4781166ebe 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/scatter_update.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/scatter_update.cpp @@ -71,7 +71,7 @@ struct scatter_update_impl : public typed_primitive_impl { auto output_mem_ptr = instance.output_memory_ptr(); - cldnn::mem_lock output_lock(output_mem_ptr, stream); + cldnn::mem_lock output_lock(output_mem_ptr, stream); for (size_t i = 0; i < input_mem_ptrs.size(); i++) input_host_tensors.push_back(make_tensor(params->input_layouts[i], input_mem_ptrs[i]->lock(stream, mem_lock_type::read))); diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/select.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/select.cpp index 6aa27fea1d4990..69be7bc871b734 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/select.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/select.cpp @@ -76,7 +76,7 @@ struct select_impl : public typed_primitive_impl