From 6f7519ad4dc1920571de22eac41f66b6c09e76d3 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Fri, 14 Feb 2025 11:17:58 +0100 Subject: [PATCH 01/10] [Transformations] SDPA Decomposition: avoid unnecessary ShapeOf subgraphs (#28639) ### Details: - *Currently, `ScaledDotProductAttentionDecomposition` uses `ShapeOf->Gather` subgraphs to extract a specific dimension of input shapes. If the extracted dim is static but the whole shape is dynamic, such subgraphs are not folded by `ConstantFolding` pass whereas all the needed info can be extracted. This PR updates dim extraction logic: after the subgraph formation, `get_constant_from_source` tries to compute the subgraph, and replaces it with constant if possible* - *This change unblocks SDPA quantization for some scenarios* ### Tickets: - *CVS-161062* --- ...ed_dot_product_attention_decomposition.cpp | 21 +++++- .../scaled_dot_product_decomposition_test.cpp | 73 +++++++++++++++---- 2 files changed, 77 insertions(+), 17 deletions(-) diff --git a/src/common/transformations/src/transformations/op_conversions/scaled_dot_product_attention_decomposition.cpp b/src/common/transformations/src/transformations/op_conversions/scaled_dot_product_attention_decomposition.cpp index be18ab31dc19d3..3bfa8a009f74d4 100644 --- a/src/common/transformations/src/transformations/op_conversions/scaled_dot_product_attention_decomposition.cpp +++ b/src/common/transformations/src/transformations/op_conversions/scaled_dot_product_attention_decomposition.cpp @@ -8,6 +8,7 @@ #include "itt.hpp" #include "openvino/core/rt_info.hpp" +#include "openvino/core/validation_util.hpp" #include "openvino/op/add.hpp" #include "openvino/op/broadcast.hpp" #include "openvino/op/concat.hpp" @@ -68,9 +69,23 @@ std::shared_ptr ov::pass::ScaledDotProductAttentionDecomposition::deco auto one_f = register_new_node(one_i, query); auto zero_f = register_new_node(zero_i, query); + auto build_extract_dim_subgraph = [this, &zero_i](const std::shared_ptr& shape_of, + const int64_t idx) -> std::shared_ptr { + const auto dim_to_extract_const = v0::Constant::create(element::i32, Shape{}, {idx}); + const auto gather = std::make_shared(shape_of, dim_to_extract_const, zero_i); + // When dim_to_extract is static but the whole shape is dynamic, + // ConstantFolding can't fold ShapeOf->Gather subgraph in this case. + // So it's better to explicitly extract the needed dimension. 
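+        // For example, with an input of PartialShape{-1, -1, 24, 64}, extracting
+        // dim -1 yields Constant{64} even though the leading dims stay dynamic.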
+ if (auto constant = ov::util::get_constant_from_source(gather)) { + return register_new_node(constant); + } + register_new_node(dim_to_extract_const); + return register_new_node(gather); + }; + Output scale; if (node->get_input_size() < 5) { - scale = register_new_node(q_shape, minus_one, zero_i)->output(0); + scale = build_extract_dim_subgraph(q_shape, -1); scale = register_new_node(scale, query); auto sqrt_scale = register_new_node(scale); scale = register_new_node(one_f, sqrt_scale); @@ -112,8 +127,8 @@ std::shared_ptr ov::pass::ScaledDotProductAttentionDecomposition::deco atten_mask = mask; } } else { - auto target_s_len = register_new_node(q_shape, minus_two, zero_i); - auto source_s_len = register_new_node(k_shape, minus_two, zero_i); + auto target_s_len = build_extract_dim_subgraph(q_shape, -2); + auto source_s_len = build_extract_dim_subgraph(k_shape, -2); auto ssl = register_new_node(source_s_len, zero_i); auto tsl = register_new_node(target_s_len, zero_i); auto mask_shape = register_new_node(OutputVector{tsl, ssl}, 0); diff --git a/src/common/transformations/tests/op_conversions/scaled_dot_product_decomposition_test.cpp b/src/common/transformations/tests/op_conversions/scaled_dot_product_decomposition_test.cpp index c83c0c86d41f8d..82a0f89e83786c 100644 --- a/src/common/transformations/tests/op_conversions/scaled_dot_product_decomposition_test.cpp +++ b/src/common/transformations/tests/op_conversions/scaled_dot_product_decomposition_test.cpp @@ -32,12 +32,12 @@ using namespace ov; using namespace testing; const std::shared_ptr scaled_dot_product_attention_decomposition( - const std::shared_ptr query, - const std::shared_ptr key, - const std::shared_ptr value, - const std::shared_ptr attention_mask, - const std::shared_ptr scale, - const bool casual); + std::shared_ptr query, + std::shared_ptr key, + std::shared_ptr value, + std::shared_ptr attention_mask, + std::shared_ptr scale, + bool casual); TEST_F(TransformationTestsF, ScaledDotProductAttentionDecompositionStaticBasic) { const PartialShape query_shape{1, 32, 32}; @@ -129,6 +129,34 @@ TEST_F(TransformationTestsF, ScaledDotProductAttentionDecompositionStaticBroadca } } +TEST_F(TransformationTestsF, ScaledDotProductAttentionCasualPartiallyDynamic) { + const PartialShape query_shape{-1, -1, 24, 64}; + const PartialShape key_shape{-1, -1, 24, 64}; + const PartialShape value_shape{-1, -1, -1, 64}; + const PartialShape attention_mask_shape{-1, -1, -1, -1}; + const auto casual = true; + + const auto query = std::make_shared(element::f32, query_shape); + const auto key = std::make_shared(element::f32, key_shape); + const auto value = std::make_shared(element::f32, value_shape); + const auto attention_mask = std::make_shared(element::f32, attention_mask_shape); + { + const auto scaled_dot_product_attention = + std::make_shared(query, key, value, attention_mask, casual); + + model = std::make_shared(NodeVector{scaled_dot_product_attention}, + ParameterVector{query, key, value, attention_mask}); + manager.register_pass(); + } + + { + const auto scaled_dot_product_attention = + scaled_dot_product_attention_decomposition(query, key, value, attention_mask, nullptr, casual); + model_ref = std::make_shared(NodeVector{scaled_dot_product_attention}, + ParameterVector{query, key, value, attention_mask}); + } +} + TEST_F(TransformationTestsF, ScaledDotProductAttentionDecompositionDynamic) { const PartialShape query_shape{-1, -1, -1}; const PartialShape key_shape{-1, -1, -1}; @@ -160,12 +188,12 @@ TEST_F(TransformationTestsF, 
ScaledDotProductAttentionDecompositionDynamic) {
 }
 
 const std::shared_ptr scaled_dot_product_attention_decomposition(
-    const std::shared_ptr query,
-    const std::shared_ptr key,
-    const std::shared_ptr value,
-    const std::shared_ptr attention_mask,
-    const std::shared_ptr scale,
-    const bool casual) {
+    std::shared_ptr query,
+    std::shared_ptr key,
+    std::shared_ptr value,
+    std::shared_ptr attention_mask,
+    std::shared_ptr scale,
+    bool casual) {
     const auto q_shape = std::make_shared(query, element::i32);
     const auto k_shape = std::make_shared(key, element::i32);
     const auto minus_one = ov::op::v0::Constant::create(element::i32, Shape{}, {-1});
@@ -175,6 +203,23 @@ const std::shared_ptr scaled_dot_product_attention_decomposition(
     const auto one_f = std::make_shared(one_i, query);
     const auto zero_f = std::make_shared(zero_i, query);
 
+    auto extract_dim = [&zero_i](const std::shared_ptr& shape_of,
+                                 const int64_t idx) -> std::shared_ptr {
+        const auto& shape = shape_of->get_input_partial_shape(0);
+        const auto& dim = shape[idx];
+        if (dim.is_static()) {
+            return ov::op::v0::Constant::create(element::i32, Shape{}, {dim.get_length()});
+        }
+        const auto dim_to_extract_const = ov::op::v0::Constant::create(element::i32, Shape{}, {idx});
+        return std::make_shared(shape_of, dim_to_extract_const, zero_i);
+    };
+
+    if (scale == nullptr) {
+        scale = extract_dim(q_shape, -1);
+        scale = std::make_shared(scale, query);
+        auto sqrt_scale = std::make_shared(scale);
+        scale = std::make_shared(one_f, sqrt_scale);
+    }
     const auto q_scaled = std::make_shared(query, scale);
     auto k_rank = std::make_shared(k_shape, element::i32)->output(0);
     const auto k_last_dim = std::make_shared(k_rank, minus_one);
@@ -204,8 +249,8 @@ const std::shared_ptr scaled_dot_product_attention_decomposition(
             atten_mask = mask;
         }
     } else {
-        const auto target_s_len = std::make_shared(q_shape, minus_two, zero_i);
-        const auto source_s_len = std::make_shared(k_shape, minus_two, zero_i);
+        const auto target_s_len = extract_dim(q_shape, -2);
+        const auto source_s_len = extract_dim(k_shape, -2);
         const auto ssl = std::make_shared(source_s_len, zero_i);
         const auto tsl = std::make_shared(target_s_len, zero_i);
         const auto mask_shape = std::make_shared(OutputVector{tsl, ssl}, 0);

From eb44f8d0a5c16e848fdff889dd92e19d2ab89cfe Mon Sep 17 00:00:00 2001
From: Arseniy Obolenskiy
Date: Fri, 14 Feb 2025 12:51:30 +0100
Subject: [PATCH 02/10] Make custom ThreadLocal implementation copyable
 (#28900)

### Details:
Fix the ARM 32-bit build with the `-DENABLE_DEBUG_CAPS=ON` CMake configuration flag. Make the `ThreadLocal` class implementation copyable where TBB threading is not enabled; the original `tbb::enumerable_thread_specific` has a copyable interface.
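For reference, a minimal sketch of the non-TBB fallback this makes copyable — the diff below only shows `_map`, `_create`, and `_mutex`, so the `std::map<std::thread::id, T>` store and the class name here are assumptions; the locking in the copy operations mirrors the actual change:

```cpp
#include <functional>
#include <map>
#include <mutex>
#include <thread>

template <typename T>
struct ThreadLocalSketch {
    explicit ThreadLocalSketch(std::function<T()> create) : _create(std::move(create)) {}

    // Copying locks the *source* mutex: another thread may be inserting into
    // other._map via local() while the snapshot is taken.
    ThreadLocalSketch(const ThreadLocalSketch& other) : _create(other._create) {
        std::lock_guard<std::mutex> lock{other._mutex};
        _map = other._map;  // snapshot of the per-thread values
    }

    ThreadLocalSketch& operator=(const ThreadLocalSketch& other) {
        std::lock_guard<std::mutex> lock{other._mutex};
        _map = other._map;
        _create = other._create;
        return *this;
    }

    // Returns this thread's value, creating it on first access.
    T& local() {
        std::lock_guard<std::mutex> lock{_mutex};
        auto it = _map.find(std::this_thread::get_id());
        if (it == _map.end()) {
            it = _map.emplace(std::this_thread::get_id(), _create()).first;
        }
        return it->second;
    }

private:
    mutable std::mutex _mutex;
    std::map<std::thread::id, T> _map;
    std::function<T()> _create;
};
```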
### Tickets: - N/A --- .../openvino/runtime/threading/thread_local.hpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/inference/dev_api/openvino/runtime/threading/thread_local.hpp b/src/inference/dev_api/openvino/runtime/threading/thread_local.hpp index 9a3edee83fc592..32f2a5b732b40a 100644 --- a/src/inference/dev_api/openvino/runtime/threading/thread_local.hpp +++ b/src/inference/dev_api/openvino/runtime/threading/thread_local.hpp @@ -59,8 +59,16 @@ struct ThreadLocal { _create = std::move(other._create); return *this; } - ThreadLocal(const ThreadLocal&) = delete; - ThreadLocal& operator=(const ThreadLocal&&) = delete; + ThreadLocal(const ThreadLocal& other) : _create(other._create) { + std::lock_guard lock{other._mutex}; + _map = other._map; + } + ThreadLocal& operator=(const ThreadLocal& other) { + std::lock_guard lock{other._mutex}; + _map = other._map; + _create = other._create; + return *this; + } explicit ThreadLocal(const Create& create_) : _create{create_} {} T& local() { From 9d92d9c35a83a9483677b4cb7521cac330b14345 Mon Sep 17 00:00:00 2001 From: Srinjoy Dutta <114402816+srinjoydutta03@users.noreply.github.com> Date: Fri, 14 Feb 2025 17:47:15 +0530 Subject: [PATCH 03/10] [CPU][ARM64] Implemented JIT Emitter for Eltwise Squared Difference Operation (#28989) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Details: - Implemented and added jit_squared_difference_emitter derived class for element wise squared difference operation - Added entry Algorithm::EltwiseSquaredDifference, in executors/aarch64 as one of the supported algorithms - Added entry in the get_supported_precisions and create_eltwise_emitters in kernel/aarch64 - Added `utils::EltwiseTypes::SQUARED_DIFF` in `jit` kernel check in the tests ### Tests: Passed local tests using `./bin/arm64/Release/ov_cpu_func_tests --gtest_filter="*smoke*Eltwise*SqDiff*"` Screenshot 2025-02-14 at 2 04 25 PM ### Tickets - Closes #27502 CC: @a-sidorova --- .../plugin/aarch64/jit_eltwise_emitters.cpp | 43 +++++++++++++++++++ .../plugin/aarch64/jit_eltwise_emitters.hpp | 22 ++++++++++ .../nodes/executors/aarch64/jit_eltwise.cpp | 1 + .../aarch64/jit_uni_eltwise_generic.cpp | 2 + .../single_layer_tests/classes/eltwise.cpp | 3 +- 5 files changed, 70 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp index 9e4e7160026568..e925abcadab907 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp @@ -2704,6 +2704,49 @@ std::set> jit_sqrt_emitter::get_supported_precisions( return {{element::f32}}; } +/// SQUARED DIFFERENCE /// +jit_squared_difference_emitter::jit_squared_difference_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} + +jit_squared_difference_emitter::jit_squared_difference_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} + +size_t jit_squared_difference_emitter::get_inputs_count() const { + return 2; +} + +void jit_squared_difference_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& 
out_vec_idxs) const { + if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + OV_CPU_JIT_EMITTER_THROW("Can't create jit eltwise kernel"); + } +} + +template +void jit_squared_difference_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { + OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); + + using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; + TReg src0 = TReg(in_vec_idxs[0]); + TReg src1 = TReg(in_vec_idxs[1]); + TReg dst = TReg(out_vec_idxs[0]); + + h->fsub(dst.s, src0.s, src1.s); + h->fmul(dst.s, dst.s, dst.s); +} + +std::set> jit_squared_difference_emitter::get_supported_precisions( + const std::shared_ptr& node) { + return {{element::f32, element::f32}}; +} + /// SUBTRACT /// jit_subtract_emitter::jit_subtract_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp index ca0e2dd8e157e6..f24f3a0bda4c37 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp @@ -1104,6 +1104,28 @@ class jit_sqrt_emitter : public jit_emitter { void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; +class jit_squared_difference_emitter : public jit_emitter { +public: + jit_squared_difference_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc = ov::element::f32); + + jit_squared_difference_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node); + + size_t get_inputs_count() const override; + + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); + +private: + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; + + template + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; +}; + class jit_subtract_emitter : public jit_emitter { public: jit_subtract_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, diff --git a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp index bb5d79ad56de6d..d2238b2d9f182c 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp @@ -57,6 +57,7 @@ bool JitEltwiseExecutor::isSupported(const Algorithm& algorithm, Algorithm::EltwiseSigmoid, Algorithm::EltwiseSoftSign, Algorithm::EltwiseSqrt, + Algorithm::EltwiseSquaredDifference, Algorithm::EltwiseSubtract, Algorithm::EltwiseSwish, Algorithm::EltwiseTanh); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp index d820bf9ea12775..818d7bdbfef684 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp @@ -702,6 +702,7 @@ std::shared_ptr jit_uni_eltwise_generic::create_eltwise_emitte OV_CASE(Algorithm::EltwiseSigmoid, 
ov::intel_cpu::aarch64::jit_sigmoid_emitter), OV_CASE(Algorithm::EltwiseSoftSign, ov::intel_cpu::aarch64::jit_soft_sign_emitter), OV_CASE(Algorithm::EltwiseSqrt, ov::intel_cpu::aarch64::jit_sqrt_emitter), + OV_CASE(Algorithm::EltwiseSquaredDifference, ov::intel_cpu::aarch64::jit_squared_difference_emitter), OV_CASE(Algorithm::EltwiseSubtract, ov::intel_cpu::aarch64::jit_subtract_emitter), OV_CASE(Algorithm::EltwiseSwish, ov::intel_cpu::aarch64::jit_swish_emitter), OV_CASE(Algorithm::EltwiseTanh, ov::intel_cpu::aarch64::jit_tanh_emitter)); @@ -836,6 +837,7 @@ std::set> eltwise_precision_helper::get_supported_pre OV_CASE(Algorithm::EltwiseSigmoid, jit_sigmoid_emitter), OV_CASE(Algorithm::EltwiseSoftSign, jit_soft_sign_emitter), OV_CASE(Algorithm::EltwiseSqrt, jit_sqrt_emitter), + OV_CASE(Algorithm::EltwiseSquaredDifference, jit_squared_difference_emitter), OV_CASE(Algorithm::EltwiseSubtract, jit_subtract_emitter), OV_CASE(Algorithm::EltwiseSwish, jit_swish_emitter), OV_CASE(Algorithm::EltwiseTanh, jit_tanh_emitter)); diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise.cpp index 3f48b1f0b1e976..1fea147aa63318 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise.cpp @@ -260,7 +260,8 @@ std::string EltwiseLayerCPUTest::getPrimitiveType(const utils::EltwiseTypes& elt (eltwise_type == utils::EltwiseTypes::SUBTRACT) || (eltwise_type == utils::EltwiseTypes::DIVIDE) || (eltwise_type == utils::EltwiseTypes::FLOOR_MOD) || - (eltwise_type == utils::EltwiseTypes::MOD)) { + (eltwise_type == utils::EltwiseTypes::MOD) || + (eltwise_type == utils::EltwiseTypes::SQUARED_DIFF)) { return "jit"; } #endif From d7ecf527148b9bea47e386592047d8ce2b4c3a00 Mon Sep 17 00:00:00 2001 From: Stefania Hergane Date: Fri, 14 Feb 2025 14:32:20 +0200 Subject: [PATCH 04/10] [NPU] Add datatype NF4 support (#27903) ### Details: - Add datatype NF4 support in NPU plugin - *...* --------- Signed-off-by: Stefania Hergane Co-authored-by: Stepan --- .../intel_npu/src/backend/src/zero_infer_request.cpp | 7 +++++-- .../src/compiler_adapter/src/driver_compiler_adapter.cpp | 2 ++ src/plugins/intel_npu/src/utils/src/zero/zero_wrappers.cpp | 2 +- src/plugins/intel_npu/thirdparty/level-zero-ext | 2 +- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp index aee73a2b73fa31..df9cc4eb328133 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp @@ -726,6 +726,8 @@ void ZeroInferRequest::check_network_precision(const ov::element::Type_t precisi break; case ov::element::Type_t::bf16: break; + case ov::element::Type_t::nf4: + break; case ov::element::Type_t::u4: break; case ov::element::Type_t::i4: @@ -749,8 +751,9 @@ void ZeroInferRequest::check_network_precision(const ov::element::Type_t precisi case ov::element::Type_t::f64: break; default: - OPENVINO_THROW("Unsupported tensor precision: " + ov::element::Type(precision).get_type_name() + - "! Supported precisions: FP32, FP16, BF16, U4, I4, U8, I8, U16, I16, U32, I32, U64, I64, FP64"); + OPENVINO_THROW( + "Unsupported tensor precision: " + ov::element::Type(precision).get_type_name() + + "! 
Supported precisions: FP32, FP16, BF16, NF4, U4, I4, U8, I8, U16, I16, U32, I32, U64, I64, FP64"); } } diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp index 624ba448fed44f..51e42478e7cebc 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp +++ b/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp @@ -78,6 +78,8 @@ std::string ovPrecisionToLegacyPrecisionString(const ov::element::Type& precisio return "FP64"; case ov::element::Type_t::bf16: return "BF16"; + case ov::element::Type_t::nf4: + return "NF4"; case ov::element::Type_t::i4: return "I4"; case ov::element::Type_t::i8: diff --git a/src/plugins/intel_npu/src/utils/src/zero/zero_wrappers.cpp b/src/plugins/intel_npu/src/utils/src/zero/zero_wrappers.cpp index 4868d6326c5fe4..1d3c0cc23e6a98 100644 --- a/src/plugins/intel_npu/src/utils/src/zero/zero_wrappers.cpp +++ b/src/plugins/intel_npu/src/utils/src/zero/zero_wrappers.cpp @@ -69,7 +69,7 @@ CommandList::CommandList(const std::shared_ptr& initStruc if (mtci_is_supported) { ze_mutable_command_id_exp_desc_t mutableCmdIdDesc = {ZE_STRUCTURE_TYPE_MUTABLE_COMMAND_ID_EXP_DESC, nullptr, - ZE_MUTABLE_COMMAND_EXP_FLAG_GRAPH_ARGUMENT}; + ZE_MUTABLE_COMMAND_EXP_FLAG_GRAPH_ARGUMENT_DEPRECATED}; THROW_ON_FAIL_FOR_LEVELZERO("zeCommandListGetNextCommandIdExp", zeCommandListGetNextCommandIdExp(_handle, &mutableCmdIdDesc, &_command_id)); } diff --git a/src/plugins/intel_npu/thirdparty/level-zero-ext b/src/plugins/intel_npu/thirdparty/level-zero-ext index 110f48ee8eda22..c0156a3390ae39 160000 --- a/src/plugins/intel_npu/thirdparty/level-zero-ext +++ b/src/plugins/intel_npu/thirdparty/level-zero-ext @@ -1 +1 @@ -Subproject commit 110f48ee8eda22d8b40daeeecdbbed0fc3b08f8b +Subproject commit c0156a3390ae39671ff8f2a6f5471f04bb65bb12 From 1d1d09952d4412fa400474a5f10a170974c89466 Mon Sep 17 00:00:00 2001 From: Pawel Raasz Date: Fri, 14 Feb 2025 14:49:51 +0100 Subject: [PATCH 05/10] [CI] Update reviewdog to not used deprecated option and check all files (#28974) ### Details: - Update reviewdog for code style check: * remove deprecated option usage * add nofilter for check files to check all not diff only * exclude temp and thirdparty code - Apply code style in affected files ### Related PRs: - #28946 ### Tickets: - N/A --------- Signed-off-by: Raasz, Pawel --- .github/workflows/code_style.yml | 14 +++++++++++--- .../onnx/frontend/src/utils/onnx_internal.cpp | 3 ++- .../onnx/tests/onnx_import_convpool.in.cpp | 9 ++++++--- .../src/graph_iterator_saved_model.cpp | 3 +-- .../snippets/aarch64/jit_loop_emitters.cpp | 6 +++--- .../snippets/aarch64/jit_loop_emitters.hpp | 6 +++--- .../emitters/snippets/x64/jit_loop_emitters.cpp | 6 +++--- .../emitters/snippets/x64/jit_loop_emitters.hpp | 6 +++--- .../snippets/x64/jit_reg_spill_emitters.cpp | 6 +++--- .../snippets/x64/jit_reg_spill_emitters.hpp | 12 ++++++------ .../src/nodes/executors/aarch64/jit_eltwise.cpp | 2 +- .../kernels/aarch64/jit_uni_eltwise_generic.cpp | 12 ++++++------ .../src/nodes/kernels/x64/mlp_kernel.hpp | 6 +++--- .../sea_itt_lib/IttNotifyStdSrc.cpp | 16 +++++++--------- 14 files changed, 58 insertions(+), 49 deletions(-) diff --git a/.github/workflows/code_style.yml b/.github/workflows/code_style.yml index 8f69ee7cd4ca74..89527187272227 100644 --- a/.github/workflows/code_style.yml +++ b/.github/workflows/code_style.yml @@ -39,7 +39,11 @@ jobs: with: github_token: ${{ 
secrets.GITHUB_TOKEN }} level: warning - fail_on_error: true + fail_level: error + filter_mode: nofilter + exclude: | + "*/thirdparty/*" + "./temp/*" clang-format-aarch64: runs-on: ubuntu-22.04 @@ -71,7 +75,11 @@ jobs: with: github_token: ${{ secrets.GITHUB_TOKEN }} level: warning - fail_on_error: true + fail_level: error + filter_mode: nofilter + exclude: | + "*/thirdparty/*" + "./temp/*" ShellCheck: runs-on: ubuntu-22.04 @@ -103,7 +111,7 @@ jobs: level: style reporter: github-pr-review check_all_files_with_shebangs: true - fail_on_error: true + fail_level: error exclude: | "*/thirdparty/*" "./temp/*" diff --git a/src/frontends/onnx/frontend/src/utils/onnx_internal.cpp b/src/frontends/onnx/frontend/src/utils/onnx_internal.cpp index a5b68b8f9d98a5..96aad10dbf928c 100644 --- a/src/frontends/onnx/frontend/src/utils/onnx_internal.cpp +++ b/src/frontends/onnx/frontend/src/utils/onnx_internal.cpp @@ -100,7 +100,8 @@ std::shared_ptr decode_to_framework_nodes(std::shared_ptr detail::MappedMemoryHandles mmap_cache, ov::frontend::ExtensionHolder extensions) { apply_transformations(*model_proto); - auto graph = std::make_shared(ov::util::get_directory(model_path).string(), model_proto, mmap_cache, extensions); + auto graph = + std::make_shared(ov::util::get_directory(model_path).string(), model_proto, mmap_cache, extensions); return graph->decode(); } } // namespace ov::frontend::onnx::detail diff --git a/src/frontends/onnx/tests/onnx_import_convpool.in.cpp b/src/frontends/onnx/tests/onnx_import_convpool.in.cpp index cd944d44929abf..ae9188b8e7df5e 100644 --- a/src/frontends/onnx/tests/onnx_import_convpool.in.cpp +++ b/src/frontends/onnx/tests/onnx_import_convpool.in.cpp @@ -426,14 +426,17 @@ OPENVINO_TEST(${BACKEND_NAME}, onnx_model_convtranspose_output_shape) { test_case.add_input_from_file(util::path_join({ov::test::utils::getExecutableDirectory(), TEST_ONNX_MODELS_DIRNAME, - "files/convtranspose_output_shape/x.bin"}).string()); + "files/convtranspose_output_shape/x.bin"}) + .string()); test_case.add_input_from_file(util::path_join({ov::test::utils::getExecutableDirectory(), TEST_ONNX_MODELS_DIRNAME, - "files/convtranspose_output_shape/w.bin"}).string()); + "files/convtranspose_output_shape/w.bin"}) + .string()); test_case.add_expected_output_from_file({1, 2, 10, 8}, util::path_join({ov::test::utils::getExecutableDirectory(), TEST_ONNX_MODELS_DIRNAME, - "files/convtranspose_output_shape/y.bin"}).string()); + "files/convtranspose_output_shape/y.bin"}) + .string()); test_case.run(); } diff --git a/src/frontends/tensorflow/src/graph_iterator_saved_model.cpp b/src/frontends/tensorflow/src/graph_iterator_saved_model.cpp index 91ac2efc73d834..8d64d6f2bcb045 100644 --- a/src/frontends/tensorflow/src/graph_iterator_saved_model.cpp +++ b/src/frontends/tensorflow/src/graph_iterator_saved_model.cpp @@ -43,8 +43,7 @@ bool GraphIteratorSavedModel::is_supported(const std::string& path) { #if defined(OPENVINO_ENABLE_UNICODE_PATH_SUPPORT) && defined(_WIN32) bool GraphIteratorSavedModel::is_supported(const std::wstring& path) { - return ov::util::directory_exists(path) && - ov::util::file_exists(ov::util::path_join_w({path, L"saved_model.pb"})); + return ov::util::directory_exists(path) && ov::util::file_exists(ov::util::path_join_w({path, L"saved_model.pb"})); } #endif diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.cpp index bac1591dc0b76a..3fbdc645a03a4f 100644 --- 
a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.cpp @@ -124,9 +124,9 @@ void jit_loop_end_emitter::validate_arguments(const std::vector& in, con } void jit_loop_end_emitter::emit_code_impl(const std::vector& in, - const std::vector& out, - const std::vector& pool_vec_idxs, - const std::vector& pool_gpr_idxs) const { + const std::vector& out, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { validate_arguments(in, out); emit_impl(in, out); } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.hpp index eddfe64cdc90dd..8f3480cc064c7b 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.hpp @@ -54,9 +54,9 @@ class jit_loop_end_emitter : public jit_emitter { } void emit_code_impl(const std::vector& in_idxs, - const std::vector& out_idxs, - const std::vector& pool_vec_idxs, - const std::vector& pool_gpr_idxs) const override; + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const override; protected: void validate_arguments(const std::vector& in, const std::vector& out) const override; diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp index 15c7b85c1928da..54597a90a8504b 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp @@ -200,9 +200,9 @@ void jit_loop_end_emitter::validate_arguments(const std::vector& in, con } void jit_loop_end_emitter::emit_code_impl(const std::vector& in, - const std::vector& out, - const std::vector& pool_vec_idxs, - const std::vector& pool_gpr_idxs) const { + const std::vector& out, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { validate_arguments(in, out); jit_emitter::emit_code_impl(in, out, pool_vec_idxs, pool_gpr_idxs); } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.hpp index 7b66cd8b54b48a..3a15ab0cf6fbea 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.hpp @@ -67,9 +67,9 @@ class jit_loop_end_emitter : public jit_emitter { } void emit_code_impl(const std::vector& in_idxs, - const std::vector& out_idxs, - const std::vector& pool_vec_idxs, - const std::vector& pool_gpr_idxs) const override; + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const override; protected: void validate_arguments(const std::vector& in, const std::vector& out) const override; diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_reg_spill_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_reg_spill_emitters.cpp index 59f84c75ee28ee..8c848567f579e0 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_reg_spill_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_reg_spill_emitters.cpp @@ -71,9 +71,9 @@ void jit_reg_spill_end_emitter::validate_arguments(const std::vector& in } void jit_reg_spill_end_emitter::emit_code_impl(const 
std::vector& in, - const std::vector& out, - const std::vector& pool_vec_idxs, - const std::vector& pool_gpr_idxs) const { + const std::vector& out, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { validate_arguments(in, out); emit_impl(in, out); } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_reg_spill_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_reg_spill_emitters.hpp index a67caeb15ed6fb..0c7518d9df4b07 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_reg_spill_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_reg_spill_emitters.hpp @@ -27,9 +27,9 @@ class jit_reg_spill_begin_emitter : public jit_emitter { void validate_arguments(const std::vector& in, const std::vector& out) const override; void emit_impl(const std::vector& in, const std::vector& out) const override; void emit_code_impl(const std::vector& in_idxs, - const std::vector& out_idxs, - const std::vector& pool_vec_idxs, - const std::vector& pool_gpr_idxs) const override; + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const override; std::set m_regs_to_spill; std::shared_ptr m_abi_reg_spiller; }; @@ -49,9 +49,9 @@ class jit_reg_spill_end_emitter : public jit_emitter { } void emit_code_impl(const std::vector& in_idxs, - const std::vector& out_idxs, - const std::vector& pool_vec_idxs, - const std::vector& pool_gpr_idxs) const override; + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const override; protected: void validate_arguments(const std::vector& in, const std::vector& out) const override; diff --git a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp index d2238b2d9f182c..492bf9d2899790 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp @@ -47,7 +47,7 @@ bool JitEltwiseExecutor::isSupported(const Algorithm& algorithm, Algorithm::EltwiseMod, Algorithm::EltwiseMultiply, Algorithm::EltwiseMulAdd, - Algorithm::EltwiseNotEqual, + Algorithm::EltwiseNotEqual, Algorithm::EltwisePowerStatic, Algorithm::EltwisePrelu, Algorithm::EltwiseRelu, diff --git a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp index 818d7bdbfef684..ca82079b58ba68 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp @@ -29,7 +29,10 @@ void jit_uni_eltwise_generic::generate() { preamble(); static const std::vector exec_precisions_priority = {element::f16, element::f32}; - auto const exec_prc = eltwise_precision_helper::get_precision(jep_.inputs_number, jep_.src_prc, eltwise_data_, exec_precisions_priority); + auto const exec_prc = eltwise_precision_helper::get_precision(jep_.inputs_number, + jep_.src_prc, + eltwise_data_, + exec_precisions_priority); eltwise_emitter = create_eltwise_emitter(eltwise_data_.front(), exec_prc); for (size_t i = 1; i < eltwise_data_.size(); ++i) { @@ -46,8 +49,7 @@ void jit_uni_eltwise_generic::generate() { for (size_t i = 0; i < jep.inputs_number; i++) { ldr(start_to_offsets, ptr(reg_const_params, - static_cast(offsetof(jit_eltwise_call_args_ptrs, src_offsets) + - i * sizeof(size_t)))); + 
static_cast(offsetof(jit_eltwise_call_args_ptrs, src_offsets) + i * sizeof(size_t)))); ldr(get_src_reg(i), ptr(reg_const_params, static_cast(offsetof(jit_eltwise_call_args_ptrs, src_ptr[0]) + i * sizeof(size_t)))); @@ -91,8 +93,7 @@ void jit_uni_eltwise_generic::generate() { for (size_t i = 0; i < jep.inputs_number; i++) { ldr(get_src_reg(i), - ptr(param1, - static_cast(offsetof(jit_eltwise_call_args_ptrs, src_ptr) + i * sizeof(size_t)))); + ptr(param1, static_cast(offsetof(jit_eltwise_call_args_ptrs, src_ptr) + i * sizeof(size_t)))); init_ptrs_with_offsets(get_src_reg(i), jep.src_offsets[i]); } @@ -786,7 +787,6 @@ struct SupportedPrecisions { }; } // namespace - using namespace aarch64; std::set> eltwise_precision_helper::get_supported_precisions(const Algorithm& algo) { diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/mlp_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/mlp_kernel.hpp index 438d84c16b3ece..e3aae6ab38eaec 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/mlp_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/mlp_kernel.hpp @@ -529,8 +529,8 @@ class ReduceAdd2bh : public dnnl::impl::cpu::x64::jit_generator { struct CallArgs { float* src0; float* src1; - int16_t * dst; - int16_t * prefetch_dst; + int16_t* dst; + int16_t* prefetch_dst; int64_t num_cols; }; // add two float input eltwise and convert to bf16 : ConvertFP32toBF16(src0 + src1) @@ -545,7 +545,7 @@ class ReduceAdd2bh : public dnnl::impl::cpu::x64::jit_generator { // the prefetch distance is increased to ensure by the time store happens // prefetch has done and no HW prefetcher is triggered args.prefetch_dst = (m + 2 < num_rows) ? (args.dst + 2 * dst_stride) : (args.dst); - + (*this)(&args); } } diff --git a/thirdparty/itt_collector/sea_itt_lib/IttNotifyStdSrc.cpp b/thirdparty/itt_collector/sea_itt_lib/IttNotifyStdSrc.cpp index ee2daf88d79ece..c2372a5f3f7bb6 100644 --- a/thirdparty/itt_collector/sea_itt_lib/IttNotifyStdSrc.cpp +++ b/thirdparty/itt_collector/sea_itt_lib/IttNotifyStdSrc.cpp @@ -1577,15 +1577,13 @@ void FillApiList(__itt_api_info* api_list_ptr) { #define ITT_STUB_IMPL_ORIG(name) ITT_STUB_IMPL(name) #ifdef _DEBUG // dangerous stub that doesn't return anything (even when expected) but records the function call for // statistics sake -# define ITT_STUB_NO_IMPL(fn) \ - if (0 == strcmp("__itt_" ITT_TO_STR(fn), api_list_ptr[i].name)) { \ - struct local { \ - static void stub(...) { \ - CIttFnStat oIttFnStat("NO IMPL:\t" ITT_TO_STR(fn)); \ - } \ - }; \ - *api_list_ptr[i].func_ptr = reinterpret_cast(local::stub); \ - continue; \ +# define ITT_STUB_NO_IMPL(fn) \ + if (0 == strcmp("__itt_" ITT_TO_STR(fn), api_list_ptr[i].name)) { \ + struct local { \ + static void stub(...) 
{ CIttFnStat oIttFnStat("NO IMPL:\t" ITT_TO_STR(fn)); } \ + }; \ + *api_list_ptr[i].func_ptr = reinterpret_cast(local::stub); \ + continue; \ } #else # define ITT_STUB_NO_IMPL(fn) From 0aa889db67b63cc661bb4e5fe7a0706313865d6b Mon Sep 17 00:00:00 2001 From: Aleksandr Voron Date: Fri, 14 Feb 2025 17:00:53 +0100 Subject: [PATCH 06/10] [CPU][ACL] int8 MatMul support (#28870) ### Details: - oneDNN has `acl_lowp_matmul_t` support brought by ACL team here: https://github.com/oneapi-src/oneDNN/pull/1885 - The goal is to leverage this integration instead of implementing int8 MM executor from the scratch here: https://github.com/openvinotoolkit/openvino/pull/27861 - int8 MatMul is disabled on purpose by applying `MatMulTransformation` against `FullyConnected` nodes only, since int8 MM primitive `jit_gemm_i8` is slower than fp16 ACL MatMul. - `FakeQuantize` tokenisation is disabled in snippets ### Tickets: - CVS-149495 --- .../transformation_pipeline.cpp | 16 +++++++ .../aarch64/mat_mul_transformation.cpp | 42 +++++++++++++++++++ .../skip_tests_config.cpp | 4 ++ .../mat_mul_transformation.cpp | 4 +- 4 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/aarch64/mat_mul_transformation.cpp diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 67667d4794aecd..d69b96a8fe9402 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -121,6 +121,7 @@ #include "low_precision/group_convolution.hpp" #include "low_precision/multiply_to_group_convolution.hpp" #include "low_precision/network_helper.hpp" +#include "low_precision/mat_mul.hpp" #include "low_precision/recurrent_cell.hpp" #include "low_precision/rt_info/bias_attribute.hpp" #include "transformations/low_precision/mark_dequantization_subgraph.hpp" @@ -840,6 +841,21 @@ void Transformations::Lpt(const std::vector& defaultPrecision }, FuseConvertTransformation); + // Enable MatMulTransformation against FC nodes only + // int8 MatMul is disabled because acl_lowp_matmul_t supports 2D case only + // most models have 3D/4D cases, so fallback to jit_gemm_i8 gives worse perf than gemm_acl_f16 + // oneDNN ticket #2696 + CPU_SET_CALLBACK_ARM( + lptManager, + [&](const_node_ptr& node) -> bool { + if (NetworkHelper::isConstantPath(node->get_input_node_shared_ptr(1)) && + one_of(node->input_value(1).get_partial_shape().rank().get_length(), 2, 3)) { + return false; + } + return true; + }, + MatMulTransformation); + CPU_DISABLE_PASS_ARM(lptManager, RecurrentCellTransformation); CPU_DISABLE_PASS_COMMON(lptManager, MultiplyToGroupConvolutionTransformation); diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/aarch64/mat_mul_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/aarch64/mat_mul_transformation.cpp new file mode 100644 index 00000000000000..8a1bf320436a03 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/aarch64/mat_mul_transformation.cpp @@ -0,0 +1,42 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "low_precision_transformations/mat_mul_transformation.hpp" + +using namespace 
LayerTestsDefinitions; + +namespace { +const std::vector precisions = { + ov::element::f32 +}; + +std::vector testValues = { + { + { 12, 2 }, + { 256ul, ov::Shape({}), {0.f}, {25.5f}, {0.f}, {25.5f} }, + { 2, 12 }, + { 256ul, ov::Shape({}), {-12.8f}, {12.7f}, {-12.8f}, {12.7f} }, + "matMul_original", + "u8" + }, + { + { 12, 2 }, + { 256ul, ov::Shape({}), {-12.8f}, {12.7f}, {-12.8f}, {12.7f} }, + { 2, 12 }, + { 256ul, ov::Shape({}), {-12.8f}, {12.7f}, {-12.8f}, {12.7f} }, + "matMul_original", + "i8" + } +}; + +INSTANTIATE_TEST_SUITE_P(smoke_LPT, MatMulTransformation, + ::testing::Combine( + ::testing::ValuesIn(precisions), + ::testing::Values(ov::PartialShape({ 1, 384, 1024 })), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::ValuesIn(testValues)), + MatMulTransformation::getTestCaseName); +} // namespace diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index 7970312f59ff3f..a25fba30ec73ca 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -306,6 +306,10 @@ std::vector disabledTestPatterns() { retVector.emplace_back(R"(.*LSTMCellFusion/LSTMCellFusionWithSplitWeights.SubgraphFusedToLSTMCell/(1|8|15))"); // Ticket: 131541 retVector.emplace_back(R"(.*smoke_MulticlassNmsLayerTest_dynamic2.*_outType=i32_.*)"); + // Ticket: 162434 + retVector.emplace_back(R"(smoke_LPT/MatMulTransformation.*)"); + // Ticket: 162260 + retVector.emplace_back(R"(smoke_Snippets_FQDecomposition.*netPRC=f32_D=CPU.*)"); } // invalid test: checks u8 precision for runtime graph, while it should be f32 retVector.emplace_back(R"(smoke_NegativeQuantizedMatMulMultiplyFusion.*)"); diff --git a/src/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_transformation.cpp b/src/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_transformation.cpp index 5f1726eff6d3b3..a52961be2ef5ef 100644 --- a/src/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_transformation.cpp +++ b/src/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_transformation.cpp @@ -30,7 +30,9 @@ std::string MatMulTransformation::getTestCaseName(const testing::TestParamInfo Date: Fri, 14 Feb 2025 16:06:18 +0000 Subject: [PATCH 07/10] [NPUW] Fix weights_path property (#28997) --- .../intel_npu/src/al/include/intel_npu/config/runtime.hpp | 2 +- src/plugins/intel_npu/src/al/src/config/runtime.cpp | 1 + .../src/compiler_adapter/src/driver_compiler_adapter.cpp | 4 ++++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp index e90cf708b9d54d..f1fb8219ed19ad 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp @@ -215,7 +215,7 @@ struct WEIGHTS_PATH final : OptionBase { } static OptionMode mode() { - return OptionMode::CompileTime; + return OptionMode::RunTime; } }; diff --git a/src/plugins/intel_npu/src/al/src/config/runtime.cpp b/src/plugins/intel_npu/src/al/src/config/runtime.cpp index 3586cf18e08c8b..70eb06c1d6fa8d 100644 --- a/src/plugins/intel_npu/src/al/src/config/runtime.cpp +++ b/src/plugins/intel_npu/src/al/src/config/runtime.cpp @@ -26,6 +26,7 @@ void 
intel_npu::registerRunTimeOptions(OptionsDesc& desc) { desc.add(); desc.add(); desc.add(); + desc.add(); desc.add(); desc.add(); } diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp index 51e42478e7cebc..5445a1b776bf90 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp +++ b/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp @@ -580,6 +580,10 @@ std::string DriverCompilerAdapter::serializeConfig(const Config& config, std::ostringstream turbostring; turbostring << ov::intel_npu::turbo.name() << KEY_VALUE_SEPARATOR << VALUE_DELIMITER << "\\S+" << VALUE_DELIMITER; content = std::regex_replace(content, std::regex(turbostring.str()), ""); + // Remove weights path property as it is not used by compiler + std::ostringstream weightspathstream; + weightspathstream << ov::weights_path.name() << KEY_VALUE_SEPARATOR << VALUE_DELIMITER << "\\S+" << VALUE_DELIMITER; + content = std::regex_replace(content, std::regex(weightspathstream.str()), ""); // Remove Bypass UMD Caching propery std::ostringstream umdcachestring; umdcachestring << ov::intel_npu::bypass_umd_caching.name() << KEY_VALUE_SEPARATOR << VALUE_DELIMITER << "\\S+" From 5755945f5b6683ee5fdf49d9a3ccfea47c31d5c5 Mon Sep 17 00:00:00 2001 From: Bogdan Pereanu Date: Fri, 14 Feb 2025 18:53:28 +0200 Subject: [PATCH 08/10] [CORE] Fix tensor is_continuous method (#28973) ### Details: - *If data is a contiguous memory area, return true* - *Strides can be different and data can still be contiguous in memory. Extra checks are needed in this case* ### Tickets: - *E#156237* --------- Signed-off-by: Bogdan Pereanu --- src/core/src/runtime/itensor.cpp | 17 ++- src/core/tests/ov_tensor_test.cpp | 101 ++++++++++++++ .../remote_tensor_tests/remote_run.hpp | 128 ++++++++++++++++++ 3 files changed, 245 insertions(+), 1 deletion(-) diff --git a/src/core/src/runtime/itensor.cpp b/src/core/src/runtime/itensor.cpp index 4a5d011122f068..67dde4e38aa463 100644 --- a/src/core/src/runtime/itensor.cpp +++ b/src/core/src/runtime/itensor.cpp @@ -6,6 +6,7 @@ #include +#include "compare.hpp" #include "openvino/core/except.hpp" #include "openvino/core/shape_util.hpp" #include "openvino/core/type/element_iterator.hpp" @@ -46,7 +47,21 @@ bool ITensor::is_continuous() const { // OpenVINO doesn't support strides for lp types return true; } - return default_byte_strides(get_shape(), get_element_type()) == get_strides(); + + const auto& strides = get_strides(); + auto stride = strides.rbegin(); + const auto default_strides = default_byte_strides(get_shape(), get_element_type()); + auto default_stride = default_strides.rbegin(); + + for (; stride != strides.rend(); ++stride, ++default_stride) { + if (*stride != *default_stride) { + break; + } + } + + const auto default_last = default_strides.rend(); + return (default_stride == default_last) || (*default_stride < *stride && (get_shape()[0] == 1) && + std::all_of(default_stride, default_last, cmp::Equal(*default_stride))); } void ITensor::copy_to(const std::shared_ptr& dst) const { diff --git a/src/core/tests/ov_tensor_test.cpp b/src/core/tests/ov_tensor_test.cpp index fdb4fa28416408..6a386f0a659246 100644 --- a/src/core/tests/ov_tensor_test.cpp +++ b/src/core/tests/ov_tensor_test.cpp @@ -709,6 +709,107 @@ TEST_F(OVTensorTest, readRangeRoiBlobStringTensor) { } } +TEST_F(OVTensorTest, checkIsContinuousTensorScalar) { + ov::Tensor tensor(ov::element::f32, 
ov::Shape{}); + auto data = tensor.data(); + auto strides = tensor.get_strides(); + + ov::Tensor view_tensor(ov::element::f32, ov::Shape{}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); +} + +TEST_F(OVTensorTest, checkIsContinuousTensor1Dimension) { + ov::Tensor tensor(ov::element::f32, ov::Shape{128}); + auto data = tensor.data(); + auto strides = tensor.get_strides(); + + ov::Tensor view_tensor; + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{16}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); +} + +TEST_F(OVTensorTest, checkIsContinuousTensor2Dimensions) { + ov::Tensor tensor(ov::element::f32, ov::Shape{32, 128}); + auto data = tensor.data(); + auto strides = tensor.get_strides(); + + ov::Tensor view_tensor; + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{16, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 16}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{2, 16}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), false); +} + +TEST_F(OVTensorTest, checkIsContinuousTensor3Dimensions) { + ov::Tensor tensor(ov::element::f32, ov::Shape{5, 32, 128}); + auto data = tensor.data(); + auto strides = tensor.get_strides(); + + ov::Tensor view_tensor; + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{2, 32, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{2, 16, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), false); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, 64}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 16, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); +} + +TEST_F(OVTensorTest, checkIsContinuousTensor4Dimensions) { + ov::Tensor tensor(ov::element::f32, ov::Shape{3, 5, 32, 128}); + auto data = tensor.data(); + auto strides = tensor.get_strides(); + + ov::Tensor view_tensor; + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 2, 32, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{2, 5, 32, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{2, 2, 32, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), false); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 2, 5, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), false); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{3, 5, 32, 64}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), false); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, 16, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{2, 1, 16, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), false); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, 
1, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, 1, 32}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); +} + struct TestParams { ov::Shape src_shape; ov::Strides src_strides; diff --git a/src/plugins/intel_npu/tests/functional/behavior/remote_tensor_tests/remote_run.hpp b/src/plugins/intel_npu/tests/functional/behavior/remote_tensor_tests/remote_run.hpp index c1992b3047996d..b410ce70a5d3b8 100644 --- a/src/plugins/intel_npu/tests/functional/behavior/remote_tensor_tests/remote_run.hpp +++ b/src/plugins/intel_npu/tests/functional/behavior/remote_tensor_tests/remote_run.hpp @@ -93,6 +93,134 @@ class RemoteRunTests : public ov::test::behavior::OVPluginTestBase, } }; +TEST_P(RemoteRunTests, CheckIsContinuousHostTensorScalar) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + auto zero_context = core->get_default_context(target_device); + + auto host_tensor = zero_context.create_host_tensor(ov::element::f32, Shape{}); + auto data = host_tensor.data(); + auto strides = host_tensor.get_strides(); + + ov::Tensor view_tensor; + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); +} + +TEST_P(RemoteRunTests, CheckIsContinuousHostTensor1Dimension) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + auto zero_context = core->get_default_context(target_device); + + auto host_tensor = zero_context.create_host_tensor(ov::element::f32, Shape{128}); + auto data = host_tensor.data(); + auto strides = host_tensor.get_strides(); + + ov::Tensor view_tensor; + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{16}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); +} + +TEST_P(RemoteRunTests, CheckIsContinuousHostTensor2Dimensions) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + auto zero_context = core->get_default_context(target_device); + + auto host_tensor = zero_context.create_host_tensor(ov::element::f32, Shape{32, 128}); + auto data = host_tensor.data(); + auto strides = host_tensor.get_strides(); + + ov::Tensor view_tensor; + + view_tensor = ov::Tensor(ov::element::f32, Shape{16, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, Shape{1, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, Shape{1, 16}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, Shape{2, 16}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), false); +} + +TEST_P(RemoteRunTests, CheckIsContinuousHostTensor3Dimensions) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + auto zero_context = core->get_default_context(target_device); + + auto host_tensor = zero_context.create_host_tensor(ov::element::f32, Shape{5, 32, 128}); + auto data = host_tensor.data(); + auto strides = host_tensor.get_strides(); + + ov::Tensor view_tensor; + + view_tensor = ov::Tensor(ov::element::f32, Shape{2, 32, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + 
view_tensor = ov::Tensor(ov::element::f32, Shape{2, 16, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), false); + + view_tensor = ov::Tensor(ov::element::f32, Shape{1, 1, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, Shape{1, 1, 64}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, Shape{1, 16, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); +} + +TEST_P(RemoteRunTests, CheckIsContinuousHostTensor4Dimensions) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + auto zero_context = core->get_default_context(target_device); + + auto host_tensor = zero_context.create_host_tensor(ov::element::f32, Shape{3, 5, 32, 128}); + auto data = host_tensor.data(); + auto strides = host_tensor.get_strides(); + + ov::Tensor view_tensor; + + view_tensor = ov::Tensor(ov::element::f32, Shape{1, 2, 32, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, Shape{2, 5, 32, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, Shape{2, 2, 32, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), false); + + view_tensor = ov::Tensor(ov::element::f32, Shape{1, 2, 5, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), false); + + view_tensor = ov::Tensor(ov::element::f32, Shape{3, 5, 32, 64}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), false); + + view_tensor = ov::Tensor(ov::element::f32, Shape{1, 1, 16, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, Shape{2, 1, 16, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), false); + + view_tensor = ov::Tensor(ov::element::f32, Shape{1, 1, 1, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, Shape{1, 1, 1, 32}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); +} + TEST_P(RemoteRunTests, CheckRemoteTensorInternalBuf) { // Skip test according to plugin specific disabledTestPatterns() (if any) SKIP_IF_CURRENT_TEST_IS_DISABLED() From 20ad7cb11906d30fb24bc131afa2b6e39566cbba Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Fri, 14 Feb 2025 13:07:23 -0800 Subject: [PATCH 09/10] Executorch initial support (#28425) ### Details: - OV side changes for initial ExecuTorch OV backend ### Tickets: - [*ticket-id*](https://jira.devtools.intel.com/browse/CVS-157257) --------- Co-authored-by: ynimmaga Co-authored-by: Roman Kazantsev Co-authored-by: Maxim Vafin --- .../openvino/frontend/pytorch/fx_decoder.py | 55 ++++++++++++------- .../pytorch/torchdynamo/op_support.py | 9 +++ src/frontends/pytorch/src/op_table.cpp | 9 +++ .../pytorch_tests/test_as_strided.py | 34 ++++++++++++ .../layer_tests/pytorch_tests/test_expand.py | 25 +++++++++ .../layer_tests/pytorch_tests/test_permute.py | 26 +++++++++ .../layer_tests/pytorch_tests/test_select.py | 28 ++++++++++ tests/layer_tests/pytorch_tests/test_split.py | 25 +++++++++ .../layer_tests/pytorch_tests/test_squeeze.py | 30 ++++++++++ .../pytorch_tests/test_unsqueeze.py | 28 ++++++++++ tests/layer_tests/pytorch_tests/test_view.py | 43 +++++++++++++++ 11 files changed, 292 insertions(+), 20 deletions(-) diff --git a/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py 
b/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py index b636ad806e2df7..483a5e82c7a881 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py @@ -178,7 +178,7 @@ def __init__(self, pt_module, fx_gm=None, nodes=None, self._input_signature = [] self._example_input = None - if issubclass(type(pt_module), torch.fx.graph_module.GraphModule): + if isinstance(pt_module, torch.fx.graph_module.GraphModule): self._input_is_list = None self._nodes = list(pt_module.graph.nodes) found_types = [] @@ -187,38 +187,34 @@ def __init__(self, pt_module, fx_gm=None, nodes=None, if value.op == 'placeholder': self._inputs.append(i) self._input_signature.append(value.name) - if hasattr(value, "meta") and ('tensor_meta' in value.meta.keys()) and value.meta['tensor_meta']: - found_shapes.append(value.meta['tensor_meta'].shape) - found_types.append( - OVAny(pt_to_ov_type_map[str(value.meta['tensor_meta'].dtype)])) - else: - found_shapes.append(None) - found_types.append(None) + + found_shapes.append(self.get_found_shape(value)) + found_types.append(self.get_found_dtype(value)) + if found_shapes[-1] is not None: + new_shape = [] + for dim in found_shapes[-1]: + if (dynamic_shapes or type(dim).__name__ == "SymInt"): + new_shape.append(-1) + else: + new_shape.append(dim) + found_shapes[-1] = torch.Size(new_shape) + elif value.op == 'output': # Instead of putting output index, refer to its target uargs = self.unpack_containers(value.args) self._outputs = [(arg[0], self._nodes.index(arg[1])) for arg in uargs if arg[1] is not None] - for idx, shape in enumerate(found_shapes): - if shape is not None: - new_shape = [] - for dim in shape: - if (dynamic_shapes or type(dim).__name__ == "SymInt"): - new_shape.append(-1) - else: - new_shape.append(dim) - found_shapes[idx] = torch.Size(new_shape) if not input_shapes or len(input_shapes) == 0: self.input_shapes = found_shapes if not input_types or len(input_types) == 0: self.input_types = found_types - if hasattr(pt_module, "forward"): - input_params = inspect.signature(pt_module.forward).parameters + if hasattr(self.pt_module, "forward"): + input_params = inspect.signature(self.pt_module.forward).parameters self._input_signature = list(input_params) - elif issubclass(type(pt_module), torch.fx.Node): + elif isinstance(pt_module, torch.fx.Node): self._nodes = nodes # passed from outer context # FIXME: Quadratic complexity nodes*nodes considering the outer loop over all nodes @@ -234,6 +230,23 @@ def __init__(self, pt_module, fx_gm=None, nodes=None, self.input_types.append( BaseFXDecoder.get_type_for_value(arg)) + @staticmethod + def get_found_shape(value): + # If input is a tensor, read the shape from metadata + if hasattr(value, "meta"): + if ('tensor_meta' in value.meta.keys()) and value.meta['tensor_meta']: + return value.meta['tensor_meta'].shape + if ('val' in value.meta.keys()) and isinstance(value.meta["val"], torch.Tensor): + return value.meta['val'].shape + return None + + @staticmethod + def get_found_dtype(value): + # If input is a tensor, read the data type from metadata + if hasattr(value, "meta") and ('tensor_meta' in value.meta.keys()) and value.meta['tensor_meta']: + return OVAny(pt_to_ov_type_map[str(value.meta['tensor_meta'].dtype)]) + return None + def get_input_signature_name(self, index: int) -> str: if self._input_signature is not None and index < len(self._input_signature): return self._input_signature[index] @@ -331,6 +344,8 @@ def
get_subgraph_decoder(self, index): def get_op_type(self): if self.pt_module.op == 'call_function': + if type(self.pt_module.target).__name__ == "EdgeOpOverload": + return self.pt_module.target.__name__ return str(self.pt_module.target) elif self.pt_module.op == 'get_attr': return 'get_attr' # FIXME should be aligned with get_attr from TS implementation diff --git a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/op_support.py b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/op_support.py index 8ca3b7b489f665..da36e091ee2ab6 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/op_support.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/op_support.py @@ -75,6 +75,7 @@ def __init__(self, options): "torch.ops.aten.argmin.default": None, "torch.ops.aten.as_strided.default": None, "torch.ops.aten.as_strided_.default": None, + "torch.ops.aten.as_strided_copy.default": None, "torch.ops.aten.asin.default": None, "torch.ops.aten.asinh.default": None, "torch.ops.aten.asinh.default": None, @@ -118,6 +119,7 @@ def __init__(self, options): "torch.ops.aten.erf.default": None, "torch.ops.aten.exp.default": None, "torch.ops.aten.expand.default": None, + "torch.ops.aten.expand_copy.default": None, "torch.ops.aten.fake_quantize_per_channel_affine_cachemask.default": None, "torch.ops.aten.fill.Scalar": None, "torch.ops.aten.fill_.Scalar": None, @@ -196,6 +198,7 @@ def __init__(self, options): "torch.ops.aten.new_zeros.default": None, "torch.ops.aten.ones.default": None, "torch.ops.aten.permute.default": None, + "torch.ops.aten.permute_copy.default": None, "torch.ops.aten.pow.Scalar": None, "torch.ops.aten.pow.Tensor_Scalar": None, "torch.ops.aten.pow.Tensor_Tensor": None, @@ -213,6 +216,7 @@ def __init__(self, options): "torch.ops.aten.scatter.src": None, "torch.ops.aten.scatter.value": None, "torch.ops.aten.select.int": None, + "torch.ops.aten.select_copy.int": None, "torch.ops.aten.select_scatter.default": None, "torch.ops.aten.sigmoid.default": None, "torch.ops.aten.sigmoid_.default": None, @@ -222,13 +226,16 @@ def __init__(self, options): "torch.ops.aten.sin.default": None, "torch.ops.aten.sinh.default": None, "torch.ops.aten.slice.Tensor": None, + "torch.ops.aten.slice_copy.Tensor": None, "torch.ops.aten.slice_scatter.default": None, "torch.ops.aten.sort.default": None, "torch.ops.aten.split.Tensor": None, "torch.ops.aten.split_with_sizes.default": None, + "torch.ops.aten.split_with_sizes_copy.default": None, "torch.ops.aten.sqrt.default": None, "torch.ops.aten.squeeze.dim": None, "torch.ops.aten.squeeze.dims": None, + "torch.ops.aten.squeeze_copy.dims": None, "torch.ops.aten.stack.default": None, "torch.ops.aten.std.correction": None, "torch.ops.aten.sub.default": None, @@ -246,10 +253,12 @@ def __init__(self, options): "torch.ops.aten.unbind.int": None, "torch.ops.aten.unfold.default": None, "torch.ops.aten.unsqueeze.default": None, + "torch.ops.aten.unsqueeze_copy.default": None, "torch.ops.aten.upsample_nearest2d.default": None, "torch.ops.aten.var.correction": None, "torch.ops.aten.var_mean.correction": None, "torch.ops.aten.view.default": None, + "torch.ops.aten.view_copy.default": None, "torch.ops.aten.where.self": None, "torch.ops.aten.zeros.default": None, "torch.ops.aten.zeros_like.default": None, diff --git a/src/frontends/pytorch/src/op_table.cpp b/src/frontends/pytorch/src/op_table.cpp index ef75a253f7506a..2d33b32472ba36 100644 --- a/src/frontends/pytorch/src/op_table.cpp +++ b/src/frontends/pytorch/src/op_table.cpp 
@@ -804,6 +804,7 @@ const std::unordered_map get_supported_ops_fx() { {"aten.argmin.default", op::translate_argmin}, {"aten.as_strided.default", op::translate_as_strided}, {"aten.as_strided_.default", op::translate_as_strided}, + {"aten.as_strided_copy.default", op::translate_as_strided}, {"aten.asin.default", op::translate_1to1_match_1_inputs_with_fp32_type_alignment}, {"aten.asinh.default", op::translate_1to1_match_1_inputs_with_fp32_type_alignment}, {"aten.atan.default", op::translate_1to1_match_1_inputs_with_fp32_type_alignment}, @@ -854,6 +855,7 @@ const std::unordered_map get_supported_ops_fx() { {"aten.exp.default", op::translate_1to1_match_1_inputs_with_fp32_type_alignment}, {"aten.expm1.default", op::translate_expm1}, {"aten.expand.default", op::translate_expand}, + {"aten.expand_copy.default", op::translate_expand}, {"aten.eye.m", op::translate_eye_fx}, {"aten.fake_quantize_per_channel_affine_cachemask.default", op::translate_fake_quantize_per_channel_affine_fx}, {"aten.fill.Scalar", op::translate_fill}, @@ -936,6 +938,7 @@ const std::unordered_map get_supported_ops_fx() { {"aten.ones.names", op::translate_ones_fx}, {"aten.ones_like.default", op::translate_ones_like_fx}, {"aten.permute.default", op::translate_permute}, + {"aten.permute_copy.default", op::translate_1to1_match_2_inputs}, {"aten.pow.Scalar", op::translate_pow}, {"aten.pow.Tensor_Scalar", op::translate_pow}, {"aten.pow.Tensor_Tensor", op::translate_pow}, @@ -958,6 +961,7 @@ const std::unordered_map get_supported_ops_fx() { {"aten.scatter.value", op::translate_scatter}, {"aten.scatter_add.default", op::translate_scatter_add}, {"aten.select.int", op::translate_select}, + {"aten.select_copy.int", op::translate_select}, {"aten.select_scatter.default", op::translate_select_scatter_fx}, {"aten.sigmoid.default", op::translate_1to1_match_1_inputs_with_fp32_type_alignment}, {"aten.sigmoid_.default", op::translate_1to1_match_1_inputs_with_fp32_type_alignment}, @@ -967,13 +971,16 @@ const std::unordered_map get_supported_ops_fx() { {"aten.sin.default", op::translate_1to1_match_1_inputs_with_fp32_type_alignment}, {"aten.sinh.default", op::translate_1to1_match_1_inputs_with_fp32_type_alignment}, {"aten.slice.Tensor", op::translate_slice_fx}, + {"aten.slice_copy.Tensor", op::translate_slice_fx}, {"aten.slice_scatter.default", op::translate_slice_scatter_fx}, {"aten.sort.default", op::translate_sort_fx}, {"aten.split.Tensor", op::translate_chunk_fx}, {"aten.split_with_sizes.default", op::translate_split_with_sizes_fx}, + {"aten.split_with_sizes_copy.default", op::translate_split_with_sizes_fx}, {"aten.sqrt.default", op::translate_1to1_match_1_inputs_with_fp32_type_alignment}, {"aten.squeeze.dim", op::translate_squeeze}, {"aten.squeeze.dims", op::translate_squeeze}, + {"aten.squeeze_copy.dims", op::translate_squeeze}, {"aten.stack.default", op::translate_stack_fx}, {"aten.std.correction", op::translate_std_fx}, {"aten.sub.default", op::translate_sub_fx}, @@ -991,10 +998,12 @@ const std::unordered_map get_supported_ops_fx() { {"aten.unbind.int", op::translate_unbind_int_fx}, {"aten.unfold.default", op::translate_unfold}, {"aten.unsqueeze.default", op::translate_1to1_match_2_inputs}, + {"aten.unsqueeze_copy.default", op::translate_1to1_match_2_inputs}, {"aten.upsample_nearest2d.default", op::translate_upsample_nearest2d}, {"aten.var.correction", op::translate_var_fx}, {"aten.var_mean.correction", op::translate_var_mean_fx}, {"aten.view.default", op::translate_reshape}, + {"aten.view_copy.default", op::translate_reshape}, 
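+    // Note: the aten.*_copy entries added in this patch reuse the translators of
+    // their view counterparts: the functional *_copy variants emitted by
+    // ExecuTorch compute the same values, just materialized into a new tensor.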
{"aten.view_as_complex.default", op::translate_view_as_complex}, {"aten.view_as_real.default", op::translate_view_as_real}, {"aten.where.self", op::translate_where}, diff --git a/tests/layer_tests/pytorch_tests/test_as_strided.py b/tests/layer_tests/pytorch_tests/test_as_strided.py index afdc25fd8657a7..674797ffda64cb 100644 --- a/tests/layer_tests/pytorch_tests/test_as_strided.py +++ b/tests/layer_tests/pytorch_tests/test_as_strided.py @@ -45,6 +45,40 @@ def forward(self, x): def test_as_strided(self, size, stride, offset, ie_device, precision, ir_version): self._test(*self.create_model(size, stride, offset), ie_device, precision, ir_version, trace_model=True) +class TestAsStridedCopy(PytorchLayerTest): + def _prepare_input(self): + return (np.random.randn(8, 8).astype(np.float32),) + + def create_model(self, size, stride, offset): + class aten_as_strided_copy(torch.nn.Module): + def __init__(self, size, stride, offset): + super().__init__() + self.size = size + self.stride = stride + self.offset = offset + + def forward(self, x): + return torch.as_strided_copy(x, self.size, self.stride, self.offset) + + ref_net = None + + return aten_as_strided_copy(size, stride, offset), ref_net, "aten::as_strided_copy" + + @pytest.mark.parametrize( + "size,stride", + [ + ([1], [1]), + ([2, 2], [1, 1]), + ([5, 4, 3], [1, 3, 7]), + ([5, 5, 5], [5, 0, 5]), + ([1, 2, 3, 4], [4, 3, 2, 1]), + ], + ) + @pytest.mark.parametrize("offset", [None, 1, 3, 7]) + @pytest.mark.precommit_fx_backend + def test_as_strided_copy(self, size, stride, offset, ie_device, precision, ir_version): + self._test(*self.create_model(size, stride, offset), ie_device, precision, ir_version, trace_model=True) + class TestAsStridedListConstruct(PytorchLayerTest): def _prepare_input(self, size_shape_tensor=[1], stride_shape_tensor=[1]): diff --git a/tests/layer_tests/pytorch_tests/test_expand.py b/tests/layer_tests/pytorch_tests/test_expand.py index e0f673fb927aaf..23d6eedf38bafe 100644 --- a/tests/layer_tests/pytorch_tests/test_expand.py +++ b/tests/layer_tests/pytorch_tests/test_expand.py @@ -41,6 +41,31 @@ def forward_broadcast(self, x): def test_expand(self, dims, op_type, ie_device, precision, ir_version): self._test(*self.create_model(dims, op_type), ie_device, precision, ir_version) +class TestExpandCopy(PytorchLayerTest): + def _prepare_input(self): + import numpy as np + return (np.random.randn(1, 3).astype(np.float32),) + + def create_model(self, dim): + import torch + + class aten_expand_copy(torch.nn.Module): + def __init__(self, dims): + super(aten_expand_copy, self).__init__() + self.dims = dims + + def forward(self, x): + return torch.expand_copy(x, self.dims) + + ref_net = None + + return aten_expand_copy(dim), ref_net, f"aten::expand_copy" + + @pytest.mark.parametrize("dims", [(4, 3), (-1, -1), (1, 2, 3), (1, 2, 2, 3)]) + @pytest.mark.precommit_fx_backend + def test_expand_copy(self, dims, ie_device, precision, ir_version): + self._test(*self.create_model(dims), ie_device, precision, ir_version) + class TestExpandList(PytorchLayerTest): def _prepare_input(self, broadcast_shape): import numpy as np diff --git a/tests/layer_tests/pytorch_tests/test_permute.py b/tests/layer_tests/pytorch_tests/test_permute.py index efbd77d371eb89..d4b342e67273bc 100644 --- a/tests/layer_tests/pytorch_tests/test_permute.py +++ b/tests/layer_tests/pytorch_tests/test_permute.py @@ -38,9 +38,35 @@ def forward(self, x): @pytest.mark.nightly @pytest.mark.precommit @pytest.mark.precommit_torch_export + @pytest.mark.precommit_fx_backend def 
test_permute(self, order, complex_type, ie_device, precision, ir_version): self._test(*self.create_model(order, complex_type), ie_device, precision, ir_version) +class TestPermuteCopy(PytorchLayerTest): + def _prepare_input(self): + import numpy as np + return (np.random.randn(1, 3, 224, 224).astype(np.float32),) + + def create_model(self, order): + import torch + + class aten_permute_copy(torch.nn.Module): + def __init__(self, order): + super(aten_permute_copy, self).__init__() + self.order = order + + def forward(self, x): + return torch.permute_copy(x, self.order) + + ref_net = None + + return aten_permute_copy(order), ref_net, "aten::permute_copy" + + @pytest.mark.parametrize("order", [[0, 2, 3, 1], [0, 3, 1, 2], [0, -1, 1, -2]]) + @pytest.mark.precommit_fx_backend + def test_permute_copy(self, order, ie_device, precision, ir_version): + self._test(*self.create_model(order), ie_device, precision, ir_version) + class TestPermuteList(PytorchLayerTest): def _prepare_input(self, permute_shape): diff --git a/tests/layer_tests/pytorch_tests/test_select.py b/tests/layer_tests/pytorch_tests/test_select.py index 5bd897e88148dd..828d0a57d60bdd 100644 --- a/tests/layer_tests/pytorch_tests/test_select.py +++ b/tests/layer_tests/pytorch_tests/test_select.py @@ -33,6 +33,34 @@ def forward(self, input_tensor): @pytest.mark.nightly @pytest.mark.precommit @pytest.mark.precommit_torch_export + @pytest.mark.precommit_fx_backend def test_select(self, ie_device, precision, ir_version, input_dim, input_index): self._test(*self.create_model(input_dim, input_index), ie_device, precision, ir_version) + +@pytest.mark.parametrize('input_dim', list(range(-3, 4))) +@pytest.mark.parametrize('input_index', list(range(-3, 4))) +class TestSelectCopy(PytorchLayerTest): + + def _prepare_input(self): + return (np.random.randn(4, 4, 5, 5).astype(np.float32),) + + def create_model(self, input_dim, input_index): + class aten_select_copy(torch.nn.Module): + + def __init__(self, input_dim, input_index) -> None: + super().__init__() + self.dim = input_dim + self.index = input_index + + def forward(self, input_tensor): + return torch.select_copy(input_tensor, int(self.dim), int(self.index)) + + ref_net = None + + return aten_select_copy(input_dim, input_index), ref_net, "aten::select_copy" + + @pytest.mark.precommit_fx_backend + def test_select_copy(self, ie_device, precision, ir_version, input_dim, input_index): + self._test(*self.create_model(input_dim, input_index), + ie_device, precision, ir_version) diff --git a/tests/layer_tests/pytorch_tests/test_split.py b/tests/layer_tests/pytorch_tests/test_split.py index e1ab4ed19ff701..497f314b7470c5 100644 --- a/tests/layer_tests/pytorch_tests/test_split.py +++ b/tests/layer_tests/pytorch_tests/test_split.py @@ -99,6 +99,31 @@ def forward(self, x, y): @pytest.mark.nightly @pytest.mark.precommit + @pytest.mark.precommit_fx_backend def test_split_with_sizes(self, ie_device, precision, ir_version): self._test(*self.create_model(), ie_device, precision, ir_version, trace_model=True) + +class TestSplitWithSizesCopy(PytorchLayerTest): + def _prepare_input(self): + import numpy as np + return (np.random.randn(20).astype(np.float32),np.random.randn(20).astype(np.float32)) + + def create_model(self): + import torch + + class aten_split_with_sizes_copy(torch.nn.Module): + def __init__(self): + super(aten_split_with_sizes_copy, self).__init__() + + def forward(self, x, y): + return torch.split_with_sizes_copy(x, [y.shape[0]], dim=0) + + ref_net = None + + return aten_split_with_sizes_copy(), 
ref_net, ["aten::split_with_sizes", "prim::ListConstruct"] + + @pytest.mark.precommit_fx_backend + def test_split_with_sizes_copy(self, ie_device, precision, ir_version): + self._test(*self.create_model(), + ie_device, precision, ir_version, trace_model=True) diff --git a/tests/layer_tests/pytorch_tests/test_squeeze.py b/tests/layer_tests/pytorch_tests/test_squeeze.py index 2f67ec89fcd481..3fa70532a46ee6 100644 --- a/tests/layer_tests/pytorch_tests/test_squeeze.py +++ b/tests/layer_tests/pytorch_tests/test_squeeze.py @@ -45,3 +45,33 @@ def test_squeeze(self, dim, dynamic_shapes, ie_device, precision, ir_version): def test_squeeze_non_1(self, dim, ie_device, precision, ir_version): # Dynamic shapes introduce dynamic rank, which is not supported by the Squeeze operation. self._test(*self.create_model(dim), ie_device, precision, ir_version, dynamic_shapes=False) + +class TestSqueezeCopy(PytorchLayerTest): + def _prepare_input(self): + import numpy as np + + return (np.random.randn(1, 1, 32).astype(np.float32),) + + def create_model(self, dim): + import torch + + class aten_squeeze_copy(torch.nn.Module): + def __init__(self, dim): + super(aten_squeeze_copy, self).__init__() + self.dim = dim + + def forward(self, x): + if self.dim is not None: + return torch.squeeze_copy(x, self.dim) + return torch.squeeze_copy(x) + + ref_net = None + + return aten_squeeze_copy(dim), ref_net, "aten::squeeze_copy" + + @pytest.mark.parametrize("dim,dynamic_shapes", [(-2, True), (0, True), (None, False)]) + @pytest.mark.precommit_fx_backend + def test_squeeze_copy(self, dim, dynamic_shapes, ie_device, precision, ir_version): + if PytorchLayerTest.use_torch_export() and dim is None: + pytest.xfail(reason="export fails if dim is not provided") + self._test(*self.create_model(dim), ie_device, precision, ir_version, dynamic_shapes=dynamic_shapes) diff --git a/tests/layer_tests/pytorch_tests/test_unsqueeze.py b/tests/layer_tests/pytorch_tests/test_unsqueeze.py index e77a43a7d79d8c..d7ba91be8b1487 100644 --- a/tests/layer_tests/pytorch_tests/test_unsqueeze.py +++ b/tests/layer_tests/pytorch_tests/test_unsqueeze.py @@ -41,5 +41,33 @@ def forward(self, x): @pytest.mark.nightly @pytest.mark.precommit @pytest.mark.precommit_torch_export + @pytest.mark.precommit_fx_backend def test_unsqueeze(self, inplace, dim, ie_device, precision, ir_version): self._test(*self.create_model(inplace, dim), ie_device, precision, ir_version) + +class TestUnsqueezeCopy(PytorchLayerTest): + def _prepare_input(self): + import numpy as np + return (np.random.randn(5, 10).astype(np.float32),) + + def create_model(self, dim=0): + import torch + + class aten_unsqueeze_copy(torch.nn.Module): + def __init__(self, dim): + super(aten_unsqueeze_copy, self).__init__() + self.op = torch.unsqueeze_copy + self.dim = dim + + def forward(self, x): + return x, self.op(x, self.dim) + + ref_net = None + model_class, op = (aten_unsqueeze_copy, "aten::unsqueeze_copy") + + return model_class(dim), ref_net, op + + @pytest.mark.parametrize("dim", [0, 1, -1]) + @pytest.mark.precommit_fx_backend + def test_unsqueeze_copy(self, dim, ie_device, precision, ir_version): + self._test(*self.create_model(dim), ie_device, precision, ir_version) diff --git a/tests/layer_tests/pytorch_tests/test_view.py b/tests/layer_tests/pytorch_tests/test_view.py index 3cdd42779b80e8..326249ce87dca9 100644 --- a/tests/layer_tests/pytorch_tests/test_view.py +++ b/tests/layer_tests/pytorch_tests/test_view.py @@ -142,6 +142,7 @@ def forward(self, input_tensor): @pytest.mark.nightly
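    # Note: the precommit_fx_backend marker added below also runs this test
    # through the FX graph path targeted by the ExecuTorch changes in this patch.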
@pytest.mark.precommit + @pytest.mark.precommit_fx_backend def test_view(self, ie_device, precision, ir_version, input_shapes): self.input_data = [] for input_shape in input_shapes: @@ -150,3 +151,45 @@ def test_view(self, ie_device, precision, ir_version, input_shapes): else: self.input_data.append(input_shape) self._test(*self.create_model(), ie_device, precision, ir_version) + +@pytest.mark.parametrize('input_shapes', +[ + [ + [2, 3, 2], 2, 6 + ], + [ + [4], 2, 2 + ], + [ + [4], 2, 2.1 + ] +]) +class TestViewCopy(PytorchLayerTest): + + def _prepare_input(self): + return (self.input_data[0],) + + def create_model(self): + class aten_view_copy(torch.nn.Module): + + def __init__(self, input_data) -> None: + super().__init__() + self.dim1 = input_data[1] + self.dim2 = input_data[2] + + def forward(self, input_tensor): + return torch.view_copy(input_tensor, [self.dim1, int(self.dim2)]) + + ref_net = None + + return aten_view_copy(self.input_data), ref_net, "aten::view_copy" + + @pytest.mark.precommit_fx_backend + def test_view_copy(self, ie_device, precision, ir_version, input_shapes): + self.input_data = [] + for input_shape in input_shapes: + if type(input_shape) is list: + self.input_data.append(np.random.randn(*input_shape).astype(np.float32)) + else: + self.input_data.append(input_shape) + self._test(*self.create_model(), ie_device, precision, ir_version) From e737014105e300628201daa13dde9127b6187caf Mon Sep 17 00:00:00 2001 From: Arseniy Obolenskiy Date: Sat, 15 Feb 2025 12:10:10 +0100 Subject: [PATCH 10/10] [Transformations] Hotfix for clang-format remarks (#29004) ### Details: Fix failing clang-format in pre-commit ### Tickets: - N/A --- .../scaled_dot_product_decomposition_test.cpp | 26 +++++++++---------- .../transformation_pipeline.cpp | 4 +-- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/src/common/transformations/tests/op_conversions/scaled_dot_product_decomposition_test.cpp b/src/common/transformations/tests/op_conversions/scaled_dot_product_decomposition_test.cpp index 82a0f89e83786c..c7fc7d2557ce54 100644 --- a/src/common/transformations/tests/op_conversions/scaled_dot_product_decomposition_test.cpp +++ b/src/common/transformations/tests/op_conversions/scaled_dot_product_decomposition_test.cpp @@ -31,13 +31,12 @@ using namespace ov; using namespace testing; -const std::shared_ptr scaled_dot_product_attention_decomposition( - std::shared_ptr query, - std::shared_ptr key, - std::shared_ptr value, - std::shared_ptr attention_mask, - std::shared_ptr scale, - bool casual); +const std::shared_ptr scaled_dot_product_attention_decomposition(std::shared_ptr query, + std::shared_ptr key, + std::shared_ptr value, + std::shared_ptr attention_mask, + std::shared_ptr scale, + bool casual); TEST_F(TransformationTestsF, ScaledDotProductAttentionDecompositionStaticBasic) { const PartialShape query_shape{1, 32, 32}; @@ -187,13 +186,12 @@ TEST_F(TransformationTestsF, ScaledDotProductAttentionDecompositionDynamic) { } } -const std::shared_ptr scaled_dot_product_attention_decomposition( - std::shared_ptr query, - std::shared_ptr key, - std::shared_ptr value, - std::shared_ptr attention_mask, - std::shared_ptr scale, - bool casual) { +const std::shared_ptr scaled_dot_product_attention_decomposition(std::shared_ptr query, + std::shared_ptr key, + std::shared_ptr value, + std::shared_ptr attention_mask, + std::shared_ptr scale, + bool casual) { const auto q_shape = std::make_shared(query, element::i32); const auto k_shape = std::make_shared(key, element::i32); const auto minus_one 
= ov::op::v0::Constant::create(element::i32, Shape{}, {-1}); diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index d69b96a8fe9402..5d095a4c80119b 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -119,9 +119,9 @@ #include "low_precision/fold_convert.hpp" #include "low_precision/fuse_convert.hpp" #include "low_precision/group_convolution.hpp" +#include "low_precision/mat_mul.hpp" #include "low_precision/multiply_to_group_convolution.hpp" #include "low_precision/network_helper.hpp" -#include "low_precision/mat_mul.hpp" #include "low_precision/recurrent_cell.hpp" #include "low_precision/rt_info/bias_attribute.hpp" #include "transformations/low_precision/mark_dequantization_subgraph.hpp" @@ -850,7 +850,7 @@ void Transformations::Lpt(const std::vector& defaultPrecision [&](const_node_ptr& node) -> bool { if (NetworkHelper::isConstantPath(node->get_input_node_shared_ptr(1)) && one_of(node->input_value(1).get_partial_shape().rank().get_length(), 2, 3)) { - return false; + return false; } return true; },
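
Note: the aten.*_copy support added above relies on a simple equivalence: each functional *_copy variant returns a freshly allocated tensor whose values match the corresponding view op, which is why the new op_table entries can reuse the existing translators. A minimal sketch of that property, assuming a PyTorch build recent enough to expose these variants at the torch level (the same calls the new layer tests make):

import torch

x = torch.randn(1, 3)

# The *_copy variants produce the same values as the corresponding view ops.
assert torch.equal(torch.expand_copy(x, (4, 3)), x.expand(4, 3))
assert torch.equal(torch.permute_copy(x, (1, 0)), x.permute(1, 0))
assert torch.equal(torch.view_copy(x, (3, 1)), x.view(3, 1))
assert torch.equal(torch.unsqueeze_copy(x, 0), x.unsqueeze(0))

# Unlike a view, the result owns its storage instead of aliasing the input.
assert torch.view_copy(x, (3,)).data_ptr() != x.data_ptr()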