From 6f7519ad4dc1920571de22eac41f66b6c09e76d3 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Fri, 14 Feb 2025 11:17:58 +0100 Subject: [PATCH 01/10] [Transformations] SDPA Decomposition: avoid unnecessary ShapeOf subgraphs (#28639) ### Details: - *Currently, `ScaledDotProductAttentionDecomposition` uses `ShapeOf->Gather` subgraphs to extract a specific dimension of input shapes. If the extracted dim is static but the whole shape is dynamic, such subgraphs are not folded by `ConstantFolding` pass whereas all the needed info can be extracted. This PR updates dim extraction logic: after the subgraph formation, `get_constant_from_source` tries to compute the subgraph, and replaces it with constant if possible* - *This change unblocks SDPA quantization for some scenarios* ### Tickets: - *CVS-161062* --- ...ed_dot_product_attention_decomposition.cpp | 21 +++++- .../scaled_dot_product_decomposition_test.cpp | 73 +++++++++++++++---- 2 files changed, 77 insertions(+), 17 deletions(-) diff --git a/src/common/transformations/src/transformations/op_conversions/scaled_dot_product_attention_decomposition.cpp b/src/common/transformations/src/transformations/op_conversions/scaled_dot_product_attention_decomposition.cpp index be18ab31dc19d3..3bfa8a009f74d4 100644 --- a/src/common/transformations/src/transformations/op_conversions/scaled_dot_product_attention_decomposition.cpp +++ b/src/common/transformations/src/transformations/op_conversions/scaled_dot_product_attention_decomposition.cpp @@ -8,6 +8,7 @@ #include "itt.hpp" #include "openvino/core/rt_info.hpp" +#include "openvino/core/validation_util.hpp" #include "openvino/op/add.hpp" #include "openvino/op/broadcast.hpp" #include "openvino/op/concat.hpp" @@ -68,9 +69,23 @@ std::shared_ptr ov::pass::ScaledDotProductAttentionDecomposition::deco auto one_f = register_new_node(one_i, query); auto zero_f = register_new_node(zero_i, query); + auto build_extract_dim_subgraph = [this, &zero_i](const std::shared_ptr& shape_of, + const int64_t idx) -> std::shared_ptr { + const auto dim_to_extract_const = v0::Constant::create(element::i32, Shape{}, {idx}); + const auto gather = std::make_shared(shape_of, dim_to_extract_const, zero_i); + // When dim_to_extract is static but the whole shape is dynamic, + // ConstantFolding can't fold ShapeOf->Gather subgraph in this case. + // So it's better to explicitly extract the needed dimension. 
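+        // For example, with an input of PartialShape{-1, -1, 24, 64}, extracting
+        // dim -1 yields Constant{64} even though the leading dims stay dynamic.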
+ if (auto constant = ov::util::get_constant_from_source(gather)) { + return register_new_node(constant); + } + register_new_node(dim_to_extract_const); + return register_new_node(gather); + }; + Output scale; if (node->get_input_size() < 5) { - scale = register_new_node(q_shape, minus_one, zero_i)->output(0); + scale = build_extract_dim_subgraph(q_shape, -1); scale = register_new_node(scale, query); auto sqrt_scale = register_new_node(scale); scale = register_new_node(one_f, sqrt_scale); @@ -112,8 +127,8 @@ std::shared_ptr ov::pass::ScaledDotProductAttentionDecomposition::deco atten_mask = mask; } } else { - auto target_s_len = register_new_node(q_shape, minus_two, zero_i); - auto source_s_len = register_new_node(k_shape, minus_two, zero_i); + auto target_s_len = build_extract_dim_subgraph(q_shape, -2); + auto source_s_len = build_extract_dim_subgraph(k_shape, -2); auto ssl = register_new_node(source_s_len, zero_i); auto tsl = register_new_node(target_s_len, zero_i); auto mask_shape = register_new_node(OutputVector{tsl, ssl}, 0); diff --git a/src/common/transformations/tests/op_conversions/scaled_dot_product_decomposition_test.cpp b/src/common/transformations/tests/op_conversions/scaled_dot_product_decomposition_test.cpp index c83c0c86d41f8d..82a0f89e83786c 100644 --- a/src/common/transformations/tests/op_conversions/scaled_dot_product_decomposition_test.cpp +++ b/src/common/transformations/tests/op_conversions/scaled_dot_product_decomposition_test.cpp @@ -32,12 +32,12 @@ using namespace ov; using namespace testing; const std::shared_ptr scaled_dot_product_attention_decomposition( - const std::shared_ptr query, - const std::shared_ptr key, - const std::shared_ptr value, - const std::shared_ptr attention_mask, - const std::shared_ptr scale, - const bool casual); + std::shared_ptr query, + std::shared_ptr key, + std::shared_ptr value, + std::shared_ptr attention_mask, + std::shared_ptr scale, + bool casual); TEST_F(TransformationTestsF, ScaledDotProductAttentionDecompositionStaticBasic) { const PartialShape query_shape{1, 32, 32}; @@ -129,6 +129,34 @@ TEST_F(TransformationTestsF, ScaledDotProductAttentionDecompositionStaticBroadca } } +TEST_F(TransformationTestsF, ScaledDotProductAttentionCasualPartiallyDynamic) { + const PartialShape query_shape{-1, -1, 24, 64}; + const PartialShape key_shape{-1, -1, 24, 64}; + const PartialShape value_shape{-1, -1, -1, 64}; + const PartialShape attention_mask_shape{-1, -1, -1, -1}; + const auto casual = true; + + const auto query = std::make_shared(element::f32, query_shape); + const auto key = std::make_shared(element::f32, key_shape); + const auto value = std::make_shared(element::f32, value_shape); + const auto attention_mask = std::make_shared(element::f32, attention_mask_shape); + { + const auto scaled_dot_product_attention = + std::make_shared(query, key, value, attention_mask, casual); + + model = std::make_shared(NodeVector{scaled_dot_product_attention}, + ParameterVector{query, key, value, attention_mask}); + manager.register_pass(); + } + + { + const auto scaled_dot_product_attention = + scaled_dot_product_attention_decomposition(query, key, value, attention_mask, nullptr, casual); + model_ref = std::make_shared(NodeVector{scaled_dot_product_attention}, + ParameterVector{query, key, value, attention_mask}); + } +} + TEST_F(TransformationTestsF, ScaledDotProductAttentionDecompositionDynamic) { const PartialShape query_shape{-1, -1, -1}; const PartialShape key_shape{-1, -1, -1}; @@ -160,12 +188,12 @@ TEST_F(TransformationTestsF, 
ScaledDotProductAttentionDecompositionDynamic) {
 }
 
 const std::shared_ptr scaled_dot_product_attention_decomposition(
-    const std::shared_ptr query,
-    const std::shared_ptr key,
-    const std::shared_ptr value,
-    const std::shared_ptr attention_mask,
-    const std::shared_ptr scale,
-    const bool casual) {
+    std::shared_ptr query,
+    std::shared_ptr key,
+    std::shared_ptr value,
+    std::shared_ptr attention_mask,
+    std::shared_ptr scale,
+    bool casual) {
     const auto q_shape = std::make_shared(query, element::i32);
     const auto k_shape = std::make_shared(key, element::i32);
     const auto minus_one = ov::op::v0::Constant::create(element::i32, Shape{}, {-1});
@@ -175,6 +203,23 @@ const std::shared_ptr scaled_dot_product_attention_decomposition(
     const auto one_f = std::make_shared(one_i, query);
     const auto zero_f = std::make_shared(zero_i, query);
 
+    auto extract_dim = [&zero_i](const std::shared_ptr& shape_of,
+                                 const int64_t idx) -> std::shared_ptr {
+        const auto& shape = shape_of->get_input_partial_shape(0);
+        const auto& dim = shape[idx];
+        if (dim.is_static()) {
+            return ov::op::v0::Constant::create(element::i32, Shape{}, {dim.get_length()});
+        }
+        const auto dim_to_extract_const = ov::op::v0::Constant::create(element::i32, Shape{}, {idx});
+        return std::make_shared(shape_of, dim_to_extract_const, zero_i);
+    };
+
+    if (scale == nullptr) {
+        scale = extract_dim(q_shape, -1);
+        scale = std::make_shared(scale, query);
+        auto sqrt_scale = std::make_shared(scale);
+        scale = std::make_shared(one_f, sqrt_scale);
+    }
     const auto q_scaled = std::make_shared(query, scale);
     auto k_rank = std::make_shared(k_shape, element::i32)->output(0);
     const auto k_last_dim = std::make_shared(k_rank, minus_one);
@@ -204,8 +249,8 @@ const std::shared_ptr scaled_dot_product_attention_decomposition(
             atten_mask = mask;
         }
     } else {
-        const auto target_s_len = std::make_shared(q_shape, minus_two, zero_i);
-        const auto source_s_len = std::make_shared(k_shape, minus_two, zero_i);
+        const auto target_s_len = extract_dim(q_shape, -2);
+        const auto source_s_len = extract_dim(k_shape, -2);
         const auto ssl = std::make_shared(source_s_len, zero_i);
         const auto tsl = std::make_shared(target_s_len, zero_i);
         const auto mask_shape = std::make_shared(OutputVector{tsl, ssl}, 0);

From eb44f8d0a5c16e848fdff889dd92e19d2ab89cfe Mon Sep 17 00:00:00 2001
From: Arseniy Obolenskiy
Date: Fri, 14 Feb 2025 12:51:30 +0100
Subject: [PATCH 02/10] Make custom ThreadLocal implementation copyable
 (#28900)

### Details:
Fix the ARM 32-bit build with the `-DENABLE_DEBUG_CAPS=ON` CMake configuration flag. Make the `ThreadLocal` class implementation copyable where TBB threading is not enabled; the original `tbb::enumerable_thread_specific` has a copyable interface.
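For reference, a minimal sketch of the non-TBB fallback this makes copyable — the diff below only shows `_map`, `_create`, and `_mutex`, so the `std::map<std::thread::id, T>` store and the class name here are assumptions; the locking in the copy operations mirrors the actual change:

```cpp
#include <functional>
#include <map>
#include <mutex>
#include <thread>

template <typename T>
struct ThreadLocalSketch {
    explicit ThreadLocalSketch(std::function<T()> create) : _create(std::move(create)) {}

    // Copying locks the *source* mutex: another thread may be inserting into
    // other._map via local() while the snapshot is taken.
    ThreadLocalSketch(const ThreadLocalSketch& other) : _create(other._create) {
        std::lock_guard<std::mutex> lock{other._mutex};
        _map = other._map;  // snapshot of the per-thread values
    }

    ThreadLocalSketch& operator=(const ThreadLocalSketch& other) {
        std::lock_guard<std::mutex> lock{other._mutex};
        _map = other._map;
        _create = other._create;
        return *this;
    }

    // Returns this thread's value, creating it on first access.
    T& local() {
        std::lock_guard<std::mutex> lock{_mutex};
        auto it = _map.find(std::this_thread::get_id());
        if (it == _map.end()) {
            it = _map.emplace(std::this_thread::get_id(), _create()).first;
        }
        return it->second;
    }

private:
    mutable std::mutex _mutex;
    std::map<std::thread::id, T> _map;
    std::function<T()> _create;
};
```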
### Tickets: - N/A --- .../openvino/runtime/threading/thread_local.hpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/inference/dev_api/openvino/runtime/threading/thread_local.hpp b/src/inference/dev_api/openvino/runtime/threading/thread_local.hpp index 9a3edee83fc592..32f2a5b732b40a 100644 --- a/src/inference/dev_api/openvino/runtime/threading/thread_local.hpp +++ b/src/inference/dev_api/openvino/runtime/threading/thread_local.hpp @@ -59,8 +59,16 @@ struct ThreadLocal { _create = std::move(other._create); return *this; } - ThreadLocal(const ThreadLocal&) = delete; - ThreadLocal& operator=(const ThreadLocal&&) = delete; + ThreadLocal(const ThreadLocal& other) : _create(other._create) { + std::lock_guard lock{other._mutex}; + _map = other._map; + } + ThreadLocal& operator=(const ThreadLocal& other) { + std::lock_guard lock{other._mutex}; + _map = other._map; + _create = other._create; + return *this; + } explicit ThreadLocal(const Create& create_) : _create{create_} {} T& local() { From 9d92d9c35a83a9483677b4cb7521cac330b14345 Mon Sep 17 00:00:00 2001 From: Srinjoy Dutta <114402816+srinjoydutta03@users.noreply.github.com> Date: Fri, 14 Feb 2025 17:47:15 +0530 Subject: [PATCH 03/10] [CPU][ARM64] Implemented JIT Emitter for Eltwise Squared Difference Operation (#28989) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Details: - Implemented and added jit_squared_difference_emitter derived class for element wise squared difference operation - Added entry Algorithm::EltwiseSquaredDifference, in executors/aarch64 as one of the supported algorithms - Added entry in the get_supported_precisions and create_eltwise_emitters in kernel/aarch64 - Added `utils::EltwiseTypes::SQUARED_DIFF` in `jit` kernel check in the tests ### Tests: Passed local tests using `./bin/arm64/Release/ov_cpu_func_tests --gtest_filter="*smoke*Eltwise*SqDiff*"` Screenshot 2025-02-14 at 2 04 25 PM ### Tickets - Closes #27502 CC: @a-sidorova --- .../plugin/aarch64/jit_eltwise_emitters.cpp | 43 +++++++++++++++++++ .../plugin/aarch64/jit_eltwise_emitters.hpp | 22 ++++++++++ .../nodes/executors/aarch64/jit_eltwise.cpp | 1 + .../aarch64/jit_uni_eltwise_generic.cpp | 2 + .../single_layer_tests/classes/eltwise.cpp | 3 +- 5 files changed, 70 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp index 9e4e7160026568..e925abcadab907 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp @@ -2704,6 +2704,49 @@ std::set> jit_sqrt_emitter::get_supported_precisions( return {{element::f32}}; } +/// SQUARED DIFFERENCE /// +jit_squared_difference_emitter::jit_squared_difference_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} + +jit_squared_difference_emitter::jit_squared_difference_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} + +size_t jit_squared_difference_emitter::get_inputs_count() const { + return 2; +} + +void jit_squared_difference_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& 
out_vec_idxs) const { + if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + OV_CPU_JIT_EMITTER_THROW("Can't create jit eltwise kernel"); + } +} + +template +void jit_squared_difference_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { + OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); + + using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; + TReg src0 = TReg(in_vec_idxs[0]); + TReg src1 = TReg(in_vec_idxs[1]); + TReg dst = TReg(out_vec_idxs[0]); + + h->fsub(dst.s, src0.s, src1.s); + h->fmul(dst.s, dst.s, dst.s); +} + +std::set> jit_squared_difference_emitter::get_supported_precisions( + const std::shared_ptr& node) { + return {{element::f32, element::f32}}; +} + /// SUBTRACT /// jit_subtract_emitter::jit_subtract_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp index ca0e2dd8e157e6..f24f3a0bda4c37 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp @@ -1104,6 +1104,28 @@ class jit_sqrt_emitter : public jit_emitter { void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; +class jit_squared_difference_emitter : public jit_emitter { +public: + jit_squared_difference_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc = ov::element::f32); + + jit_squared_difference_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node); + + size_t get_inputs_count() const override; + + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); + +private: + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; + + template + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; +}; + class jit_subtract_emitter : public jit_emitter { public: jit_subtract_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, diff --git a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp index bb5d79ad56de6d..d2238b2d9f182c 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp @@ -57,6 +57,7 @@ bool JitEltwiseExecutor::isSupported(const Algorithm& algorithm, Algorithm::EltwiseSigmoid, Algorithm::EltwiseSoftSign, Algorithm::EltwiseSqrt, + Algorithm::EltwiseSquaredDifference, Algorithm::EltwiseSubtract, Algorithm::EltwiseSwish, Algorithm::EltwiseTanh); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp index d820bf9ea12775..818d7bdbfef684 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp @@ -702,6 +702,7 @@ std::shared_ptr jit_uni_eltwise_generic::create_eltwise_emitte OV_CASE(Algorithm::EltwiseSigmoid, 
ov::intel_cpu::aarch64::jit_sigmoid_emitter), OV_CASE(Algorithm::EltwiseSoftSign, ov::intel_cpu::aarch64::jit_soft_sign_emitter), OV_CASE(Algorithm::EltwiseSqrt, ov::intel_cpu::aarch64::jit_sqrt_emitter), + OV_CASE(Algorithm::EltwiseSquaredDifference, ov::intel_cpu::aarch64::jit_squared_difference_emitter), OV_CASE(Algorithm::EltwiseSubtract, ov::intel_cpu::aarch64::jit_subtract_emitter), OV_CASE(Algorithm::EltwiseSwish, ov::intel_cpu::aarch64::jit_swish_emitter), OV_CASE(Algorithm::EltwiseTanh, ov::intel_cpu::aarch64::jit_tanh_emitter)); @@ -836,6 +837,7 @@ std::set> eltwise_precision_helper::get_supported_pre OV_CASE(Algorithm::EltwiseSigmoid, jit_sigmoid_emitter), OV_CASE(Algorithm::EltwiseSoftSign, jit_soft_sign_emitter), OV_CASE(Algorithm::EltwiseSqrt, jit_sqrt_emitter), + OV_CASE(Algorithm::EltwiseSquaredDifference, jit_squared_difference_emitter), OV_CASE(Algorithm::EltwiseSubtract, jit_subtract_emitter), OV_CASE(Algorithm::EltwiseSwish, jit_swish_emitter), OV_CASE(Algorithm::EltwiseTanh, jit_tanh_emitter)); diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise.cpp index 3f48b1f0b1e976..1fea147aa63318 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise.cpp @@ -260,7 +260,8 @@ std::string EltwiseLayerCPUTest::getPrimitiveType(const utils::EltwiseTypes& elt (eltwise_type == utils::EltwiseTypes::SUBTRACT) || (eltwise_type == utils::EltwiseTypes::DIVIDE) || (eltwise_type == utils::EltwiseTypes::FLOOR_MOD) || - (eltwise_type == utils::EltwiseTypes::MOD)) { + (eltwise_type == utils::EltwiseTypes::MOD) || + (eltwise_type == utils::EltwiseTypes::SQUARED_DIFF)) { return "jit"; } #endif From d7ecf527148b9bea47e386592047d8ce2b4c3a00 Mon Sep 17 00:00:00 2001 From: Stefania Hergane Date: Fri, 14 Feb 2025 14:32:20 +0200 Subject: [PATCH 04/10] [NPU] Add datatype NF4 support (#27903) ### Details: - Add datatype NF4 support in NPU plugin - *...* --------- Signed-off-by: Stefania Hergane Co-authored-by: Stepan --- .../intel_npu/src/backend/src/zero_infer_request.cpp | 7 +++++-- .../src/compiler_adapter/src/driver_compiler_adapter.cpp | 2 ++ src/plugins/intel_npu/src/utils/src/zero/zero_wrappers.cpp | 2 +- src/plugins/intel_npu/thirdparty/level-zero-ext | 2 +- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp index aee73a2b73fa31..df9cc4eb328133 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp @@ -726,6 +726,8 @@ void ZeroInferRequest::check_network_precision(const ov::element::Type_t precisi break; case ov::element::Type_t::bf16: break; + case ov::element::Type_t::nf4: + break; case ov::element::Type_t::u4: break; case ov::element::Type_t::i4: @@ -749,8 +751,9 @@ void ZeroInferRequest::check_network_precision(const ov::element::Type_t precisi case ov::element::Type_t::f64: break; default: - OPENVINO_THROW("Unsupported tensor precision: " + ov::element::Type(precision).get_type_name() + - "! Supported precisions: FP32, FP16, BF16, U4, I4, U8, I8, U16, I16, U32, I32, U64, I64, FP64"); + OPENVINO_THROW( + "Unsupported tensor precision: " + ov::element::Type(precision).get_type_name() + + "! 
Supported precisions: FP32, FP16, BF16, NF4, U4, I4, U8, I8, U16, I16, U32, I32, U64, I64, FP64"); } } diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp index 624ba448fed44f..51e42478e7cebc 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp +++ b/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp @@ -78,6 +78,8 @@ std::string ovPrecisionToLegacyPrecisionString(const ov::element::Type& precisio return "FP64"; case ov::element::Type_t::bf16: return "BF16"; + case ov::element::Type_t::nf4: + return "NF4"; case ov::element::Type_t::i4: return "I4"; case ov::element::Type_t::i8: diff --git a/src/plugins/intel_npu/src/utils/src/zero/zero_wrappers.cpp b/src/plugins/intel_npu/src/utils/src/zero/zero_wrappers.cpp index 4868d6326c5fe4..1d3c0cc23e6a98 100644 --- a/src/plugins/intel_npu/src/utils/src/zero/zero_wrappers.cpp +++ b/src/plugins/intel_npu/src/utils/src/zero/zero_wrappers.cpp @@ -69,7 +69,7 @@ CommandList::CommandList(const std::shared_ptr& initStruc if (mtci_is_supported) { ze_mutable_command_id_exp_desc_t mutableCmdIdDesc = {ZE_STRUCTURE_TYPE_MUTABLE_COMMAND_ID_EXP_DESC, nullptr, - ZE_MUTABLE_COMMAND_EXP_FLAG_GRAPH_ARGUMENT}; + ZE_MUTABLE_COMMAND_EXP_FLAG_GRAPH_ARGUMENT_DEPRECATED}; THROW_ON_FAIL_FOR_LEVELZERO("zeCommandListGetNextCommandIdExp", zeCommandListGetNextCommandIdExp(_handle, &mutableCmdIdDesc, &_command_id)); } diff --git a/src/plugins/intel_npu/thirdparty/level-zero-ext b/src/plugins/intel_npu/thirdparty/level-zero-ext index 110f48ee8eda22..c0156a3390ae39 160000 --- a/src/plugins/intel_npu/thirdparty/level-zero-ext +++ b/src/plugins/intel_npu/thirdparty/level-zero-ext @@ -1 +1 @@ -Subproject commit 110f48ee8eda22d8b40daeeecdbbed0fc3b08f8b +Subproject commit c0156a3390ae39671ff8f2a6f5471f04bb65bb12 From 1d1d09952d4412fa400474a5f10a170974c89466 Mon Sep 17 00:00:00 2001 From: Pawel Raasz Date: Fri, 14 Feb 2025 14:49:51 +0100 Subject: [PATCH 05/10] [CI] Update reviewdog to not used deprecated option and check all files (#28974) ### Details: - Update reviewdog for code style check: * remove deprecated option usage * add nofilter for check files to check all not diff only * exclude temp and thirdparty code - Apply code style in affected files ### Related PRs: - #28946 ### Tickets: - N/A --------- Signed-off-by: Raasz, Pawel --- .github/workflows/code_style.yml | 14 +++++++++++--- .../onnx/frontend/src/utils/onnx_internal.cpp | 3 ++- .../onnx/tests/onnx_import_convpool.in.cpp | 9 ++++++--- .../src/graph_iterator_saved_model.cpp | 3 +-- .../snippets/aarch64/jit_loop_emitters.cpp | 6 +++--- .../snippets/aarch64/jit_loop_emitters.hpp | 6 +++--- .../emitters/snippets/x64/jit_loop_emitters.cpp | 6 +++--- .../emitters/snippets/x64/jit_loop_emitters.hpp | 6 +++--- .../snippets/x64/jit_reg_spill_emitters.cpp | 6 +++--- .../snippets/x64/jit_reg_spill_emitters.hpp | 12 ++++++------ .../src/nodes/executors/aarch64/jit_eltwise.cpp | 2 +- .../kernels/aarch64/jit_uni_eltwise_generic.cpp | 12 ++++++------ .../src/nodes/kernels/x64/mlp_kernel.hpp | 6 +++--- .../sea_itt_lib/IttNotifyStdSrc.cpp | 16 +++++++--------- 14 files changed, 58 insertions(+), 49 deletions(-) diff --git a/.github/workflows/code_style.yml b/.github/workflows/code_style.yml index 8f69ee7cd4ca74..89527187272227 100644 --- a/.github/workflows/code_style.yml +++ b/.github/workflows/code_style.yml @@ -39,7 +39,11 @@ jobs: with: github_token: ${{ 
secrets.GITHUB_TOKEN }} level: warning - fail_on_error: true + fail_level: error + filter_mode: nofilter + exclude: | + "*/thirdparty/*" + "./temp/*" clang-format-aarch64: runs-on: ubuntu-22.04 @@ -71,7 +75,11 @@ jobs: with: github_token: ${{ secrets.GITHUB_TOKEN }} level: warning - fail_on_error: true + fail_level: error + filter_mode: nofilter + exclude: | + "*/thirdparty/*" + "./temp/*" ShellCheck: runs-on: ubuntu-22.04 @@ -103,7 +111,7 @@ jobs: level: style reporter: github-pr-review check_all_files_with_shebangs: true - fail_on_error: true + fail_level: error exclude: | "*/thirdparty/*" "./temp/*" diff --git a/src/frontends/onnx/frontend/src/utils/onnx_internal.cpp b/src/frontends/onnx/frontend/src/utils/onnx_internal.cpp index a5b68b8f9d98a5..96aad10dbf928c 100644 --- a/src/frontends/onnx/frontend/src/utils/onnx_internal.cpp +++ b/src/frontends/onnx/frontend/src/utils/onnx_internal.cpp @@ -100,7 +100,8 @@ std::shared_ptr decode_to_framework_nodes(std::shared_ptr detail::MappedMemoryHandles mmap_cache, ov::frontend::ExtensionHolder extensions) { apply_transformations(*model_proto); - auto graph = std::make_shared(ov::util::get_directory(model_path).string(), model_proto, mmap_cache, extensions); + auto graph = + std::make_shared(ov::util::get_directory(model_path).string(), model_proto, mmap_cache, extensions); return graph->decode(); } } // namespace ov::frontend::onnx::detail diff --git a/src/frontends/onnx/tests/onnx_import_convpool.in.cpp b/src/frontends/onnx/tests/onnx_import_convpool.in.cpp index cd944d44929abf..ae9188b8e7df5e 100644 --- a/src/frontends/onnx/tests/onnx_import_convpool.in.cpp +++ b/src/frontends/onnx/tests/onnx_import_convpool.in.cpp @@ -426,14 +426,17 @@ OPENVINO_TEST(${BACKEND_NAME}, onnx_model_convtranspose_output_shape) { test_case.add_input_from_file(util::path_join({ov::test::utils::getExecutableDirectory(), TEST_ONNX_MODELS_DIRNAME, - "files/convtranspose_output_shape/x.bin"}).string()); + "files/convtranspose_output_shape/x.bin"}) + .string()); test_case.add_input_from_file(util::path_join({ov::test::utils::getExecutableDirectory(), TEST_ONNX_MODELS_DIRNAME, - "files/convtranspose_output_shape/w.bin"}).string()); + "files/convtranspose_output_shape/w.bin"}) + .string()); test_case.add_expected_output_from_file({1, 2, 10, 8}, util::path_join({ov::test::utils::getExecutableDirectory(), TEST_ONNX_MODELS_DIRNAME, - "files/convtranspose_output_shape/y.bin"}).string()); + "files/convtranspose_output_shape/y.bin"}) + .string()); test_case.run(); } diff --git a/src/frontends/tensorflow/src/graph_iterator_saved_model.cpp b/src/frontends/tensorflow/src/graph_iterator_saved_model.cpp index 91ac2efc73d834..8d64d6f2bcb045 100644 --- a/src/frontends/tensorflow/src/graph_iterator_saved_model.cpp +++ b/src/frontends/tensorflow/src/graph_iterator_saved_model.cpp @@ -43,8 +43,7 @@ bool GraphIteratorSavedModel::is_supported(const std::string& path) { #if defined(OPENVINO_ENABLE_UNICODE_PATH_SUPPORT) && defined(_WIN32) bool GraphIteratorSavedModel::is_supported(const std::wstring& path) { - return ov::util::directory_exists(path) && - ov::util::file_exists(ov::util::path_join_w({path, L"saved_model.pb"})); + return ov::util::directory_exists(path) && ov::util::file_exists(ov::util::path_join_w({path, L"saved_model.pb"})); } #endif diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.cpp index bac1591dc0b76a..3fbdc645a03a4f 100644 --- 
a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.cpp @@ -124,9 +124,9 @@ void jit_loop_end_emitter::validate_arguments(const std::vector& in, con } void jit_loop_end_emitter::emit_code_impl(const std::vector& in, - const std::vector& out, - const std::vector& pool_vec_idxs, - const std::vector& pool_gpr_idxs) const { + const std::vector& out, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { validate_arguments(in, out); emit_impl(in, out); } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.hpp index eddfe64cdc90dd..8f3480cc064c7b 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.hpp @@ -54,9 +54,9 @@ class jit_loop_end_emitter : public jit_emitter { } void emit_code_impl(const std::vector& in_idxs, - const std::vector& out_idxs, - const std::vector& pool_vec_idxs, - const std::vector& pool_gpr_idxs) const override; + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const override; protected: void validate_arguments(const std::vector& in, const std::vector& out) const override; diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp index 15c7b85c1928da..54597a90a8504b 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp @@ -200,9 +200,9 @@ void jit_loop_end_emitter::validate_arguments(const std::vector& in, con } void jit_loop_end_emitter::emit_code_impl(const std::vector& in, - const std::vector& out, - const std::vector& pool_vec_idxs, - const std::vector& pool_gpr_idxs) const { + const std::vector& out, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { validate_arguments(in, out); jit_emitter::emit_code_impl(in, out, pool_vec_idxs, pool_gpr_idxs); } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.hpp index 7b66cd8b54b48a..3a15ab0cf6fbea 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.hpp @@ -67,9 +67,9 @@ class jit_loop_end_emitter : public jit_emitter { } void emit_code_impl(const std::vector& in_idxs, - const std::vector& out_idxs, - const std::vector& pool_vec_idxs, - const std::vector& pool_gpr_idxs) const override; + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const override; protected: void validate_arguments(const std::vector& in, const std::vector& out) const override; diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_reg_spill_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_reg_spill_emitters.cpp index 59f84c75ee28ee..8c848567f579e0 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_reg_spill_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_reg_spill_emitters.cpp @@ -71,9 +71,9 @@ void jit_reg_spill_end_emitter::validate_arguments(const std::vector& in } void jit_reg_spill_end_emitter::emit_code_impl(const 
std::vector& in, - const std::vector& out, - const std::vector& pool_vec_idxs, - const std::vector& pool_gpr_idxs) const { + const std::vector& out, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { validate_arguments(in, out); emit_impl(in, out); } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_reg_spill_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_reg_spill_emitters.hpp index a67caeb15ed6fb..0c7518d9df4b07 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_reg_spill_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_reg_spill_emitters.hpp @@ -27,9 +27,9 @@ class jit_reg_spill_begin_emitter : public jit_emitter { void validate_arguments(const std::vector& in, const std::vector& out) const override; void emit_impl(const std::vector& in, const std::vector& out) const override; void emit_code_impl(const std::vector& in_idxs, - const std::vector& out_idxs, - const std::vector& pool_vec_idxs, - const std::vector& pool_gpr_idxs) const override; + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const override; std::set m_regs_to_spill; std::shared_ptr m_abi_reg_spiller; }; @@ -49,9 +49,9 @@ class jit_reg_spill_end_emitter : public jit_emitter { } void emit_code_impl(const std::vector& in_idxs, - const std::vector& out_idxs, - const std::vector& pool_vec_idxs, - const std::vector& pool_gpr_idxs) const override; + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const override; protected: void validate_arguments(const std::vector& in, const std::vector& out) const override; diff --git a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp index d2238b2d9f182c..492bf9d2899790 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp @@ -47,7 +47,7 @@ bool JitEltwiseExecutor::isSupported(const Algorithm& algorithm, Algorithm::EltwiseMod, Algorithm::EltwiseMultiply, Algorithm::EltwiseMulAdd, - Algorithm::EltwiseNotEqual, + Algorithm::EltwiseNotEqual, Algorithm::EltwisePowerStatic, Algorithm::EltwisePrelu, Algorithm::EltwiseRelu, diff --git a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp index 818d7bdbfef684..ca82079b58ba68 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp @@ -29,7 +29,10 @@ void jit_uni_eltwise_generic::generate() { preamble(); static const std::vector exec_precisions_priority = {element::f16, element::f32}; - auto const exec_prc = eltwise_precision_helper::get_precision(jep_.inputs_number, jep_.src_prc, eltwise_data_, exec_precisions_priority); + auto const exec_prc = eltwise_precision_helper::get_precision(jep_.inputs_number, + jep_.src_prc, + eltwise_data_, + exec_precisions_priority); eltwise_emitter = create_eltwise_emitter(eltwise_data_.front(), exec_prc); for (size_t i = 1; i < eltwise_data_.size(); ++i) { @@ -46,8 +49,7 @@ void jit_uni_eltwise_generic::generate() { for (size_t i = 0; i < jep.inputs_number; i++) { ldr(start_to_offsets, ptr(reg_const_params, - static_cast(offsetof(jit_eltwise_call_args_ptrs, src_offsets) + - i * sizeof(size_t)))); + 
static_cast(offsetof(jit_eltwise_call_args_ptrs, src_offsets) + i * sizeof(size_t)))); ldr(get_src_reg(i), ptr(reg_const_params, static_cast(offsetof(jit_eltwise_call_args_ptrs, src_ptr[0]) + i * sizeof(size_t)))); @@ -91,8 +93,7 @@ void jit_uni_eltwise_generic::generate() { for (size_t i = 0; i < jep.inputs_number; i++) { ldr(get_src_reg(i), - ptr(param1, - static_cast(offsetof(jit_eltwise_call_args_ptrs, src_ptr) + i * sizeof(size_t)))); + ptr(param1, static_cast(offsetof(jit_eltwise_call_args_ptrs, src_ptr) + i * sizeof(size_t)))); init_ptrs_with_offsets(get_src_reg(i), jep.src_offsets[i]); } @@ -786,7 +787,6 @@ struct SupportedPrecisions { }; } // namespace - using namespace aarch64; std::set> eltwise_precision_helper::get_supported_precisions(const Algorithm& algo) { diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/mlp_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/mlp_kernel.hpp index 438d84c16b3ece..e3aae6ab38eaec 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/mlp_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/mlp_kernel.hpp @@ -529,8 +529,8 @@ class ReduceAdd2bh : public dnnl::impl::cpu::x64::jit_generator { struct CallArgs { float* src0; float* src1; - int16_t * dst; - int16_t * prefetch_dst; + int16_t* dst; + int16_t* prefetch_dst; int64_t num_cols; }; // add two float input eltwise and convert to bf16 : ConvertFP32toBF16(src0 + src1) @@ -545,7 +545,7 @@ class ReduceAdd2bh : public dnnl::impl::cpu::x64::jit_generator { // the prefetch distance is increased to ensure by the time store happens // prefetch has done and no HW prefetcher is triggered args.prefetch_dst = (m + 2 < num_rows) ? (args.dst + 2 * dst_stride) : (args.dst); - + (*this)(&args); } } diff --git a/thirdparty/itt_collector/sea_itt_lib/IttNotifyStdSrc.cpp b/thirdparty/itt_collector/sea_itt_lib/IttNotifyStdSrc.cpp index ee2daf88d79ece..c2372a5f3f7bb6 100644 --- a/thirdparty/itt_collector/sea_itt_lib/IttNotifyStdSrc.cpp +++ b/thirdparty/itt_collector/sea_itt_lib/IttNotifyStdSrc.cpp @@ -1577,15 +1577,13 @@ void FillApiList(__itt_api_info* api_list_ptr) { #define ITT_STUB_IMPL_ORIG(name) ITT_STUB_IMPL(name) #ifdef _DEBUG // dangerous stub that doesn't return anything (even when expected) but records the function call for // statistics sake -# define ITT_STUB_NO_IMPL(fn) \ - if (0 == strcmp("__itt_" ITT_TO_STR(fn), api_list_ptr[i].name)) { \ - struct local { \ - static void stub(...) { \ - CIttFnStat oIttFnStat("NO IMPL:\t" ITT_TO_STR(fn)); \ - } \ - }; \ - *api_list_ptr[i].func_ptr = reinterpret_cast(local::stub); \ - continue; \ +# define ITT_STUB_NO_IMPL(fn) \ + if (0 == strcmp("__itt_" ITT_TO_STR(fn), api_list_ptr[i].name)) { \ + struct local { \ + static void stub(...) 
{ CIttFnStat oIttFnStat("NO IMPL:\t" ITT_TO_STR(fn)); } \ + }; \ + *api_list_ptr[i].func_ptr = reinterpret_cast(local::stub); \ + continue; \ } #else # define ITT_STUB_NO_IMPL(fn) From 0aa889db67b63cc661bb4e5fe7a0706313865d6b Mon Sep 17 00:00:00 2001 From: Aleksandr Voron Date: Fri, 14 Feb 2025 17:00:53 +0100 Subject: [PATCH 06/10] [CPU][ACL] int8 MatMul support (#28870) ### Details: - oneDNN has `acl_lowp_matmul_t` support brought by ACL team here: https://github.com/oneapi-src/oneDNN/pull/1885 - The goal is to leverage this integration instead of implementing int8 MM executor from the scratch here: https://github.com/openvinotoolkit/openvino/pull/27861 - int8 MatMul is disabled on purpose by applying `MatMulTransformation` against `FullyConnected` nodes only, since int8 MM primitive `jit_gemm_i8` is slower than fp16 ACL MatMul. - `FakeQuantize` tokenisation is disabled in snippets ### Tickets: - CVS-149495 --- .../transformation_pipeline.cpp | 16 +++++++ .../aarch64/mat_mul_transformation.cpp | 42 +++++++++++++++++++ .../skip_tests_config.cpp | 4 ++ .../mat_mul_transformation.cpp | 4 +- 4 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/aarch64/mat_mul_transformation.cpp diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 67667d4794aecd..d69b96a8fe9402 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -121,6 +121,7 @@ #include "low_precision/group_convolution.hpp" #include "low_precision/multiply_to_group_convolution.hpp" #include "low_precision/network_helper.hpp" +#include "low_precision/mat_mul.hpp" #include "low_precision/recurrent_cell.hpp" #include "low_precision/rt_info/bias_attribute.hpp" #include "transformations/low_precision/mark_dequantization_subgraph.hpp" @@ -840,6 +841,21 @@ void Transformations::Lpt(const std::vector& defaultPrecision }, FuseConvertTransformation); + // Enable MatMulTransformation against FC nodes only + // int8 MatMul is disabled because acl_lowp_matmul_t supports 2D case only + // most models have 3D/4D cases, so fallback to jit_gemm_i8 gives worse perf than gemm_acl_f16 + // oneDNN ticket #2696 + CPU_SET_CALLBACK_ARM( + lptManager, + [&](const_node_ptr& node) -> bool { + if (NetworkHelper::isConstantPath(node->get_input_node_shared_ptr(1)) && + one_of(node->input_value(1).get_partial_shape().rank().get_length(), 2, 3)) { + return false; + } + return true; + }, + MatMulTransformation); + CPU_DISABLE_PASS_ARM(lptManager, RecurrentCellTransformation); CPU_DISABLE_PASS_COMMON(lptManager, MultiplyToGroupConvolutionTransformation); diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/aarch64/mat_mul_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/aarch64/mat_mul_transformation.cpp new file mode 100644 index 00000000000000..8a1bf320436a03 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/aarch64/mat_mul_transformation.cpp @@ -0,0 +1,42 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "low_precision_transformations/mat_mul_transformation.hpp" + +using namespace 
LayerTestsDefinitions; + +namespace { +const std::vector precisions = { + ov::element::f32 +}; + +std::vector testValues = { + { + { 12, 2 }, + { 256ul, ov::Shape({}), {0.f}, {25.5f}, {0.f}, {25.5f} }, + { 2, 12 }, + { 256ul, ov::Shape({}), {-12.8f}, {12.7f}, {-12.8f}, {12.7f} }, + "matMul_original", + "u8" + }, + { + { 12, 2 }, + { 256ul, ov::Shape({}), {-12.8f}, {12.7f}, {-12.8f}, {12.7f} }, + { 2, 12 }, + { 256ul, ov::Shape({}), {-12.8f}, {12.7f}, {-12.8f}, {12.7f} }, + "matMul_original", + "i8" + } +}; + +INSTANTIATE_TEST_SUITE_P(smoke_LPT, MatMulTransformation, + ::testing::Combine( + ::testing::ValuesIn(precisions), + ::testing::Values(ov::PartialShape({ 1, 384, 1024 })), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::ValuesIn(testValues)), + MatMulTransformation::getTestCaseName); +} // namespace diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index 7970312f59ff3f..a25fba30ec73ca 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -306,6 +306,10 @@ std::vector disabledTestPatterns() { retVector.emplace_back(R"(.*LSTMCellFusion/LSTMCellFusionWithSplitWeights.SubgraphFusedToLSTMCell/(1|8|15))"); // Ticket: 131541 retVector.emplace_back(R"(.*smoke_MulticlassNmsLayerTest_dynamic2.*_outType=i32_.*)"); + // Ticket: 162434 + retVector.emplace_back(R"(smoke_LPT/MatMulTransformation.*)"); + // Ticket: 162260 + retVector.emplace_back(R"(smoke_Snippets_FQDecomposition.*netPRC=f32_D=CPU.*)"); } // invalid test: checks u8 precision for runtime graph, while it should be f32 retVector.emplace_back(R"(smoke_NegativeQuantizedMatMulMultiplyFusion.*)"); diff --git a/src/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_transformation.cpp b/src/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_transformation.cpp index 5f1726eff6d3b3..a52961be2ef5ef 100644 --- a/src/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_transformation.cpp +++ b/src/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_transformation.cpp @@ -30,7 +30,9 @@ std::string MatMulTransformation::getTestCaseName(const testing::TestParamInfo Date: Fri, 14 Feb 2025 16:06:18 +0000 Subject: [PATCH 07/10] [NPUW] Fix weights_path property (#28997) --- .../intel_npu/src/al/include/intel_npu/config/runtime.hpp | 2 +- src/plugins/intel_npu/src/al/src/config/runtime.cpp | 1 + .../src/compiler_adapter/src/driver_compiler_adapter.cpp | 4 ++++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp index e90cf708b9d54d..f1fb8219ed19ad 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp @@ -215,7 +215,7 @@ struct WEIGHTS_PATH final : OptionBase { } static OptionMode mode() { - return OptionMode::CompileTime; + return OptionMode::RunTime; } }; diff --git a/src/plugins/intel_npu/src/al/src/config/runtime.cpp b/src/plugins/intel_npu/src/al/src/config/runtime.cpp index 3586cf18e08c8b..70eb06c1d6fa8d 100644 --- a/src/plugins/intel_npu/src/al/src/config/runtime.cpp +++ b/src/plugins/intel_npu/src/al/src/config/runtime.cpp @@ -26,6 +26,7 @@ void 
intel_npu::registerRunTimeOptions(OptionsDesc& desc) { desc.add(); desc.add(); desc.add(); + desc.add(); desc.add(); desc.add(); } diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp index 51e42478e7cebc..5445a1b776bf90 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp +++ b/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp @@ -580,6 +580,10 @@ std::string DriverCompilerAdapter::serializeConfig(const Config& config, std::ostringstream turbostring; turbostring << ov::intel_npu::turbo.name() << KEY_VALUE_SEPARATOR << VALUE_DELIMITER << "\\S+" << VALUE_DELIMITER; content = std::regex_replace(content, std::regex(turbostring.str()), ""); + // Remove weights path property as it is not used by compiler + std::ostringstream weightspathstream; + weightspathstream << ov::weights_path.name() << KEY_VALUE_SEPARATOR << VALUE_DELIMITER << "\\S+" << VALUE_DELIMITER; + content = std::regex_replace(content, std::regex(weightspathstream.str()), ""); // Remove Bypass UMD Caching propery std::ostringstream umdcachestring; umdcachestring << ov::intel_npu::bypass_umd_caching.name() << KEY_VALUE_SEPARATOR << VALUE_DELIMITER << "\\S+" From 5755945f5b6683ee5fdf49d9a3ccfea47c31d5c5 Mon Sep 17 00:00:00 2001 From: Bogdan Pereanu Date: Fri, 14 Feb 2025 18:53:28 +0200 Subject: [PATCH 08/10] [CORE] Fix tensor is_continuous method (#28973) ### Details: - *If data is a contiguous memory area, return true* - *Strides can be different and data can still be contiguous in memory. Extra checks are needed in this case* ### Tickets: - *E#156237* --------- Signed-off-by: Bogdan Pereanu --- src/core/src/runtime/itensor.cpp | 17 ++- src/core/tests/ov_tensor_test.cpp | 101 ++++++++++++++ .../remote_tensor_tests/remote_run.hpp | 128 ++++++++++++++++++ 3 files changed, 245 insertions(+), 1 deletion(-) diff --git a/src/core/src/runtime/itensor.cpp b/src/core/src/runtime/itensor.cpp index 4a5d011122f068..67dde4e38aa463 100644 --- a/src/core/src/runtime/itensor.cpp +++ b/src/core/src/runtime/itensor.cpp @@ -6,6 +6,7 @@ #include +#include "compare.hpp" #include "openvino/core/except.hpp" #include "openvino/core/shape_util.hpp" #include "openvino/core/type/element_iterator.hpp" @@ -46,7 +47,21 @@ bool ITensor::is_continuous() const { // OpenVINO doesn't support strides for lp types return true; } - return default_byte_strides(get_shape(), get_element_type()) == get_strides(); + + const auto& strides = get_strides(); + auto stride = strides.rbegin(); + const auto default_strides = default_byte_strides(get_shape(), get_element_type()); + auto default_stride = default_strides.rbegin(); + + for (; stride != strides.rend(); ++stride, ++default_stride) { + if (*stride != *default_stride) { + break; + } + } + + const auto default_last = default_strides.rend(); + return (default_stride == default_last) || (*default_stride < *stride && (get_shape()[0] == 1) && + std::all_of(default_stride, default_last, cmp::Equal(*default_stride))); } void ITensor::copy_to(const std::shared_ptr& dst) const { diff --git a/src/core/tests/ov_tensor_test.cpp b/src/core/tests/ov_tensor_test.cpp index fdb4fa28416408..6a386f0a659246 100644 --- a/src/core/tests/ov_tensor_test.cpp +++ b/src/core/tests/ov_tensor_test.cpp @@ -709,6 +709,107 @@ TEST_F(OVTensorTest, readRangeRoiBlobStringTensor) { } } +TEST_F(OVTensorTest, checkIsContinuousTensorScalar) { + ov::Tensor tensor(ov::element::f32, 
ov::Shape{}); + auto data = tensor.data(); + auto strides = tensor.get_strides(); + + ov::Tensor view_tensor(ov::element::f32, ov::Shape{}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); +} + +TEST_F(OVTensorTest, checkIsContinuousTensor1Dimension) { + ov::Tensor tensor(ov::element::f32, ov::Shape{128}); + auto data = tensor.data(); + auto strides = tensor.get_strides(); + + ov::Tensor view_tensor; + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{16}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); +} + +TEST_F(OVTensorTest, checkIsContinuousTensor2Dimensions) { + ov::Tensor tensor(ov::element::f32, ov::Shape{32, 128}); + auto data = tensor.data(); + auto strides = tensor.get_strides(); + + ov::Tensor view_tensor; + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{16, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 16}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{2, 16}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), false); +} + +TEST_F(OVTensorTest, checkIsContinuousTensor3Dimensions) { + ov::Tensor tensor(ov::element::f32, ov::Shape{5, 32, 128}); + auto data = tensor.data(); + auto strides = tensor.get_strides(); + + ov::Tensor view_tensor; + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{2, 32, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{2, 16, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), false); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, 64}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 16, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); +} + +TEST_F(OVTensorTest, checkIsContinuousTensor4Dimensions) { + ov::Tensor tensor(ov::element::f32, ov::Shape{3, 5, 32, 128}); + auto data = tensor.data(); + auto strides = tensor.get_strides(); + + ov::Tensor view_tensor; + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 2, 32, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{2, 5, 32, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{2, 2, 32, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), false); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 2, 5, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), false); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{3, 5, 32, 64}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), false); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, 16, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{2, 1, 16, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), false); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, 
1, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, 1, 32}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); +} + struct TestParams { ov::Shape src_shape; ov::Strides src_strides; diff --git a/src/plugins/intel_npu/tests/functional/behavior/remote_tensor_tests/remote_run.hpp b/src/plugins/intel_npu/tests/functional/behavior/remote_tensor_tests/remote_run.hpp index c1992b3047996d..b410ce70a5d3b8 100644 --- a/src/plugins/intel_npu/tests/functional/behavior/remote_tensor_tests/remote_run.hpp +++ b/src/plugins/intel_npu/tests/functional/behavior/remote_tensor_tests/remote_run.hpp @@ -93,6 +93,134 @@ class RemoteRunTests : public ov::test::behavior::OVPluginTestBase, } }; +TEST_P(RemoteRunTests, CheckIsContinuousHostTensorScalar) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + auto zero_context = core->get_default_context(target_device); + + auto host_tensor = zero_context.create_host_tensor(ov::element::f32, Shape{}); + auto data = host_tensor.data(); + auto strides = host_tensor.get_strides(); + + ov::Tensor view_tensor; + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); +} + +TEST_P(RemoteRunTests, CheckIsContinuousHostTensor1Dimension) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + auto zero_context = core->get_default_context(target_device); + + auto host_tensor = zero_context.create_host_tensor(ov::element::f32, Shape{128}); + auto data = host_tensor.data(); + auto strides = host_tensor.get_strides(); + + ov::Tensor view_tensor; + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, ov::Shape{16}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); +} + +TEST_P(RemoteRunTests, CheckIsContinuousHostTensor2Dimensions) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + auto zero_context = core->get_default_context(target_device); + + auto host_tensor = zero_context.create_host_tensor(ov::element::f32, Shape{32, 128}); + auto data = host_tensor.data(); + auto strides = host_tensor.get_strides(); + + ov::Tensor view_tensor; + + view_tensor = ov::Tensor(ov::element::f32, Shape{16, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, Shape{1, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, Shape{1, 16}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, Shape{2, 16}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), false); +} + +TEST_P(RemoteRunTests, CheckIsContinuousHostTensor3Dimensions) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + auto zero_context = core->get_default_context(target_device); + + auto host_tensor = zero_context.create_host_tensor(ov::element::f32, Shape{5, 32, 128}); + auto data = host_tensor.data(); + auto strides = host_tensor.get_strides(); + + ov::Tensor view_tensor; + + view_tensor = ov::Tensor(ov::element::f32, Shape{2, 32, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + 
view_tensor = ov::Tensor(ov::element::f32, Shape{2, 16, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), false); + + view_tensor = ov::Tensor(ov::element::f32, Shape{1, 1, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, Shape{1, 1, 64}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, Shape{1, 16, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); +} + +TEST_P(RemoteRunTests, CheckIsContinuousHostTensor4Dimensions) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + auto zero_context = core->get_default_context(target_device); + + auto host_tensor = zero_context.create_host_tensor(ov::element::f32, Shape{3, 5, 32, 128}); + auto data = host_tensor.data(); + auto strides = host_tensor.get_strides(); + + ov::Tensor view_tensor; + + view_tensor = ov::Tensor(ov::element::f32, Shape{1, 2, 32, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, Shape{2, 5, 32, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, Shape{2, 2, 32, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), false); + + view_tensor = ov::Tensor(ov::element::f32, Shape{1, 2, 5, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), false); + + view_tensor = ov::Tensor(ov::element::f32, Shape{3, 5, 32, 64}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), false); + + view_tensor = ov::Tensor(ov::element::f32, Shape{1, 1, 16, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, Shape{2, 1, 16, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), false); + + view_tensor = ov::Tensor(ov::element::f32, Shape{1, 1, 1, 128}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); + + view_tensor = ov::Tensor(ov::element::f32, Shape{1, 1, 1, 32}, data, strides); + EXPECT_EQ(view_tensor.is_continuous(), true); +} + TEST_P(RemoteRunTests, CheckRemoteTensorInternalBuf) { // Skip test according to plugin specific disabledTestPatterns() (if any) SKIP_IF_CURRENT_TEST_IS_DISABLED() From 20ad7cb11906d30fb24bc131afa2b6e39566cbba Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Fri, 14 Feb 2025 13:07:23 -0800 Subject: [PATCH 09/10] Executorch initial support (#28425) ### Details: - OV side changes for initial ExecuTorch OV backend ### Tickets: - [*ticket-id*](https://jira.devtools.intel.com/browse/CVS-157257) --------- Co-authored-by: ynimmaga Co-authored-by: Roman Kazantsev Co-authored-by: Maxim Vafin --- .../openvino/frontend/pytorch/fx_decoder.py | 55 ++++++++++++------- .../pytorch/torchdynamo/op_support.py | 9 +++ src/frontends/pytorch/src/op_table.cpp | 9 +++ .../pytorch_tests/test_as_strided.py | 34 ++++++++++++ .../layer_tests/pytorch_tests/test_expand.py | 25 +++++++++ .../layer_tests/pytorch_tests/test_permute.py | 26 +++++++++ .../layer_tests/pytorch_tests/test_select.py | 28 ++++++++++ tests/layer_tests/pytorch_tests/test_split.py | 25 +++++++++ .../layer_tests/pytorch_tests/test_squeeze.py | 30 ++++++++++ .../pytorch_tests/test_unsqueeze.py | 28 ++++++++++ tests/layer_tests/pytorch_tests/test_view.py | 43 +++++++++++++++ 11 files changed, 292 insertions(+), 20 deletions(-) diff --git a/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py 
b/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py index b636ad806e2df7..483a5e82c7a881 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py @@ -178,7 +178,7 @@ def __init__(self, pt_module, fx_gm=None, nodes=None, self._input_signature = [] self._example_input = None - if issubclass(type(pt_module), torch.fx.graph_module.GraphModule): + if isinstance(pt_module, torch.fx.graph_module.GraphModule): self._input_is_list = None self._nodes = list(pt_module.graph.nodes) found_types = [] @@ -187,38 +187,34 @@ def __init__(self, pt_module, fx_gm=None, nodes=None, if value.op == 'placeholder': self._inputs.append(i) self._input_signature.append(value.name) - if hasattr(value, "meta") and ('tensor_meta' in value.meta.keys()) and value.meta['tensor_meta']: - found_shapes.append(value.meta['tensor_meta'].shape) - found_types.append( - OVAny(pt_to_ov_type_map[str(value.meta['tensor_meta'].dtype)])) - else: - found_shapes.append(None) - found_types.append(None) + + found_shapes.append(self.get_found_shape(value)) + found_types.append(self.get_found_dtype(value)) + if found_shapes[-1] is not None: + new_shape = [] + for dim in found_shapes[-1]: + if (dynamic_shapes or type(dim).__name__ == "SymInt"): + new_shape.append(-1) + else: + new_shape.append(dim) + found_shapes[-1] = torch.Size(new_shape) + elif value.op == 'output': # Instead of putting output index, refer to its target uargs = self.unpack_containers(value.args) self._outputs = [(arg[0], self._nodes.index(arg[1])) for arg in uargs if arg[1] is not None] - for idx, shape in enumerate(found_shapes): - if shape is not None: - new_shape = [] - for dim in shape: - if (dynamic_shapes or type(dim).__name__ == "SymInt"): - new_shape.append(-1) - else: - new_shape.append(dim) - found_shapes[idx] = torch.Size(new_shape) if not input_shapes or len(input_shapes) == 0: self.input_shapes = found_shapes if not input_types or len(input_types) == 0: self.input_types = found_types - if hasattr(pt_module, "forward"): - input_params = inspect.signature(pt_module.forward).parameters + if hasattr(self.pt_module, "forward"): + input_params = inspect.signature(self.pt_module.forward).parameters self._input_signature = list(input_params) - elif issubclass(type(pt_module), torch.fx.Node): + elif isinstance(pt_module, torch.fx.Node): self._nodes = nodes # passed from outer context # FIXME: Quadratic complexity nodes*nodes considering the outer loop over all nodes @@ -234,6 +230,23 @@ def __init__(self, pt_module, fx_gm=None, nodes=None, self.input_types.append( BaseFXDecoder.get_type_for_value(arg)) + @staticmethod + def get_found_shape(value): + # If input is a tensor, read the shape from metadata + if hasattr(value, "meta"): + if ('tensor_meta' in value.meta.keys()) and value.meta['tensor_meta']: + return value.meta['tensor_meta'].shape + if ('val' in value.meta.keys()) and isinstance(value.meta["val"], torch.Tensor): + return value.meta['val'].shape + return None + + @staticmethod + def get_found_dtype(value): + # If input is a tensor, read the data type from metadata + if hasattr(value, "meta") and ('tensor_meta' in value.meta.keys()) and value.meta['tensor_meta']: + return OVAny(pt_to_ov_type_map[str(value.meta['tensor_meta'].dtype)]) + return None + def get_input_signature_name(self, index: int) -> str: if self._input_signature is not None and index < len(self._input_signature): return self._input_signature[index] @@ -331,6 +344,8 @@ def
get_subgraph_decoder(self, index): def get_op_type(self): if self.pt_module.op == 'call_function': + if type(self.pt_module.target).__name__ == "EdgeOpOverload": + return self.pt_module.target.__name__ return str(self.pt_module.target) elif self.pt_module.op == 'get_attr': return 'get_attr' # FIXME should be aligned with get_attr from TS implementation diff --git a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/op_support.py b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/op_support.py index 8ca3b7b489f665..da36e091ee2ab6 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/op_support.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/op_support.py @@ -75,6 +75,7 @@ def __init__(self, options): "torch.ops.aten.argmin.default": None, "torch.ops.aten.as_strided.default": None, "torch.ops.aten.as_strided_.default": None, + "torch.ops.aten.as_strided_copy.default": None, "torch.ops.aten.asin.default": None, "torch.ops.aten.asinh.default": None, "torch.ops.aten.asinh.default": None, @@ -118,6 +119,7 @@ def __init__(self, options): "torch.ops.aten.erf.default": None, "torch.ops.aten.exp.default": None, "torch.ops.aten.expand.default": None, + "torch.ops.aten.expand_copy.default": None, "torch.ops.aten.fake_quantize_per_channel_affine_cachemask.default": None, "torch.ops.aten.fill.Scalar": None, "torch.ops.aten.fill_.Scalar": None, @@ -196,6 +198,7 @@ def __init__(self, options): "torch.ops.aten.new_zeros.default": None, "torch.ops.aten.ones.default": None, "torch.ops.aten.permute.default": None, + "torch.ops.aten.permute_copy.default": None, "torch.ops.aten.pow.Scalar": None, "torch.ops.aten.pow.Tensor_Scalar": None, "torch.ops.aten.pow.Tensor_Tensor": None, @@ -213,6 +216,7 @@ def __init__(self, options): "torch.ops.aten.scatter.src": None, "torch.ops.aten.scatter.value": None, "torch.ops.aten.select.int": None, + "torch.ops.aten.select_copy.int": None, "torch.ops.aten.select_scatter.default": None, "torch.ops.aten.sigmoid.default": None, "torch.ops.aten.sigmoid_.default": None, @@ -222,13 +226,16 @@ def __init__(self, options): "torch.ops.aten.sin.default": None, "torch.ops.aten.sinh.default": None, "torch.ops.aten.slice.Tensor": None, + "torch.ops.aten.slice_copy.Tensor": None, "torch.ops.aten.slice_scatter.default": None, "torch.ops.aten.sort.default": None, "torch.ops.aten.split.Tensor": None, "torch.ops.aten.split_with_sizes.default": None, + "torch.ops.aten.split_with_sizes_copy.default": None, "torch.ops.aten.sqrt.default": None, "torch.ops.aten.squeeze.dim": None, "torch.ops.aten.squeeze.dims": None, + "torch.ops.aten.squeeze_copy.dims": None, "torch.ops.aten.stack.default": None, "torch.ops.aten.std.correction": None, "torch.ops.aten.sub.default": None, @@ -246,10 +253,12 @@ def __init__(self, options): "torch.ops.aten.unbind.int": None, "torch.ops.aten.unfold.default": None, "torch.ops.aten.unsqueeze.default": None, + "torch.ops.aten.unsqueeze_copy.default": None, "torch.ops.aten.upsample_nearest2d.default": None, "torch.ops.aten.var.correction": None, "torch.ops.aten.var_mean.correction": None, "torch.ops.aten.view.default": None, + "torch.ops.aten.view_copy.default": None, "torch.ops.aten.where.self": None, "torch.ops.aten.zeros.default": None, "torch.ops.aten.zeros_like.default": None, diff --git a/src/frontends/pytorch/src/op_table.cpp b/src/frontends/pytorch/src/op_table.cpp index ef75a253f7506a..2d33b32472ba36 100644 --- a/src/frontends/pytorch/src/op_table.cpp +++ b/src/frontends/pytorch/src/op_table.cpp 
@@ -804,6 +804,7 @@ const std::unordered_map get_supported_ops_fx() { {"aten.argmin.default", op::translate_argmin}, {"aten.as_strided.default", op::translate_as_strided}, {"aten.as_strided_.default", op::translate_as_strided}, + {"aten.as_strided_copy.default", op::translate_as_strided}, {"aten.asin.default", op::translate_1to1_match_1_inputs_with_fp32_type_alignment}, {"aten.asinh.default", op::translate_1to1_match_1_inputs_with_fp32_type_alignment}, {"aten.atan.default", op::translate_1to1_match_1_inputs_with_fp32_type_alignment}, @@ -854,6 +855,7 @@ const std::unordered_map get_supported_ops_fx() { {"aten.exp.default", op::translate_1to1_match_1_inputs_with_fp32_type_alignment}, {"aten.expm1.default", op::translate_expm1}, {"aten.expand.default", op::translate_expand}, + {"aten.expand_copy.default", op::translate_expand}, {"aten.eye.m", op::translate_eye_fx}, {"aten.fake_quantize_per_channel_affine_cachemask.default", op::translate_fake_quantize_per_channel_affine_fx}, {"aten.fill.Scalar", op::translate_fill}, @@ -936,6 +938,7 @@ const std::unordered_map get_supported_ops_fx() { {"aten.ones.names", op::translate_ones_fx}, {"aten.ones_like.default", op::translate_ones_like_fx}, {"aten.permute.default", op::translate_permute}, + {"aten.permute_copy.default", op::translate_1to1_match_2_inputs}, {"aten.pow.Scalar", op::translate_pow}, {"aten.pow.Tensor_Scalar", op::translate_pow}, {"aten.pow.Tensor_Tensor", op::translate_pow}, @@ -958,6 +961,7 @@ const std::unordered_map get_supported_ops_fx() { {"aten.scatter.value", op::translate_scatter}, {"aten.scatter_add.default", op::translate_scatter_add}, {"aten.select.int", op::translate_select}, + {"aten.select_copy.int", op::translate_select}, {"aten.select_scatter.default", op::translate_select_scatter_fx}, {"aten.sigmoid.default", op::translate_1to1_match_1_inputs_with_fp32_type_alignment}, {"aten.sigmoid_.default", op::translate_1to1_match_1_inputs_with_fp32_type_alignment}, @@ -967,13 +971,16 @@ const std::unordered_map get_supported_ops_fx() { {"aten.sin.default", op::translate_1to1_match_1_inputs_with_fp32_type_alignment}, {"aten.sinh.default", op::translate_1to1_match_1_inputs_with_fp32_type_alignment}, {"aten.slice.Tensor", op::translate_slice_fx}, + {"aten.slice_copy.Tensor", op::translate_slice_fx}, {"aten.slice_scatter.default", op::translate_slice_scatter_fx}, {"aten.sort.default", op::translate_sort_fx}, {"aten.split.Tensor", op::translate_chunk_fx}, {"aten.split_with_sizes.default", op::translate_split_with_sizes_fx}, + {"aten.split_with_sizes_copy.default", op::translate_split_with_sizes_fx}, {"aten.sqrt.default", op::translate_1to1_match_1_inputs_with_fp32_type_alignment}, {"aten.squeeze.dim", op::translate_squeeze}, {"aten.squeeze.dims", op::translate_squeeze}, + {"aten.squeeze_copy.dims", op::translate_squeeze}, {"aten.stack.default", op::translate_stack_fx}, {"aten.std.correction", op::translate_std_fx}, {"aten.sub.default", op::translate_sub_fx}, @@ -991,10 +998,12 @@ const std::unordered_map get_supported_ops_fx() { {"aten.unbind.int", op::translate_unbind_int_fx}, {"aten.unfold.default", op::translate_unfold}, {"aten.unsqueeze.default", op::translate_1to1_match_2_inputs}, + {"aten.unsqueeze_copy.default", op::translate_1to1_match_2_inputs}, {"aten.upsample_nearest2d.default", op::translate_upsample_nearest2d}, {"aten.var.correction", op::translate_var_fx}, {"aten.var_mean.correction", op::translate_var_mean_fx}, {"aten.view.default", op::translate_reshape}, + {"aten.view_copy.default", op::translate_reshape}, 
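+    // Note: the aten.*_copy entries added in this patch reuse the translators of
+    // their view counterparts: the functional *_copy variants emitted by
+    // ExecuTorch compute the same values, just materialized into a new tensor.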
{"aten.view_as_complex.default", op::translate_view_as_complex}, {"aten.view_as_real.default", op::translate_view_as_real}, {"aten.where.self", op::translate_where}, diff --git a/tests/layer_tests/pytorch_tests/test_as_strided.py b/tests/layer_tests/pytorch_tests/test_as_strided.py index afdc25fd8657a7..674797ffda64cb 100644 --- a/tests/layer_tests/pytorch_tests/test_as_strided.py +++ b/tests/layer_tests/pytorch_tests/test_as_strided.py @@ -45,6 +45,40 @@ def forward(self, x): def test_as_strided(self, size, stride, offset, ie_device, precision, ir_version): self._test(*self.create_model(size, stride, offset), ie_device, precision, ir_version, trace_model=True) +class TestAsStridedCopy(PytorchLayerTest): + def _prepare_input(self): + return (np.random.randn(8, 8).astype(np.float32),) + + def create_model(self, size, stride, offset): + class aten_as_strided_copy(torch.nn.Module): + def __init__(self, size, stride, offset): + super().__init__() + self.size = size + self.stride = stride + self.offset = offset + + def forward(self, x): + return torch.as_strided_copy(x, self.size, self.stride, self.offset) + + ref_net = None + + return aten_as_strided_copy(size, stride, offset), ref_net, "aten::as_strided_copy" + + @pytest.mark.parametrize( + "size,stride", + [ + ([1], [1]), + ([2, 2], [1, 1]), + ([5, 4, 3], [1, 3, 7]), + ([5, 5, 5], [5, 0, 5]), + ([1, 2, 3, 4], [4, 3, 2, 1]), + ], + ) + @pytest.mark.parametrize("offset", [None, 1, 3, 7]) + @pytest.mark.precommit_fx_backend + def test_as_strided_copy(self, size, stride, offset, ie_device, precision, ir_version): + self._test(*self.create_model(size, stride, offset), ie_device, precision, ir_version, trace_model=True) + class TestAsStridedListConstruct(PytorchLayerTest): def _prepare_input(self, size_shape_tensor=[1], stride_shape_tensor=[1]): diff --git a/tests/layer_tests/pytorch_tests/test_expand.py b/tests/layer_tests/pytorch_tests/test_expand.py index e0f673fb927aaf..23d6eedf38bafe 100644 --- a/tests/layer_tests/pytorch_tests/test_expand.py +++ b/tests/layer_tests/pytorch_tests/test_expand.py @@ -41,6 +41,31 @@ def forward_broadcast(self, x): def test_expand(self, dims, op_type, ie_device, precision, ir_version): self._test(*self.create_model(dims, op_type), ie_device, precision, ir_version) +class TestExpandCopy(PytorchLayerTest): + def _prepare_input(self): + import numpy as np + return (np.random.randn(1, 3).astype(np.float32),) + + def create_model(self, dim): + import torch + + class aten_expand_copy(torch.nn.Module): + def __init__(self, dims): + super(aten_expand_copy, self).__init__() + self.dims = dims + + def forward(self, x): + return torch.expand_copy(x, self.dims) + + ref_net = None + + return aten_expand_copy(dim), ref_net, f"aten::expand_copy" + + @pytest.mark.parametrize("dims", [(4, 3), (-1, -1), (1, 2, 3), (1, 2, 2, 3)]) + @pytest.mark.precommit_fx_backend + def test_expand_copy(self, dims, ie_device, precision, ir_version): + self._test(*self.create_model(dims), ie_device, precision, ir_version) + class TestExpandList(PytorchLayerTest): def _prepare_input(self, broadcast_shape): import numpy as np diff --git a/tests/layer_tests/pytorch_tests/test_permute.py b/tests/layer_tests/pytorch_tests/test_permute.py index efbd77d371eb89..d4b342e67273bc 100644 --- a/tests/layer_tests/pytorch_tests/test_permute.py +++ b/tests/layer_tests/pytorch_tests/test_permute.py @@ -38,9 +38,35 @@ def forward(self, x): @pytest.mark.nightly @pytest.mark.precommit @pytest.mark.precommit_torch_export + @pytest.mark.precommit_fx_backend def 
test_permute(self, order, complex_type, ie_device, precision, ir_version): self._test(*self.create_model(order, complex_type), ie_device, precision, ir_version) +class TestPermuteCopy(PytorchLayerTest): + def _prepare_input(self): + import numpy as np + return (np.random.randn(1, 3, 224, 224).astype(np.float32),) + + def create_model(self, order): + import torch + + class aten_permute_copy(torch.nn.Module): + def __init__(self, order): + super(aten_permute_copy, self).__init__() + self.order = order + + def forward(self, x): + return torch.permute_copy(x, self.order) + + ref_net = None + + return aten_permute_copy(order), ref_net, "aten::permute_copy" + + @pytest.mark.parametrize("order", [[0, 2, 3, 1], [0, 3, 1, 2], [0, -1, 1, -2]]) + @pytest.mark.precommit_fx_backend + def test_permute_copy(self, order, ie_device, precision, ir_version): + self._test(*self.create_model(order), ie_device, precision, ir_version) + class TestPermuteList(PytorchLayerTest): def _prepare_input(self, permute_shape): diff --git a/tests/layer_tests/pytorch_tests/test_select.py b/tests/layer_tests/pytorch_tests/test_select.py index 5bd897e88148dd..828d0a57d60bdd 100644 --- a/tests/layer_tests/pytorch_tests/test_select.py +++ b/tests/layer_tests/pytorch_tests/test_select.py @@ -33,6 +33,34 @@ def forward(self, input_tensor): @pytest.mark.nightly @pytest.mark.precommit @pytest.mark.precommit_torch_export + @pytest.mark.precommit_fx_backend def test_select(self, ie_device, precision, ir_version, input_dim, input_index): self._test(*self.create_model(input_dim, input_index), ie_device, precision, ir_version) + +@pytest.mark.parametrize('input_dim', list(range(-3, 4))) +@pytest.mark.parametrize('input_index', list(range(-3, 4))) +class TestSelectCopy(PytorchLayerTest): + + def _prepare_input(self): + return (np.random.randn(4, 4, 5, 5).astype(np.float32),) + + def create_model(self, input_dim, input_index): + class aten_select_copy(torch.nn.Module): + + def __init__(self, input_dim, input_index) -> None: + super().__init__() + self.dim = input_dim + self.index = input_index + + def forward(self, input_tensor): + return torch.select_copy(input_tensor, int(self.dim), int(self.index)) + + ref_net = None + + return aten_select_copy(input_dim, input_index), ref_net, "aten::select_copy" + + @pytest.mark.precommit_fx_backend + def test_select_copy(self, ie_device, precision, ir_version, input_dim, input_index): + self._test(*self.create_model(input_dim, input_index), + ie_device, precision, ir_version) diff --git a/tests/layer_tests/pytorch_tests/test_split.py b/tests/layer_tests/pytorch_tests/test_split.py index e1ab4ed19ff701..497f314b7470c5 100644 --- a/tests/layer_tests/pytorch_tests/test_split.py +++ b/tests/layer_tests/pytorch_tests/test_split.py @@ -99,6 +99,31 @@ def forward(self, x, y): @pytest.mark.nightly @pytest.mark.precommit + @pytest.mark.precommit_fx_backend def test_split_with_sizes(self, ie_device, precision, ir_version): self._test(*self.create_model(), ie_device, precision, ir_version, trace_model=True) + +class TestSplitWithSizesCopy(PytorchLayerTest): + def _prepare_input(self): + import numpy as np + return (np.random.randn(20).astype(np.float32),np.random.randn(20).astype(np.float32)) + + def create_model(self): + import torch + + class aten_split_with_sizes_copy(torch.nn.Module): + def __init__(self): + super(aten_split_with_sizes_copy, self).__init__() + + def forward(self, x, y): + return torch.split_with_sizes_copy(x, [y.shape[0]], dim=0) + + ref_net = None + + return aten_split_with_sizes_copy(), 
ref_net, ["aten::split_with_sizes", "prim::ListConstruct"] + + @pytest.mark.precommit_fx_backend + def test_split_with_sizes_copy(self, ie_device, precision, ir_version): + self._test(*self.create_model(), + ie_device, precision, ir_version, trace_model=True) diff --git a/tests/layer_tests/pytorch_tests/test_squeeze.py b/tests/layer_tests/pytorch_tests/test_squeeze.py index 2f67ec89fcd481..3fa70532a46ee6 100644 --- a/tests/layer_tests/pytorch_tests/test_squeeze.py +++ b/tests/layer_tests/pytorch_tests/test_squeeze.py @@ -45,3 +45,33 @@ def test_squeeze(self, dim, dynamic_shapes, ie_device, precision, ir_version): def test_squeeze_non_1(self, dim, ie_device, precision, ir_version): # Dynamic shapes introduce dynamic rank, which is not supported by the Squeeze operation. self._test(*self.create_model(dim), ie_device, precision, ir_version, dynamic_shapes=False) + +class TestSqueezeCopy(PytorchLayerTest): + def _prepare_input(self): + import numpy as np + + return (np.random.randn(1, 1, 32).astype(np.float32),) + + def create_model(self, dim): + import torch + + class aten_squeeze_copy(torch.nn.Module): + def __init__(self, dim): + super(aten_squeeze_copy, self).__init__() + self.dim = dim + + def forward(self, x): + if self.dim is not None: + return torch.squeeze_copy(x, self.dim) + return torch.squeeze_copy(x) + + ref_net = None + + return aten_squeeze_copy(dim), ref_net, "aten::squeeze_copy" + + @pytest.mark.parametrize("dim,dynamic_shapes", [(-2, True), (0, True), (None, False)]) + @pytest.mark.precommit_fx_backend + def test_squeeze_copy(self, dim, dynamic_shapes, ie_device, precision, ir_version): + if PytorchLayerTest.use_torch_export() and dim is None: + pytest.xfail(reason="export fails if dim is not provided") + self._test(*self.create_model(dim), ie_device, precision, ir_version, dynamic_shapes=dynamic_shapes) diff --git a/tests/layer_tests/pytorch_tests/test_unsqueeze.py b/tests/layer_tests/pytorch_tests/test_unsqueeze.py index e77a43a7d79d8c..d7ba91be8b1487 100644 --- a/tests/layer_tests/pytorch_tests/test_unsqueeze.py +++ b/tests/layer_tests/pytorch_tests/test_unsqueeze.py @@ -41,5 +41,33 @@ def forward(self, x): @pytest.mark.nightly @pytest.mark.precommit @pytest.mark.precommit_torch_export + @pytest.mark.precommit_fx_backend def test_unsqueeze(self, inplace, dim, ie_device, precision, ir_version): self._test(*self.create_model(inplace, dim), ie_device, precision, ir_version) + +class TestUnsqueezeCopy(PytorchLayerTest): + def _prepare_input(self): + import numpy as np + return (np.random.randn(5, 10).astype(np.float32),) + + def create_model(self, dim=0): + import torch + + class aten_unsqueeze_copy(torch.nn.Module): + def __init__(self, dim): + super(aten_unsqueeze_copy, self).__init__() + self.op = torch.unsqueeze_copy + self.dim = dim + + def forward(self, x): + return x, self.op(x, self.dim) + + ref_net = None + model_class, op = (aten_unsqueeze_copy, "aten::unsqueeze_copy") + + return model_class(dim), ref_net, op + + @pytest.mark.parametrize("dim", [0, 1, -1]) + @pytest.mark.precommit_fx_backend + def test_unsqueeze_copy(self, dim, ie_device, precision, ir_version): + self._test(*self.create_model(dim), ie_device, precision, ir_version) diff --git a/tests/layer_tests/pytorch_tests/test_view.py b/tests/layer_tests/pytorch_tests/test_view.py index 3cdd42779b80e8..326249ce87dca9 100644 --- a/tests/layer_tests/pytorch_tests/test_view.py +++ b/tests/layer_tests/pytorch_tests/test_view.py @@ -142,6 +142,7 @@ def forward(self, input_tensor): @pytest.mark.nightly
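    # Note: the precommit_fx_backend marker added below also runs this test
    # through the FX graph path targeted by the ExecuTorch changes in this patch.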
@pytest.mark.precommit + @pytest.mark.precommit_fx_backend def test_view(self, ie_device, precision, ir_version, input_shapes): self.input_data = [] for input_shape in input_shapes: @@ -150,3 +151,45 @@ def test_view(self, ie_device, precision, ir_version, input_shapes): else: self.input_data.append(input_shape) self._test(*self.create_model(), ie_device, precision, ir_version) + +@pytest.mark.parametrize('input_shapes', +[ + [ + [2, 3, 2], 2, 6 + ], + [ + [4], 2, 2 + ], + [ + [4], 2, 2.1 + ] +]) +class TestViewCopy(PytorchLayerTest): + + def _prepare_input(self): + return (self.input_data[0],) + + def create_model(self): + class aten_view_copy(torch.nn.Module): + + def __init__(self, input_data) -> None: + super().__init__() + self.dim1 = input_data[1] + self.dim2 = input_data[2] + + def forward(self, input_tensor): + return torch.view_copy(input_tensor, [self.dim1, int(self.dim2)]) + + ref_net = None + + return aten_view_copy(self.input_data), ref_net, "aten::view_copy" + + @pytest.mark.precommit_fx_backend + def test_view_copy(self, ie_device, precision, ir_version, input_shapes): + self.input_data = [] + for input_shape in input_shapes: + if type(input_shape) is list: + self.input_data.append(np.random.randn(*input_shape).astype(np.float32)) + else: + self.input_data.append(input_shape) + self._test(*self.create_model(), ie_device, precision, ir_version) From e737014105e300628201daa13dde9127b6187caf Mon Sep 17 00:00:00 2001 From: Arseniy Obolenskiy Date: Sat, 15 Feb 2025 12:10:10 +0100 Subject: [PATCH 10/10] [Transformations] Hotfix for clang-format remarks (#29004) ### Details: Fix failing clang-format in pre-commit ### Tickets: - N/A --- .../scaled_dot_product_decomposition_test.cpp | 26 +++++++++---------- .../transformation_pipeline.cpp | 4 +-- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/src/common/transformations/tests/op_conversions/scaled_dot_product_decomposition_test.cpp b/src/common/transformations/tests/op_conversions/scaled_dot_product_decomposition_test.cpp index 82a0f89e83786c..c7fc7d2557ce54 100644 --- a/src/common/transformations/tests/op_conversions/scaled_dot_product_decomposition_test.cpp +++ b/src/common/transformations/tests/op_conversions/scaled_dot_product_decomposition_test.cpp @@ -31,13 +31,12 @@ using namespace ov; using namespace testing; -const std::shared_ptr scaled_dot_product_attention_decomposition( - std::shared_ptr query, - std::shared_ptr key, - std::shared_ptr value, - std::shared_ptr attention_mask, - std::shared_ptr scale, - bool casual); +const std::shared_ptr scaled_dot_product_attention_decomposition(std::shared_ptr query, + std::shared_ptr key, + std::shared_ptr value, + std::shared_ptr attention_mask, + std::shared_ptr scale, + bool casual); TEST_F(TransformationTestsF, ScaledDotProductAttentionDecompositionStaticBasic) { const PartialShape query_shape{1, 32, 32}; @@ -187,13 +186,12 @@ TEST_F(TransformationTestsF, ScaledDotProductAttentionDecompositionDynamic) { } } -const std::shared_ptr scaled_dot_product_attention_decomposition( - std::shared_ptr query, - std::shared_ptr key, - std::shared_ptr value, - std::shared_ptr attention_mask, - std::shared_ptr scale, - bool casual) { +const std::shared_ptr scaled_dot_product_attention_decomposition(std::shared_ptr query, + std::shared_ptr key, + std::shared_ptr value, + std::shared_ptr attention_mask, + std::shared_ptr scale, + bool casual) { const auto q_shape = std::make_shared(query, element::i32); const auto k_shape = std::make_shared(key, element::i32); const auto minus_one 
= ov::op::v0::Constant::create(element::i32, Shape{}, {-1}); diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index d69b96a8fe9402..5d095a4c80119b 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -119,9 +119,9 @@ #include "low_precision/fold_convert.hpp" #include "low_precision/fuse_convert.hpp" #include "low_precision/group_convolution.hpp" +#include "low_precision/mat_mul.hpp" #include "low_precision/multiply_to_group_convolution.hpp" #include "low_precision/network_helper.hpp" -#include "low_precision/mat_mul.hpp" #include "low_precision/recurrent_cell.hpp" #include "low_precision/rt_info/bias_attribute.hpp" #include "transformations/low_precision/mark_dequantization_subgraph.hpp" @@ -850,7 +850,7 @@ void Transformations::Lpt(const std::vector& defaultPrecision [&](const_node_ptr& node) -> bool { if (NetworkHelper::isConstantPath(node->get_input_node_shared_ptr(1)) && one_of(node->input_value(1).get_partial_shape().rank().get_length(), 2, 3)) { - return false; + return false; } return true; },
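
Note: the aten.*_copy support added above relies on a simple equivalence: each functional *_copy variant returns a freshly allocated tensor whose values match the corresponding view op, which is why the new op_table entries can reuse the existing translators. A minimal sketch of that property, assuming a PyTorch build recent enough to expose these variants at the torch level (the same calls the new layer tests make):

import torch

x = torch.randn(1, 3)

# The *_copy variants produce the same values as the corresponding view ops.
assert torch.equal(torch.expand_copy(x, (4, 3)), x.expand(4, 3))
assert torch.equal(torch.permute_copy(x, (1, 0)), x.permute(1, 0))
assert torch.equal(torch.view_copy(x, (3, 1)), x.view(3, 1))
assert torch.equal(torch.unsqueeze_copy(x, 0), x.unsqueeze(0))

# Unlike a view, the result owns its storage instead of aliasing the input.
assert torch.view_copy(x, (3,)).data_ptr() != x.data_ptr()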