[HETERO] support LLM and split model per available memory size (#21764)
### Details:
 - Support LLM inference with the following device configurations:
      |Device|
      |--|
      |HETERO:CPU|
      |HETERO:GPU|
      |HETERO:CPU,GPU|
      |HETERO:GPU,CPU|
      |HETERO:GPU.0,GPU.1|
      |HETERO:GPU.0,GPU.1,CPU|
      |HETERO:GPU.0,GPU.1,GPU.2|

- Use the `ov::hint::model_distribution_policy` property introduced in [PR23077](#23077)
- Use host memory for input/output data exchange between subgraphs
- Fold supported and unsupported nodes into Subgraph nodes in the graph and run query_model on those
subgraphs, keeping the model used in query_model the same as the one used for compilation
- Add a property `ov::query_model_ratio` to set the percentage of the
model that can be queried during query_model
- Improve performance for LLMs with large parameter counts by splitting the
model across devices; the number of sub-models is kept small to
reduce communication overhead between devices (a usage sketch follows this list)
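
A minimal sketch of how an application might request such a split (the model path and device list below are placeholders, and whether HETERO actually applies PIPELINE_PARALLEL depends on the devices and available memory):

```cpp
#include "openvino/openvino.hpp"

int main() {
    ov::Core core;
    auto model = core.read_model("llm.xml");  // placeholder model path

    // Ask HETERO to split the model between two GPUs and the CPU,
    // distributing the parts according to available memory.
    auto compiled = core.compile_model(
        model,
        "HETERO:GPU.0,GPU.1,CPU",
        ov::hint::model_distribution_policy({ov::hint::ModelDistributionPolicy::PIPELINE_PARALLEL}));

    auto request = compiled.create_infer_request();
    return 0;
}
```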


### Tickets:
 - *CVS-133258*

---------

Co-authored-by: Nadezhda <[email protected]>
Co-authored-by: Shen, Wanglei <[email protected]>
Co-authored-by: yanlan song <[email protected]>
4 people authored Mar 29, 2024
1 parent b250796 commit 3e114be
Showing 22 changed files with 949 additions and 78 deletions.
@@ -72,7 +72,8 @@ void regmodule_properties(py::module m) {
.value("ECORE_ONLY", ov::hint::SchedulingCoreType::ECORE_ONLY);

py::enum_<ov::hint::ModelDistributionPolicy>(m_hint, "ModelDistributionPolicy", py::arithmetic())
.value("TENSOR_PARALLEL", ov::hint::ModelDistributionPolicy::TENSOR_PARALLEL);
.value("TENSOR_PARALLEL", ov::hint::ModelDistributionPolicy::TENSOR_PARALLEL)
.value("PIPELINE_PARALLEL", ov::hint::ModelDistributionPolicy::PIPELINE_PARALLEL);

py::enum_<ov::hint::ExecutionMode>(m_hint, "ExecutionMode", py::arithmetic())
.value("PERFORMANCE", ov::hint::ExecutionMode::PERFORMANCE)
28 changes: 28 additions & 0 deletions src/core/include/openvino/core/any.hpp
@@ -10,6 +10,7 @@

#include <map>
#include <memory>
#include <set>
#include <string>
#include <typeindex>
#include <typeinfo>
@@ -209,6 +210,18 @@ struct Read<std::vector<T, A>, typename std::enable_if<std::is_default_construct
}
};

template <typename K, typename C, typename A>
struct Read<std::set<K, C, A>, typename std::enable_if<std::is_default_constructible<K>::value>::type> {
void operator()(std::istream& is, std::set<K, C, A>& set) const {
while (is.good()) {
std::string str;
is >> str;
auto v = from_string<K>(str);
set.insert(std::move(v));
}
}
};

template <typename K, typename T, typename C, typename A>
struct Read<
std::map<K, T, C, A>,
@@ -343,6 +356,21 @@ struct Write<std::vector<T, A>> {
}
};

template <typename K, typename C, typename A>
struct Write<std::set<K, C, A>> {
void operator()(std::ostream& os, const std::set<K, C, A>& set) const {
if (!set.empty()) {
std::size_t i = 0;
for (auto&& v : set) {
os << to_string(v);
if (i < (set.size() - 1))
os << ' ';
++i;
}
}
}
};

template <typename K, typename T, typename C, typename A>
struct Write<std::map<K, T, C, A>> {
void operator()(std::ostream& os, const std::map<K, T, C, A>& map) const {
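
These specializations let `ov::Any` serialize a `std::set` to a space-separated string and parse it back. A minimal sketch of that round trip, assuming the usual `Any` string-conversion behavior (not part of the change itself):

```cpp
#include <iostream>
#include <set>
#include <string>

#include "openvino/core/any.hpp"

int main() {
    // Write<std::set>: a set held by Any can be printed as a space-separated string.
    ov::Any stored = std::set<std::string>{"GPU.0", "GPU.1"};
    std::cout << stored.as<std::string>() << std::endl;  // "GPU.0 GPU.1"

    // Read<std::set>: a space-separated string can be parsed back into a set.
    ov::Any text = std::string{"CPU GPU.0 GPU.1"};
    auto devices = text.as<std::set<std::string>>();
    std::cout << devices.size() << std::endl;  // 3
    return 0;
}
```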
17 changes: 17 additions & 0 deletions src/core/tests/any.cpp
@@ -158,6 +158,23 @@ TEST_F(AnyTests, AnyAsMapOfAnys) {
ASSERT_EQ(refMap["testParamString"].as<std::string>(), testString);
}

TEST_F(AnyTests, AnyAsSetOfAnys) {
std::set<std::string> refSet0;
std::set<int> refSet1;
refSet0.insert("test");
refSet1.insert(4);
Any s0 = refSet0;
Any s1 = refSet1;
bool isSet0 = s0.is<std::set<std::string>>();
bool isSet1 = s1.is<std::set<int>>();
ASSERT_TRUE(isSet0);
ASSERT_TRUE(isSet1);
auto testSet0 = s0.as<std::set<std::string>>();
auto testSet1 = s1.as<std::set<int>>();
ASSERT_NE(testSet0.count("test"), 0);
ASSERT_NE(testSet1.count(4), 0);
}

TEST_F(AnyTests, AnyAsMapOfMapOfAnys) {
std::map<std::string, Any> refMap1;
refMap1["testParamInt"] = 4;
@@ -69,5 +69,12 @@ static constexpr Property<std::string, PropertyMutability::RO> compiled_model_ru
static constexpr Property<bool, PropertyMutability::RO> compiled_model_runtime_properties_supported{
"COMPILED_MODEL_RUNTIME_PROPERTIES_SUPPORTED"};

/**
* @brief Read-write property to set the percentage of the estimated model size which is used to determine the query
* model results for further processing
* @ingroup ov_dev_api_plugin_api
*/
static constexpr Property<float, PropertyMutability::RW> query_model_ratio{"QUERY_MODEL_RATIO"};

} // namespace internal
} // namespace ov
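
A hypothetical helper illustrating how a plugin might read this internal property from its configuration (the helper name, the include path, and the 1.0f fallback are assumptions for illustration, not part of the change):

```cpp
#include "openvino/runtime/internal_properties.hpp"

// Hypothetical helper: read ov::internal::query_model_ratio from a plugin
// configuration map, falling back to 1.0f (the whole model may be queried)
// when the property is not set.
static float get_query_model_ratio(const ov::AnyMap& config) {
    auto it = config.find(ov::internal::query_model_ratio.name());
    return it == config.end() ? 1.0f : it->second.as<float>();
}
```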
4 changes: 3 additions & 1 deletion src/inference/dev_api/openvino/runtime/iplugin.hpp
@@ -230,12 +230,14 @@ class OPENVINO_RUNTIME_API IPlugin : public std::enable_shared_from_this<IPlugin
* @param model Original model
* @param transform Transformation pipeline function
* @param is_node_supported Function returning whether node is supported or not
* @param query_model_ratio The percentage of the model that can be queried during query_model (0 if the model is not queried)
* @return Set of strings which contains supported node names
*/
OPENVINO_RUNTIME_API std::unordered_set<std::string> get_supported_nodes(
const std::shared_ptr<const ov::Model>& model,
std::function<void(std::shared_ptr<ov::Model>&)> transform,
std::function<bool(const std::shared_ptr<ov::Node>)> is_node_supported);
std::function<bool(const std::shared_ptr<ov::Node>)> is_node_supported,
float query_model_ratio = 1.0f);

/**
* @private
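
A sketch of how a device plugin's query_model() might pass the ratio to the extended helper; the wrapper name, the empty transformation, and the always-true support predicate are placeholders, not actual plugin code:

```cpp
#include <memory>
#include <string>
#include <unordered_set>

#include "openvino/runtime/iplugin.hpp"

// Placeholder wrapper around the extended get_supported_nodes() overload.
std::unordered_set<std::string> report_supported_nodes(const std::shared_ptr<const ov::Model>& model,
                                                        float query_model_ratio) {
    return ov::get_supported_nodes(
        model,
        [](std::shared_ptr<ov::Model>& m) {
            // apply the plugin's common transformation pipeline to the cloned model here
        },
        [](const std::shared_ptr<ov::Node> node) {
            return true;  // placeholder for the plugin-specific support check
        },
        query_model_ratio);  // a ratio < 1.0f limits how much of the model is reported
}
```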
24 changes: 16 additions & 8 deletions src/inference/include/openvino/runtime/properties.hpp
@@ -400,16 +400,20 @@ inline std::istream& operator>>(std::istream& is, SchedulingCoreType& core_type)
static constexpr Property<SchedulingCoreType> scheduling_core_type{"SCHEDULING_CORE_TYPE"};

enum class ModelDistributionPolicy {
TENSOR_PARALLEL = 0, // Split tensor into several parts and distribute them between sockets/devices during model
// compilation. At inference time sockets/devices process tensors in parallel and do
// syncronization at the end ensuring mathematical correctness.
TENSOR_PARALLEL = 0, // Distribute tensors to multiple sockets/devices during model compilation. At inference
// time, sockets/devices process individual tensors in parallel.
PIPELINE_PARALLEL = 1, // Distribute tensors to multiple sockets/devices during model compilation. At inference
// time, sockets/devices process individual tensors one by one, and each socket/device
// processes a portion of a different tensor in parallel.
};

/** @cond INTERNAL */
inline std::ostream& operator<<(std::ostream& os, const ModelDistributionPolicy& stream_mode) {
switch (stream_mode) {
case ModelDistributionPolicy::TENSOR_PARALLEL:
return os << "TENSOR_PARALLEL";
case ModelDistributionPolicy::PIPELINE_PARALLEL:
return os << "PIPELINE_PARALLEL";
default:
OPENVINO_THROW("Unsupported model distribution policy!");
}
@@ -420,6 +424,8 @@ inline std::istream& operator>>(std::istream& is, ModelDistributionPolicy& strea
is >> str;
if (str == "TENSOR_PARALLEL") {
stream_mode = ModelDistributionPolicy::TENSOR_PARALLEL;
} else if (str == "PIPELINE_PARALLEL") {
stream_mode = ModelDistributionPolicy::PIPELINE_PARALLEL;
} else {
OPENVINO_THROW("Unsupported model distribution policy: ", str);
}
@@ -430,17 +436,19 @@ inline std::istream& operator>>(std::istream& is, ModelDistributionPolicy& strea
/**
* @brief This property defines model distribution policy for inference with multiple sockets/devices.
* @ingroup ov_runtime_cpp_prop_api
*
* This property can be used to select model distribution policy between execution units (e.g. between CPU sockets/NUMA
* nodes or between different GPUs).
* -- TENSOR_PARALLEL : Split tensor into several parts and distribute them between sockets/devices during model
* compilation. At inference time sockets/devices process tensors in parallel and do syncronization
* at the end ensuring mathematical correctness.
* -- TENSOR_PARALLEL : Distribute tensors to multiple sockets/devices during model compilation. At inference time,
* sockets/devices process individual tensors in parallel.
* -- PIPELINE_PARALLEL : Distribute tensors to multiple sockets/devices during model compilation. At inference time,
* sockets/devices process individual tensors one by one, and each socket/device processes a
* portion of a different tensor in parallel.
*
* The following code is an example how TENSOR_PARALLEL model disrtibution policy might be enabled.
* The following code is an example of how the TENSOR_PARALLEL or PIPELINE_PARALLEL model distribution policy might be enabled.
*
* @code
* ie.set_property(ov::hint::model_distribution_policy({ov::hint::ModelDistributionPolicy::TENSOR_PARALLEL}));
* ie.set_property(ov::hint::model_distribution_policy({ov::hint::ModelDistributionPolicy::PIPELINE_PARALLEL}));
* @endcode
*/
static constexpr Property<std::set<ModelDistributionPolicy>> model_distribution_policy{"MODEL_DISTRIBUTION_POLICY"};
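
Because both enum values now stream in and out, a set of policies can round-trip through `ov::Any`'s string form using the `Read`/`Write` `std::set` specializations added above (a small illustration, assuming the usual `Any` conversion behavior; not part of the change itself):

```cpp
#include <set>
#include <string>

#include "openvino/core/any.hpp"
#include "openvino/runtime/properties.hpp"

int main() {
    std::set<ov::hint::ModelDistributionPolicy> policies{ov::hint::ModelDistributionPolicy::PIPELINE_PARALLEL};

    // Serialize the set to its string form, then parse it back.
    std::string text = ov::Any(policies).as<std::string>();  // "PIPELINE_PARALLEL"
    auto parsed = ov::Any(text).as<std::set<ov::hint::ModelDistributionPolicy>>();
    return parsed.count(ov::hint::ModelDistributionPolicy::PIPELINE_PARALLEL) ? 0 : 1;
}
```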