[HETERO] support LLM and split model per available memory size (#21764)
### Details:
 - Support LLM inference with the following device configurations:
      |Device|
      |--|
      |HETERO:CPU|
      |HETERO:GPU|
      |HETERO:CPU,GPU|
      |HETERO:GPU,CPU|
      |HETERO:GPU.0,GPU.1|
      |HETERO:GPU.0,GPU.1,CPU|
      |HETERO:GPU.0,GPU.1,GPU.2|

- Use the `ov::hint::model_distribution_policy` property introduced in [PR23077](#23077)
- Use host memory for input/output data exchange between subgraphs
- Fold supported and unsupported nodes into Subgraph nodes in the graph and run query_model on those
subgraphs, keeping the model used in query_model the same as the one used for compilation
- Add a property `ov::query_model_ratio` to set the percentage of the
model that can be queried during query_model
- Improve performance for LLMs with large parameter counts by splitting the
model across devices; the number of sub-models is kept small to
reduce communication overhead between devices (a usage sketch follows this list)
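
A minimal sketch of how an application might request such a split (the model path and device list below are placeholders, and whether HETERO actually applies PIPELINE_PARALLEL depends on the devices and available memory):

```cpp
#include "openvino/openvino.hpp"

int main() {
    ov::Core core;
    auto model = core.read_model("llm.xml");  // placeholder model path

    // Ask HETERO to split the model between two GPUs and the CPU,
    // distributing the parts according to available memory.
    auto compiled = core.compile_model(
        model,
        "HETERO:GPU.0,GPU.1,CPU",
        ov::hint::model_distribution_policy({ov::hint::ModelDistributionPolicy::PIPELINE_PARALLEL}));

    auto request = compiled.create_infer_request();
    return 0;
}
```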


### Tickets:
 - *CVS-133258*

---------

Co-authored-by: Nadezhda <[email protected]>
Co-authored-by: Shen, Wanglei <[email protected]>
Co-authored-by: yanlan song <[email protected]>
4 people authored Mar 29, 2024
1 parent b250796 commit 3e114be
Showing 22 changed files with 949 additions and 78 deletions.
@@ -72,7 +72,8 @@ void regmodule_properties(py::module m) {
.value("ECORE_ONLY", ov::hint::SchedulingCoreType::ECORE_ONLY);

py::enum_<ov::hint::ModelDistributionPolicy>(m_hint, "ModelDistributionPolicy", py::arithmetic())
.value("TENSOR_PARALLEL", ov::hint::ModelDistributionPolicy::TENSOR_PARALLEL);
.value("TENSOR_PARALLEL", ov::hint::ModelDistributionPolicy::TENSOR_PARALLEL)
.value("PIPELINE_PARALLEL", ov::hint::ModelDistributionPolicy::PIPELINE_PARALLEL);

py::enum_<ov::hint::ExecutionMode>(m_hint, "ExecutionMode", py::arithmetic())
.value("PERFORMANCE", ov::hint::ExecutionMode::PERFORMANCE)
28 changes: 28 additions & 0 deletions src/core/include/openvino/core/any.hpp
@@ -10,6 +10,7 @@

#include <map>
#include <memory>
#include <set>
#include <string>
#include <typeindex>
#include <typeinfo>
@@ -209,6 +210,18 @@ struct Read<std::vector<T, A>, typename std::enable_if<std::is_default_construct
}
};

template <typename K, typename C, typename A>
struct Read<std::set<K, C, A>, typename std::enable_if<std::is_default_constructible<K>::value>::type> {
void operator()(std::istream& is, std::set<K, C, A>& set) const {
while (is.good()) {
std::string str;
is >> str;
auto v = from_string<K>(str);
set.insert(std::move(v));
}
}
};

template <typename K, typename T, typename C, typename A>
struct Read<
std::map<K, T, C, A>,
@@ -343,6 +356,21 @@ struct Write<std::vector<T, A>> {
}
};

template <typename K, typename C, typename A>
struct Write<std::set<K, C, A>> {
void operator()(std::ostream& os, const std::set<K, C, A>& set) const {
if (!set.empty()) {
std::size_t i = 0;
for (auto&& v : set) {
os << to_string(v);
if (i < (set.size() - 1))
os << ' ';
++i;
}
}
}
};

template <typename K, typename T, typename C, typename A>
struct Write<std::map<K, T, C, A>> {
void operator()(std::ostream& os, const std::map<K, T, C, A>& map) const {
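
These specializations let `ov::Any` serialize a `std::set` to a space-separated string and parse it back. A minimal sketch of that round trip, assuming the usual `Any` string-conversion behavior (not part of the change itself):

```cpp
#include <iostream>
#include <set>
#include <string>

#include "openvino/core/any.hpp"

int main() {
    // Write<std::set>: a set held by Any can be printed as a space-separated string.
    ov::Any stored = std::set<std::string>{"GPU.0", "GPU.1"};
    std::cout << stored.as<std::string>() << std::endl;  // "GPU.0 GPU.1"

    // Read<std::set>: a space-separated string can be parsed back into a set.
    ov::Any text = std::string{"CPU GPU.0 GPU.1"};
    auto devices = text.as<std::set<std::string>>();
    std::cout << devices.size() << std::endl;  // 3
    return 0;
}
```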
17 changes: 17 additions & 0 deletions src/core/tests/any.cpp
@@ -158,6 +158,23 @@ TEST_F(AnyTests, AnyAsMapOfAnys) {
ASSERT_EQ(refMap["testParamString"].as<std::string>(), testString);
}

TEST_F(AnyTests, AnyAsSetOfAnys) {
std::set<std::string> refSet0;
std::set<int> refSet1;
refSet0.insert("test");
refSet1.insert(4);
Any s0 = refSet0;
Any s1 = refSet1;
bool isSet0 = s0.is<std::set<std::string>>();
bool isSet1 = s1.is<std::set<int>>();
ASSERT_TRUE(isSet0);
ASSERT_TRUE(isSet1);
auto testSet0 = s0.as<std::set<std::string>>();
auto testSet1 = s1.as<std::set<int>>();
ASSERT_NE(testSet0.count("test"), 0);
ASSERT_NE(testSet1.count(4), 0);
}

TEST_F(AnyTests, AnyAsMapOfMapOfAnys) {
std::map<std::string, Any> refMap1;
refMap1["testParamInt"] = 4;
@@ -69,5 +69,12 @@ static constexpr Property<std::string, PropertyMutability::RO> compiled_model_ru
static constexpr Property<bool, PropertyMutability::RO> compiled_model_runtime_properties_supported{
"COMPILED_MODEL_RUNTIME_PROPERTIES_SUPPORTED"};

/**
* @brief Read-write property to set the percentage of the estimated model size which is used to determine the query
* model results for further processing
* @ingroup ov_dev_api_plugin_api
*/
static constexpr Property<float, PropertyMutability::RW> query_model_ratio{"QUERY_MODEL_RATIO"};

} // namespace internal
} // namespace ov
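
A hypothetical helper illustrating how a plugin might read this internal property from its configuration (the helper name, the include path, and the 1.0f fallback are assumptions for illustration, not part of the change):

```cpp
#include "openvino/runtime/internal_properties.hpp"

// Hypothetical helper: read ov::internal::query_model_ratio from a plugin
// configuration map, falling back to 1.0f (the whole model may be queried)
// when the property is not set.
static float get_query_model_ratio(const ov::AnyMap& config) {
    auto it = config.find(ov::internal::query_model_ratio.name());
    return it == config.end() ? 1.0f : it->second.as<float>();
}
```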
4 changes: 3 additions & 1 deletion src/inference/dev_api/openvino/runtime/iplugin.hpp
@@ -230,12 +230,14 @@ class OPENVINO_RUNTIME_API IPlugin : public std::enable_shared_from_this<IPlugin
* @param model Original model
* @param transform Transformation pipeline function
* @param is_node_supported Function returning whether node is supported or not
* @param query_model_ratio The percentage of the model that can be queried during query_model (0 if the model is not queried)
* @return Set of strings which contains supported node names
*/
OPENVINO_RUNTIME_API std::unordered_set<std::string> get_supported_nodes(
const std::shared_ptr<const ov::Model>& model,
std::function<void(std::shared_ptr<ov::Model>&)> transform,
std::function<bool(const std::shared_ptr<ov::Node>)> is_node_supported);
std::function<bool(const std::shared_ptr<ov::Node>)> is_node_supported,
float query_model_ratio = 1.0f);

/**
* @private
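
A sketch of how a device plugin's query_model() might pass the ratio to the extended helper; the wrapper name, the empty transformation, and the always-true support predicate are placeholders, not actual plugin code:

```cpp
#include <memory>
#include <string>
#include <unordered_set>

#include "openvino/runtime/iplugin.hpp"

// Placeholder wrapper around the extended get_supported_nodes() overload.
std::unordered_set<std::string> report_supported_nodes(const std::shared_ptr<const ov::Model>& model,
                                                        float query_model_ratio) {
    return ov::get_supported_nodes(
        model,
        [](std::shared_ptr<ov::Model>& m) {
            // apply the plugin's common transformation pipeline to the cloned model here
        },
        [](const std::shared_ptr<ov::Node> node) {
            return true;  // placeholder for the plugin-specific support check
        },
        query_model_ratio);  // a ratio < 1.0f limits how much of the model is reported
}
```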
24 changes: 16 additions & 8 deletions src/inference/include/openvino/runtime/properties.hpp
@@ -400,16 +400,20 @@ inline std::istream& operator>>(std::istream& is, SchedulingCoreType& core_type)
static constexpr Property<SchedulingCoreType> scheduling_core_type{"SCHEDULING_CORE_TYPE"};

enum class ModelDistributionPolicy {
TENSOR_PARALLEL = 0, // Split tensor into several parts and distribute them between sockets/devices during model
// compilation. At inference time sockets/devices process tensors in parallel and do
// syncronization at the end ensuring mathematical correctness.
TENSOR_PARALLEL = 0, // Distribute tensors to multiple sockets/devices during model compilation. At inference
// time, sockets/devices process individual tensors in parallel.
PIPELINE_PARALLEL = 1, // Distribute tensors to multiple sockets/devices during model compilation. At inference
// time, sockets/devices process individual tensors one by one, and each socket/device
// processes a portion of a different tensor in parallel.
};

/** @cond INTERNAL */
inline std::ostream& operator<<(std::ostream& os, const ModelDistributionPolicy& stream_mode) {
switch (stream_mode) {
case ModelDistributionPolicy::TENSOR_PARALLEL:
return os << "TENSOR_PARALLEL";
case ModelDistributionPolicy::PIPELINE_PARALLEL:
return os << "PIPELINE_PARALLEL";
default:
OPENVINO_THROW("Unsupported model distribution policy!");
}
@@ -420,6 +424,8 @@ inline std::istream& operator>>(std::istream& is, ModelDistributionPolicy& strea
is >> str;
if (str == "TENSOR_PARALLEL") {
stream_mode = ModelDistributionPolicy::TENSOR_PARALLEL;
} else if (str == "PIPELINE_PARALLEL") {
stream_mode = ModelDistributionPolicy::PIPELINE_PARALLEL;
} else {
OPENVINO_THROW("Unsupported model distribution policy: ", str);
}
@@ -430,17 +436,19 @@ inline std::istream& operator>>(std::istream& is, ModelDistributionPolicy& strea
/**
* @brief This property defines model distribution policy for inference with multiple sockets/devices.
* @ingroup ov_runtime_cpp_prop_api
*
* This property can be used to select model distribution policy between execution units (e.g. between CPU sockets/NUMA
* nodes or between different GPUs).
* -- TENSOR_PARALLEL : Split tensor into several parts and distribute them between sockets/devices during model
* compilation. At inference time sockets/devices process tensors in parallel and do syncronization
* at the end ensuring mathematical correctness.
* -- TENSOR_PARALLEL : Distribute tensors to multiple sockets/devices during model compilation. At inference time,
* sockets/devices process individual tensors in parallel.
* -- PIPELINE_PARALLEL : Distribute tensors to multiple sockets/devices during model compilation. At inference time,
* sockets/devices process individual tensors one by one, and each socket/device processes a
* portion of a different tensor in parallel.
*
* The following code is an example how TENSOR_PARALLEL model disrtibution policy might be enabled.
* The following code is an example of how the TENSOR_PARALLEL or PIPELINE_PARALLEL model distribution policy might be enabled.
*
* @code
* ie.set_property(ov::hint::model_distribution_policy({ov::hint::ModelDistributionPolicy::TENSOR_PARALLEL}));
* ie.set_property(ov::hint::model_distribution_policy({ov::hint::ModelDistributionPolicy::PIPELINE_PARALLEL}));
* @endcode
*/
static constexpr Property<std::set<ModelDistributionPolicy>> model_distribution_policy{"MODEL_DISTRIBUTION_POLICY"};
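
Because both enum values now stream in and out, a set of policies can round-trip through `ov::Any`'s string form using the `Read`/`Write` `std::set` specializations added above (a small illustration, assuming the usual `Any` conversion behavior; not part of the change itself):

```cpp
#include <set>
#include <string>

#include "openvino/core/any.hpp"
#include "openvino/runtime/properties.hpp"

int main() {
    std::set<ov::hint::ModelDistributionPolicy> policies{ov::hint::ModelDistributionPolicy::PIPELINE_PARALLEL};

    // Serialize the set to its string form, then parse it back.
    std::string text = ov::Any(policies).as<std::string>();  // "PIPELINE_PARALLEL"
    auto parsed = ov::Any(text).as<std::set<ov::hint::ModelDistributionPolicy>>();
    return parsed.count(ov::hint::ModelDistributionPolicy::PIPELINE_PARALLEL) ? 0 : 1;
}
```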