Skip to content

Commit

Permalink
[XPU] refactor thread_local
Browse files Browse the repository at this point in the history
  • Loading branch information
xiuxin121 committed Dec 7, 2022
1 parent ff9406b commit f2e70ba
Show file tree
Hide file tree
Showing 13 changed files with 477 additions and 177 deletions.
3 changes: 3 additions & 0 deletions lite/api/cxx_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,9 @@ void Predictor::SaveOpKernelInfo(const std::string &model_dir) {

#if !defined(LITE_WITH_METAL)
lite::Tensor *Predictor::GetInput(size_t offset) {
#ifdef LITE_WITH_XPU
XPU_CALL(xpu_set_device(xpu_runtime_option_.xpu_dev_num));
#endif
CHECK(input_names_.size() > offset)
<< "The network has " << input_names_.size() << " inputs"
<< ", the offset should be less than this.";
Expand Down
26 changes: 26 additions & 0 deletions lite/api/cxx_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,12 @@ class LITE_API Predictor {
CheckInputValid();

#ifdef LITE_WITH_XPU
if (lite::TargetWrapperXPU::xpu_runtime_ptr != &xpu_runtime_option_) {
lite::TargetWrapperXPU::xpu_runtime_ptr = &xpu_runtime_option_;
// Because the runtime context is thread_local, we must reset the device
// when switching between different predictors on the same thread.
XPU_CALL(xpu_set_device(xpu_runtime_option_.xpu_dev_num));
}
std::vector<std::vector<int64_t>> query_shape;
for (size_t i = 0; i < input_names_.size(); i++) {
query_shape.push_back(std::vector<int64_t>(GetInput(i)->dims().data()));
Expand Down Expand Up @@ -236,6 +242,20 @@ class LITE_API Predictor {
/////////////////////////////////////////////////////////////////////////////
void CheckPaddleOpVersions(
const std::shared_ptr<cpp::ProgramDesc>& program_desc);
#ifdef LITE_WITH_XPU
// Copy the XPU runtime settings carried by `config` into this
// predictor's private runtime option and bind the configured XPU
// device. A config without an XPU runtime option is a no-op.
void SetXPURunTimeOption(const lite_api::CxxConfig& config) {
  const void* opaque_option = config.get_xpu_runtime_option();
  if (opaque_option == nullptr) return;
  xpu_runtime_option_.Set(
      reinterpret_cast<const lite::XPURunTimeOption*>(opaque_option));
  XPU_CALL(xpu_set_device(xpu_runtime_option_.xpu_dev_num));
}

// Hand an externally created XPU stream handle to this predictor's
// runtime option; subsequent work is issued on that stream.
void set_xpu_stream(void* stream) {
  xpu_runtime_option_.xpu_stream.SetXPUStream(stream);
}

#endif

// #ifdef LITE_WITH_TRAIN
// void Run(const std::vector<framework::Tensor>& tensors) {
Expand All @@ -257,6 +277,9 @@ class LITE_API Predictor {
#endif

private:
#ifdef LITE_WITH_XPU
XPURunTimeOption xpu_runtime_option_;
#endif
std::shared_ptr<cpp::ProgramDesc> program_desc_;
std::shared_ptr<Scope> scope_;
Scope* exec_scope_;
Expand Down Expand Up @@ -323,6 +346,9 @@ class CxxPaddleApiImpl : public lite_api::PaddlePredictor {
const std::string& model_dir,
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
bool record_info = false) override;
#ifdef LITE_WITH_XPU
void set_xpu_stream(void* stream);
#endif

private:
std::shared_ptr<Predictor> raw_predictor_;
Expand Down
9 changes: 9 additions & 0 deletions lite/api/cxx_api_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
config_ = config;
mode_ = config.power_mode();
threads_ = config.threads();
#ifdef LITE_WITH_XPU
raw_predictor_->SetXPURunTimeOption(config);
#endif
#ifdef LITE_USE_THREAD_POOL
int thread_num = ThreadPool::Init(threads_);
if (thread_num > 1) {
Expand Down Expand Up @@ -278,6 +281,12 @@ bool CxxPaddleApiImpl::TryShrinkMemory() {
return raw_predictor_->TryShrinkMemory();
}

#ifdef LITE_WITH_XPU
// Thin forwarder: pass the user-supplied XPU stream down to the raw
// predictor, which owns the per-predictor runtime option.
void CxxPaddleApiImpl::set_xpu_stream(void *stream) {
  raw_predictor_->set_xpu_stream(stream);
}
#endif

} // namespace lite

namespace lite_api {
Expand Down
65 changes: 56 additions & 9 deletions lite/api/paddle_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#ifdef LITE_WITH_XPU
#include <functional>
#include <mutex> // NOLINT
#include "lite/backends/xpu/runtime_option.h"
#include "lite/backends/xpu/target_wrapper.h"
#endif

Expand Down Expand Up @@ -478,10 +479,12 @@ void CxxConfig::set_xpu_l3_cache_method(size_t l3_size, bool locked) {
CHECK(lite::TargetWrapperXPU::shared_l3_size >= l3_size)
<< "Enlarge XPU Shared L3 Cache Is Not Allowed.";
}
lite::TargetWrapperXPU::local_l3_size = 0;
reinterpret_cast<lite::XPURunTimeOption *>(set_xpu_runtime_option())
->xpu_local_l3_size = 0;
lite::TargetWrapperXPU::need_l3_mutex = true;
} else {
lite::TargetWrapperXPU::local_l3_size = l3_size;
reinterpret_cast<lite::XPURunTimeOption *>(set_xpu_runtime_option())
->xpu_local_l3_size = l3_size;
lite::TargetWrapperXPU::need_l3_mutex = false;
}
#else
Expand All @@ -493,17 +496,19 @@ void CxxConfig::set_xpu_l3_cache_method(size_t l3_size, bool locked) {

// Enable/disable autotuning of the per-predictor local L3 cache.
// On non-XPU builds the call is a logged no-op.
void CxxConfig::set_xpu_l3_cache_autotune(bool autotune) {
#ifdef LITE_WITH_XPU
  auto *runtime_option =
      reinterpret_cast<lite::XPURunTimeOption *>(set_xpu_runtime_option());
  runtime_option->xpu_local_l3_autotune = autotune;
#else
  LOG(WARNING) << "The invoking of the function "
                  "'set_xpu_l3_cache_autotune' is ignored, please "
                  "rebuild it with LITE_WITH_XPU=ON.";
#endif
}

void set_xpu_gm_workspace_method(size_t gm_size) {
void CxxConfig::set_xpu_gm_workspace_method(size_t gm_size) {
#ifdef LITE_WITH_XPU
lite::TargetWrapperXPU::local_gm_size = gm_size;
reinterpret_cast<lite::XPURunTimeOption *>(set_xpu_runtime_option())
->xpu_local_gm_size = gm_size;
#else
LOG(WARNING) << "The invoking of the function "
"'set_xpu_gm_workspace_method' is ignored, please "
Expand All @@ -513,7 +518,8 @@ void set_xpu_gm_workspace_method(size_t gm_size) {

void CxxConfig::set_xpu_dev_per_thread(int dev_no) {
#ifdef LITE_WITH_XPU
lite::TargetWrapperXPU::SetDev(dev_no);
reinterpret_cast<lite::XPURunTimeOption *>(set_xpu_runtime_option())
->xpu_dev_num = dev_no;
#else
LOG(WARNING) << "The invoking of the function 'set_xpu_dev_per_thread' is "
"ignored, please rebuild it with LITE_WITH_XPU=ON.";
Expand All @@ -522,7 +528,8 @@ void CxxConfig::set_xpu_dev_per_thread(int dev_no) {

void CxxConfig::enable_xpu_multi_stream() {
#ifdef LITE_WITH_XPU
lite::TargetWrapperXPU::enable_xpu_multi_stream();
reinterpret_cast<lite::XPURunTimeOption *>(set_xpu_runtime_option())
->xpu_enable_multi_stream = true;
#else
LOG(WARNING)
<< "The invoking of the function 'enable_xpu_stream_per_thread' is "
Expand Down Expand Up @@ -591,7 +598,8 @@ void CxxConfig::set_xpu_conv_autotune(bool autotune,

void CxxConfig::set_xpu_cluster_num(const int num) {
#ifdef LITE_WITH_XPU
lite::TargetWrapperXPU::cluster_num = num;
reinterpret_cast<lite::XPURunTimeOption *>(set_xpu_runtime_option())
->xpu_cluster_num = num;
#else
LOG(WARNING) << "The invoking of the function "
"'set_xpu_cluster_num' is ignored, please "
Expand All @@ -601,14 +609,37 @@ void CxxConfig::set_xpu_cluster_num(const int num) {

// Record the number of SDNN units to use in the per-predictor XPU
// runtime option. On non-XPU builds the call is a logged no-op.
void CxxConfig::set_xpu_sdnn_num(const int num) {
#ifdef LITE_WITH_XPU
  auto *runtime_option =
      reinterpret_cast<lite::XPURunTimeOption *>(set_xpu_runtime_option());
  runtime_option->xpu_sdnn_num = num;
#else
  LOG(WARNING) << "The invoking of the function "
                  "'set_xpu_sdnn_num' is ignored, please "
                  "rebuild it with LITE_WITH_XPU=ON.";
#endif
}

// Store the directory/path used for dumping tensors in the
// per-predictor XPU runtime option. Logged no-op on non-XPU builds.
// NOTE: the parameter stays pass-by-value to match the header
// declaration; changing it to a reference would break linkage.
void CxxConfig::set_xpu_dump_tensor_path(const std::string dump_tensor_path) {
#ifdef LITE_WITH_XPU
  auto *runtime_option =
      reinterpret_cast<lite::XPURunTimeOption *>(set_xpu_runtime_option());
  runtime_option->xpu_dump_tensor_path = dump_tensor_path;
#else
  LOG(WARNING) << "The invoking of the function "
                  "'set_xpu_dump_tensor_path' is ignored, please "
                  "rebuild it with LITE_WITH_XPU=ON.";
#endif
}

// Store the path used for dumping logs in the per-predictor XPU
// runtime option. Logged no-op on non-XPU builds.
// NOTE: the parameter stays pass-by-value to match the header
// declaration; changing it to a reference would break linkage.
void CxxConfig::set_xpu_dump_log_path(const std::string dump_log_path) {
#ifdef LITE_WITH_XPU
  auto *runtime_option =
      reinterpret_cast<lite::XPURunTimeOption *>(set_xpu_runtime_option());
  runtime_option->xpu_dump_log_path = dump_log_path;
#else
  LOG(WARNING) << "The invoking of the function "
                  "'set_xpu_dump_log_path' is ignored, please "
                  "rebuild it with LITE_WITH_XPU=ON.";
#endif
}

template <class T>
void CxxConfig::set_preferred_inputs_for_warmup(const int group_idx,
const int tensor_idx,
Expand Down Expand Up @@ -665,6 +696,22 @@ _SetPreferredInputsForWarmup(int32_t);
_SetPreferredInputsForWarmup(int64_t);
#undef _SetPreferredInputsForWarmup

// Lazily create the per-config XPU runtime option object and return an
// opaque pointer to it. On non-XPU builds the option has no meaning and
// nullptr is returned.
void *CxxConfig::set_xpu_runtime_option() {
#ifdef LITE_WITH_XPU
  if (runtime_option_ == nullptr) {
    // make_shared: single allocation, exception-safe; converts cleanly
    // into the type-erased std::shared_ptr<void> member.
    runtime_option_ = std::make_shared<lite::XPURunTimeOption>();
  }
  return runtime_option_.get();
#else
  LOG(WARNING) << "'set_xpu_runtime_option' is only for xpu now, please "
                  "rebuild it with LITE_WITH_XPU=ON.";
  // BUGFIX: this branch previously fell off the end of a non-void
  // function (undefined behavior, -Wreturn-type); return nullptr.
  return nullptr;
#endif
}

// set model data in combined format, `set_model_from_file` refers to loading
// model from file, set_model_from_buffer refers to loading model from memory
// buffer
Expand Down
13 changes: 13 additions & 0 deletions lite/api/paddle_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,7 @@ class LITE_API CxxConfig : public ConfigBase {
float sparse_threshold_{0.6f};
std::map<int, std::vector<std::shared_ptr<void>>>
preferred_inputs_for_warmup_;
std::shared_ptr<void> runtime_option_{nullptr};
// The custom configuration file or buffer for the NNAdapter subgraph
// partition, here is an example:
// op_type:in_var_name_0,in_var_name1:out_var_name_0,out_var_name1
Expand Down Expand Up @@ -424,6 +425,13 @@ class LITE_API CxxConfig : public ConfigBase {
// but is_model_from_memory is recommended and `model_from_memory` will be
// abandoned in v3.0.
bool model_from_memory() const { return static_cast<bool>(model_buffer_); }
// Read-only accessor for the opaque XPU runtime option blob. Yields
// nullptr when no option has been configured: shared_ptr::get() on an
// empty pointer is already nullptr, so no explicit branch is needed.
const void* get_xpu_runtime_option() const { return runtime_option_.get(); }

// XPU only, set the size of the workspace memory from L3 cache for the
// current thread.
Expand Down Expand Up @@ -455,6 +463,8 @@ class LITE_API CxxConfig : public ConfigBase {
void set_xpu_sdnn_num(const int num);
void set_xpu_local_quant(bool local_quant = false);
void set_xpu_compute_precision(const std::string& precision = "int16");
void set_xpu_dump_tensor_path(const std::string dump_tensor_path = "");
void set_xpu_dump_log_path(const std::string dump_log_path = "");

// set input tensor for warmup.
// It is optional. If you set prefered_inputs, model wil run immediately when
Expand Down Expand Up @@ -520,6 +530,9 @@ class LITE_API CxxConfig : public ConfigBase {
const {
return mixed_precision_quantization_config_buffer_;
}

private:
void* set_xpu_runtime_option();
};

/// MobileConfig is the config for the light weight predictor, it will skip
Expand Down
Loading

0 comments on commit f2e70ba

Please sign in to comment.