Skip to content

Commit

Permalink
[XPU] refactor thread_local
Browse files Browse the repository at this point in the history
  • Loading branch information
xiuxin121 committed Dec 7, 2022
1 parent ff9406b commit f2e70ba
Show file tree
Hide file tree
Showing 13 changed files with 477 additions and 177 deletions.
3 changes: 3 additions & 0 deletions lite/api/cxx_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,9 @@ void Predictor::SaveOpKernelInfo(const std::string &model_dir) {

#if !defined(LITE_WITH_METAL)
lite::Tensor *Predictor::GetInput(size_t offset) {
#ifdef LITE_WITH_XPU
XPU_CALL(xpu_set_device(xpu_runtime_option_.xpu_dev_num));
#endif
CHECK(input_names_.size() > offset)
<< "The network has " << input_names_.size() << " inputs"
<< ", the offset should be less than this.";
Expand Down
26 changes: 26 additions & 0 deletions lite/api/cxx_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,12 @@ class LITE_API Predictor {
CheckInputValid();

#ifdef LITE_WITH_XPU
if (lite::TargetWrapperXPU::xpu_runtime_ptr != &xpu_runtime_option_) {
lite::TargetWrapperXPU::xpu_runtime_ptr = &xpu_runtime_option_;
// Because the runtime context is thread_local, we must reset the device
// when switching between different predictors on the same thread.
XPU_CALL(xpu_set_device(xpu_runtime_option_.xpu_dev_num));
}
std::vector<std::vector<int64_t>> query_shape;
for (size_t i = 0; i < input_names_.size(); i++) {
query_shape.push_back(std::vector<int64_t>(GetInput(i)->dims().data()));
Expand Down Expand Up @@ -236,6 +242,20 @@ class LITE_API Predictor {
/////////////////////////////////////////////////////////////////////////////
void CheckPaddleOpVersions(
const std::shared_ptr<cpp::ProgramDesc>& program_desc);
#ifdef LITE_WITH_XPU
// Copy the XPU runtime settings carried by `config` into this
// predictor's private runtime option and bind the configured XPU
// device. A config without an XPU runtime option is a no-op.
void SetXPURunTimeOption(const lite_api::CxxConfig& config) {
  const void* opaque_option = config.get_xpu_runtime_option();
  if (opaque_option == nullptr) return;
  xpu_runtime_option_.Set(
      reinterpret_cast<const lite::XPURunTimeOption*>(opaque_option));
  XPU_CALL(xpu_set_device(xpu_runtime_option_.xpu_dev_num));
}

// Hand an externally created XPU stream handle to this predictor's
// runtime option; subsequent work is issued on that stream.
void set_xpu_stream(void* stream) {
  xpu_runtime_option_.xpu_stream.SetXPUStream(stream);
}

#endif

// #ifdef LITE_WITH_TRAIN
// void Run(const std::vector<framework::Tensor>& tensors) {
Expand All @@ -257,6 +277,9 @@ class LITE_API Predictor {
#endif

private:
#ifdef LITE_WITH_XPU
XPURunTimeOption xpu_runtime_option_;
#endif
std::shared_ptr<cpp::ProgramDesc> program_desc_;
std::shared_ptr<Scope> scope_;
Scope* exec_scope_;
Expand Down Expand Up @@ -323,6 +346,9 @@ class CxxPaddleApiImpl : public lite_api::PaddlePredictor {
const std::string& model_dir,
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
bool record_info = false) override;
#ifdef LITE_WITH_XPU
void set_xpu_stream(void* stream);
#endif

private:
std::shared_ptr<Predictor> raw_predictor_;
Expand Down
9 changes: 9 additions & 0 deletions lite/api/cxx_api_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
config_ = config;
mode_ = config.power_mode();
threads_ = config.threads();
#ifdef LITE_WITH_XPU
raw_predictor_->SetXPURunTimeOption(config);
#endif
#ifdef LITE_USE_THREAD_POOL
int thread_num = ThreadPool::Init(threads_);
if (thread_num > 1) {
Expand Down Expand Up @@ -278,6 +281,12 @@ bool CxxPaddleApiImpl::TryShrinkMemory() {
return raw_predictor_->TryShrinkMemory();
}

#ifdef LITE_WITH_XPU
// Thin forwarder: pass the user-supplied XPU stream down to the raw
// predictor, which owns the per-predictor runtime option.
void CxxPaddleApiImpl::set_xpu_stream(void *stream) {
  raw_predictor_->set_xpu_stream(stream);
}
#endif

} // namespace lite

namespace lite_api {
Expand Down
65 changes: 56 additions & 9 deletions lite/api/paddle_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#ifdef LITE_WITH_XPU
#include <functional>
#include <mutex> // NOLINT
#include "lite/backends/xpu/runtime_option.h"
#include "lite/backends/xpu/target_wrapper.h"
#endif

Expand Down Expand Up @@ -478,10 +479,12 @@ void CxxConfig::set_xpu_l3_cache_method(size_t l3_size, bool locked) {
CHECK(lite::TargetWrapperXPU::shared_l3_size >= l3_size)
<< "Enlarge XPU Shared L3 Cache Is Not Allowed.";
}
lite::TargetWrapperXPU::local_l3_size = 0;
reinterpret_cast<lite::XPURunTimeOption *>(set_xpu_runtime_option())
->xpu_local_l3_size = 0;
lite::TargetWrapperXPU::need_l3_mutex = true;
} else {
lite::TargetWrapperXPU::local_l3_size = l3_size;
reinterpret_cast<lite::XPURunTimeOption *>(set_xpu_runtime_option())
->xpu_local_l3_size = l3_size;
lite::TargetWrapperXPU::need_l3_mutex = false;
}
#else
Expand All @@ -493,17 +496,19 @@ void CxxConfig::set_xpu_l3_cache_method(size_t l3_size, bool locked) {

// Enable/disable autotuning of the per-predictor local L3 cache.
// On non-XPU builds the call is a logged no-op.
void CxxConfig::set_xpu_l3_cache_autotune(bool autotune) {
#ifdef LITE_WITH_XPU
  auto *runtime_option =
      reinterpret_cast<lite::XPURunTimeOption *>(set_xpu_runtime_option());
  runtime_option->xpu_local_l3_autotune = autotune;
#else
  LOG(WARNING) << "The invoking of the function "
                  "'set_xpu_l3_cache_autotune' is ignored, please "
                  "rebuild it with LITE_WITH_XPU=ON.";
#endif
}

void set_xpu_gm_workspace_method(size_t gm_size) {
void CxxConfig::set_xpu_gm_workspace_method(size_t gm_size) {
#ifdef LITE_WITH_XPU
lite::TargetWrapperXPU::local_gm_size = gm_size;
reinterpret_cast<lite::XPURunTimeOption *>(set_xpu_runtime_option())
->xpu_local_gm_size = gm_size;
#else
LOG(WARNING) << "The invoking of the function "
"'set_xpu_gm_workspace_method' is ignored, please "
Expand All @@ -513,7 +518,8 @@ void set_xpu_gm_workspace_method(size_t gm_size) {

void CxxConfig::set_xpu_dev_per_thread(int dev_no) {
#ifdef LITE_WITH_XPU
lite::TargetWrapperXPU::SetDev(dev_no);
reinterpret_cast<lite::XPURunTimeOption *>(set_xpu_runtime_option())
->xpu_dev_num = dev_no;
#else
LOG(WARNING) << "The invoking of the function 'set_xpu_dev_per_thread' is "
"ignored, please rebuild it with LITE_WITH_XPU=ON.";
Expand All @@ -522,7 +528,8 @@ void CxxConfig::set_xpu_dev_per_thread(int dev_no) {

void CxxConfig::enable_xpu_multi_stream() {
#ifdef LITE_WITH_XPU
lite::TargetWrapperXPU::enable_xpu_multi_stream();
reinterpret_cast<lite::XPURunTimeOption *>(set_xpu_runtime_option())
->xpu_enable_multi_stream = true;
#else
LOG(WARNING)
<< "The invoking of the function 'enable_xpu_stream_per_thread' is "
Expand Down Expand Up @@ -591,7 +598,8 @@ void CxxConfig::set_xpu_conv_autotune(bool autotune,

void CxxConfig::set_xpu_cluster_num(const int num) {
#ifdef LITE_WITH_XPU
lite::TargetWrapperXPU::cluster_num = num;
reinterpret_cast<lite::XPURunTimeOption *>(set_xpu_runtime_option())
->xpu_cluster_num = num;
#else
LOG(WARNING) << "The invoking of the function "
"'set_xpu_cluster_num' is ignored, please "
Expand All @@ -601,14 +609,37 @@ void CxxConfig::set_xpu_cluster_num(const int num) {

// Record the number of SDNN units to use in the per-predictor XPU
// runtime option. On non-XPU builds the call is a logged no-op.
void CxxConfig::set_xpu_sdnn_num(const int num) {
#ifdef LITE_WITH_XPU
  auto *runtime_option =
      reinterpret_cast<lite::XPURunTimeOption *>(set_xpu_runtime_option());
  runtime_option->xpu_sdnn_num = num;
#else
  LOG(WARNING) << "The invoking of the function "
                  "'set_xpu_sdnn_num' is ignored, please "
                  "rebuild it with LITE_WITH_XPU=ON.";
#endif
}

// Store the directory/path used for dumping tensors in the
// per-predictor XPU runtime option. Logged no-op on non-XPU builds.
// NOTE: the parameter stays pass-by-value to match the header
// declaration; changing it to a reference would break linkage.
void CxxConfig::set_xpu_dump_tensor_path(const std::string dump_tensor_path) {
#ifdef LITE_WITH_XPU
  auto *runtime_option =
      reinterpret_cast<lite::XPURunTimeOption *>(set_xpu_runtime_option());
  runtime_option->xpu_dump_tensor_path = dump_tensor_path;
#else
  LOG(WARNING) << "The invoking of the function "
                  "'set_xpu_dump_tensor_path' is ignored, please "
                  "rebuild it with LITE_WITH_XPU=ON.";
#endif
}

// Store the path used for dumping logs in the per-predictor XPU
// runtime option. Logged no-op on non-XPU builds.
// NOTE: the parameter stays pass-by-value to match the header
// declaration; changing it to a reference would break linkage.
void CxxConfig::set_xpu_dump_log_path(const std::string dump_log_path) {
#ifdef LITE_WITH_XPU
  auto *runtime_option =
      reinterpret_cast<lite::XPURunTimeOption *>(set_xpu_runtime_option());
  runtime_option->xpu_dump_log_path = dump_log_path;
#else
  LOG(WARNING) << "The invoking of the function "
                  "'set_xpu_dump_log_path' is ignored, please "
                  "rebuild it with LITE_WITH_XPU=ON.";
#endif
}

template <class T>
void CxxConfig::set_preferred_inputs_for_warmup(const int group_idx,
const int tensor_idx,
Expand Down Expand Up @@ -665,6 +696,22 @@ _SetPreferredInputsForWarmup(int32_t);
_SetPreferredInputsForWarmup(int64_t);
#undef _SetPreferredInputsForWarmup

// Lazily create the per-config XPU runtime option object and return an
// opaque pointer to it. On non-XPU builds the option has no meaning and
// nullptr is returned.
void *CxxConfig::set_xpu_runtime_option() {
#ifdef LITE_WITH_XPU
  if (runtime_option_ == nullptr) {
    // make_shared: single allocation, exception-safe; converts cleanly
    // into the type-erased std::shared_ptr<void> member.
    runtime_option_ = std::make_shared<lite::XPURunTimeOption>();
  }
  return runtime_option_.get();
#else
  LOG(WARNING) << "'set_xpu_runtime_option' is only for xpu now, please "
                  "rebuild it with LITE_WITH_XPU=ON.";
  // BUGFIX: this branch previously fell off the end of a non-void
  // function (undefined behavior, -Wreturn-type); return nullptr.
  return nullptr;
#endif
}

// set model data in combined format, `set_model_from_file` refers to loading
// model from file, set_model_from_buffer refers to loading model from memory
// buffer
Expand Down
13 changes: 13 additions & 0 deletions lite/api/paddle_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,7 @@ class LITE_API CxxConfig : public ConfigBase {
float sparse_threshold_{0.6f};
std::map<int, std::vector<std::shared_ptr<void>>>
preferred_inputs_for_warmup_;
std::shared_ptr<void> runtime_option_{nullptr};
// The custom configuration file or buffer for the NNAdapter subgraph
// partition, here is an example:
// op_type:in_var_name_0,in_var_name1:out_var_name_0,out_var_name1
Expand Down Expand Up @@ -424,6 +425,13 @@ class LITE_API CxxConfig : public ConfigBase {
// but is_model_from_memory is recommended and `model_from_memory` will be
// abandoned in v3.0.
bool model_from_memory() const { return static_cast<bool>(model_buffer_); }
// Read-only accessor for the opaque XPU runtime option blob. Yields
// nullptr when no option has been configured: shared_ptr::get() on an
// empty pointer is already nullptr, so no explicit branch is needed.
const void* get_xpu_runtime_option() const { return runtime_option_.get(); }

// XPU only, set the size of the workspace memory from L3 cache for the
// current thread.
Expand Down Expand Up @@ -455,6 +463,8 @@ class LITE_API CxxConfig : public ConfigBase {
void set_xpu_sdnn_num(const int num);
void set_xpu_local_quant(bool local_quant = false);
void set_xpu_compute_precision(const std::string& precision = "int16");
void set_xpu_dump_tensor_path(const std::string dump_tensor_path = "");
void set_xpu_dump_log_path(const std::string dump_log_path = "");

// set input tensor for warmup.
// It is optional. If you set prefered_inputs, model wil run immediately when
Expand Down Expand Up @@ -520,6 +530,9 @@ class LITE_API CxxConfig : public ConfigBase {
const {
return mixed_precision_quantization_config_buffer_;
}

private:
void* set_xpu_runtime_option();
};

/// MobileConfig is the config for the light weight predictor, it will skip
Expand Down
Loading

0 comments on commit f2e70ba

Please sign in to comment.