From 582d26a31295645247caf85019e1538f523315c8 Mon Sep 17 00:00:00 2001 From: laiou Date: Fri, 7 Jan 2022 14:30:32 +0800 Subject: [PATCH 1/3] pad3d and memory pass --- lite/api/paddle_use_passes.h | 1 + .../mir/fusion/__xpu__inplace_fuse_pass.cc | 45 ++++ .../mir/fusion/__xpu__inplace_fuse_pass.h | 32 +++ .../mir/fusion/__xpu__inplace_fuser.cc | 53 ++++ .../mir/fusion/__xpu__inplace_fuser.h | 40 +++ .../optimizer/mir/fusion/inplace_fuse_pass.cc | 3 +- .../optimizer/mir/xpu_memory_optimize_pass.cc | 255 +++++++++++++++--- .../optimizer/mir/xpu_memory_optimize_pass.h | 12 +- lite/core/optimizer/optimizer.cc | 1 + .../x86_mobilenetv1_full_demo/CMakeLists.txt | 73 +++++ .../x86_mobilenetv1_light_demo/CMakeLists.txt | 73 +++++ lite/kernels/xpu/CMakeLists.txt | 1 + lite/kernels/xpu/pad3d_compute.cc | 101 +++++++ lite/kernels/xpu/pad3d_compute.h | 37 +++ 14 files changed, 680 insertions(+), 47 deletions(-) create mode 100644 lite/core/optimizer/mir/fusion/__xpu__inplace_fuse_pass.cc create mode 100644 lite/core/optimizer/mir/fusion/__xpu__inplace_fuse_pass.h create mode 100644 lite/core/optimizer/mir/fusion/__xpu__inplace_fuser.cc create mode 100644 lite/core/optimizer/mir/fusion/__xpu__inplace_fuser.h create mode 100644 lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt create mode 100644 lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt create mode 100644 lite/kernels/xpu/pad3d_compute.cc create mode 100644 lite/kernels/xpu/pad3d_compute.h diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h index 4fd1e24d09a..5c7af415737 100644 --- a/lite/api/paddle_use_passes.h +++ b/lite/api/paddle_use_passes.h @@ -65,6 +65,7 @@ USE_MIR_PASS(type_layout_cast_preprocess_pass); USE_MIR_PASS(memory_optimize_pass); USE_MIR_PASS(xpu_memory_optimize_pass); USE_MIR_PASS(lite_inplace_fuse_pass); +USE_MIR_PASS(xpu_inplace_fuse_pass); USE_MIR_PASS(multi_stream_analysis_pass); USE_MIR_PASS(elementwise_mul_constant_eliminate_pass); USE_MIR_PASS(npu_subgraph_pass); 
diff --git a/lite/core/optimizer/mir/fusion/__xpu__inplace_fuse_pass.cc b/lite/core/optimizer/mir/fusion/__xpu__inplace_fuse_pass.cc new file mode 100644 index 00000000000..38212137228 --- /dev/null +++ b/lite/core/optimizer/mir/fusion/__xpu__inplace_fuse_pass.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/optimizer/mir/fusion/__xpu__inplace_fuse_pass.h" +#include +#include +#include "lite/core/optimizer/mir/fusion/__xpu__inplace_fuser.h" +#include "lite/core/optimizer/mir/pass_registry.h" + +namespace paddle { +namespace lite { +namespace mir { + +void XPUInplaceFusePass::Apply(const std::unique_ptr& graph) { + std::vector inplace_type_cases{"reshape", + "reshape2", + "flatten", + "flatten2", + "squeeze", + "squeeze2", + "unsqueeze", + "unsqueeze2"}; + for (auto type : inplace_type_cases) { + fusion::XPUInplaceFuser inplace_fuser(type); + inplace_fuser(graph.get()); + } +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(xpu_inplace_fuse_pass, paddle::lite::mir::XPUInplaceFusePass) + .BindTargets({TARGET(kXPU)}); diff --git a/lite/core/optimizer/mir/fusion/__xpu__inplace_fuse_pass.h b/lite/core/optimizer/mir/fusion/__xpu__inplace_fuse_pass.h new file mode 100644 index 00000000000..5fb421bfbbc --- /dev/null +++ b/lite/core/optimizer/mir/fusion/__xpu__inplace_fuse_pass.h @@ -0,0 +1,32 @@ +// Copyright (c) 2019 
PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "lite/core/optimizer/mir/pass.h" + +namespace paddle { +namespace lite { +namespace mir { + +class XPUInplaceFusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/optimizer/mir/fusion/__xpu__inplace_fuser.cc b/lite/core/optimizer/mir/fusion/__xpu__inplace_fuser.cc new file mode 100644 index 00000000000..d9740213d47 --- /dev/null +++ b/lite/core/optimizer/mir/fusion/__xpu__inplace_fuser.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/core/optimizer/mir/fusion/__xpu__inplace_fuser.h" +#include +#include +#include "lite/core/optimizer/mir/pattern_matcher_high_api.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +void XPUInplaceFuser::BuildPattern() { + auto* input = VarNode("input") + ->assert_is_op_input(type_, "X") + ->assert_only_one_output() + ->AsInput(); + + auto* op_node = OpNode("inplace", type_)->assert_is_op(type_); + + auto* output = + VarNode("output")->assert_is_op_output(type_, "Out")->AsOutput(); + + *input >> *op_node >> *output; +} + +void XPUInplaceFuser::InsertNewNode(SSAGraph* graph, + const key2nodes_t& matched) { + bool inplace = true; + auto* stmt = matched.at("inplace")->stmt(); + auto op = stmt->op(); + cpp::OpDesc* op_desc = op->mutable_op_info(); + op_desc->SetAttr("inplace", inplace); + stmt->op()->Attach(*op_desc, op->scope()); + stmt->op()->AttachKernel(&(stmt->picked_kernel())); +} + +} // namespace fusion +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/optimizer/mir/fusion/__xpu__inplace_fuser.h b/lite/core/optimizer/mir/fusion/__xpu__inplace_fuser.h new file mode 100644 index 00000000000..75d2e44ad37 --- /dev/null +++ b/lite/core/optimizer/mir/fusion/__xpu__inplace_fuser.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include "lite/core/optimizer/mir/pattern_matcher_high_api.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +class XPUInplaceFuser : public FuseBase { + public: + explicit XPUInplaceFuser(const std::string& type) : type_(type) {} + + void BuildPattern() override; + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; + + private: + std::string type_; +}; + +} // namespace fusion +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/optimizer/mir/fusion/inplace_fuse_pass.cc b/lite/core/optimizer/mir/fusion/inplace_fuse_pass.cc index 4fc05bd051e..25354fbee2f 100644 --- a/lite/core/optimizer/mir/fusion/inplace_fuse_pass.cc +++ b/lite/core/optimizer/mir/fusion/inplace_fuse_pass.cc @@ -43,4 +43,5 @@ void InplaceFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_inplace_fuse_pass, paddle::lite::mir::InplaceFusePass) .BindTargets({TARGET(kAny)}) - .ExcludeTargets({TARGET(kNPU)}); + .ExcludeTargets({TARGET(kNPU)}) + .ExcludeTargets({TARGET(kXPU)}); diff --git a/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc b/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc index 46c415fcf82..3e151a9220e 100644 --- a/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc +++ b/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc @@ -30,11 +30,15 @@ typedef struct { int cluster; std::pair lifetime; int life_interval; + int mapping; std::set adj; } XPUMemNode; void XPUMemoryOptimizePass::CollectLifeCycleByDevice( - std::map* lifecycles, SSAGraph* graph) { + std::map* lifecycles, + SSAGraph* graph, + std::map* squeeze_input2output, + std::map* squeeze_output2input) { max_lifecycle_ = 0; auto is_host = [](TargetType x) -> bool { @@ -93,12 +97,11 @@ void XPUMemoryOptimizePass::CollectLifeCycleByDevice( } }; - VLOG(4) << "invalid_op_nodes.size();" << invalid_op_nodes.size(); insert_invalid_op_nodes_for_specific_target(invalid_op_nodes); - VLOG(4) 
<< "invalid_op_nodes.size();" << invalid_op_nodes.size(); // Collect the invalid input and output variables that will not be reused. std::set invalid_var_names; + int inplace_op_num = 0; for (auto& op_node : graph->StmtTopologicalOrder()) { // variables of invalid_op_nodes wil not be reused if (!op_node->IsStmt()) continue; @@ -130,19 +133,19 @@ void XPUMemoryOptimizePass::CollectLifeCycleByDevice( {"unsqueeze", {{"X"}, {"Out"}}}, {"unsqueeze2", {{"X"}, {"Out"}}}}; auto inplace_op_node = inplace_op_nodes.find(op_type); + if (inplace_op_node != inplace_op_nodes.end()) { bool inplace = false; if (op_info->HasAttr("inplace")) { inplace = op_info->GetAttr("inplace"); } if (inplace) { + inplace_op_num++; for (auto& in_param_name : inplace_op_node->second.first) { const auto& in_arg_names = op_info->Input(in_param_name); - invalid_var_names.insert(in_arg_names.begin(), in_arg_names.end()); } for (auto& out_param_name : inplace_op_node->second.second) { const auto& out_arg_names = op_info->Output(out_param_name); - invalid_var_names.insert(out_arg_names.begin(), out_arg_names.end()); } } } @@ -161,12 +164,35 @@ void XPUMemoryOptimizePass::CollectLifeCycleByDevice( if (op_node->AsStmt().op_info()->Type() == "io_copy_once") { continue; } + + std::map, std::set>> + inplace_ops = {{"reshape", {{"X"}, {"Out"}}}, + {"reshape2", {{"X"}, {"Out"}}}, + {"flatten", {{"X"}, {"Out"}}}, + {"flatten2", {{"X"}, {"Out"}}}, + {"squeeze", {{"X"}, {"Out"}}}, + {"squeeze2", {{"X"}, {"Out"}}}, + {"unsqueeze", {{"X"}, {"Out"}}}, + {"unsqueeze2", {{"X"}, {"Out"}}}}; VLOG(4) << op_node->AsStmt().op_info()->Type() << " life is " << max_lifecycle_; std::vector var_nodes(op_node->inlinks.begin(), op_node->inlinks.end()); var_nodes.insert( var_nodes.end(), op_node->outlinks.begin(), op_node->outlinks.end()); + + int count = 0; + + bool is_inplace = false; + + if (op_node->AsStmt().op_info()->HasAttr("inplace")) { + is_inplace = op_node->AsStmt().op_info()->GetAttr("inplace"); + } + + std::string 
input_host_var_name = " "; + std::string input_xpu_var_name = " "; + for (auto* var_node : var_nodes) { CHECK(var_node->IsArg()); auto& arg = var_node->AsArg(); @@ -175,18 +201,59 @@ void XPUMemoryOptimizePass::CollectLifeCycleByDevice( VLOG(4) << "OP VAR NAME IS " << var_name; if (var_name.find("_xpu_max") != std::string::npos) continue; if (invalid_var_names.count(var_name)) continue; - TargetType target_type = arg.type->target(); - if (is_host(target_type)) target_type = TARGET(kHost); - - if (!(*lifecycles)[TargetToStr(target_type)].count(var_name)) { - (*lifecycles)[TargetToStr(target_type)].emplace( - var_name, std::make_pair(max_lifecycle_, max_lifecycle_)); - } else { - int cur_life = - (*lifecycles)[TargetToStr(target_type)][var_name].second; - (*lifecycles)[TargetToStr(target_type)][var_name].second = - (std::max)(max_lifecycle_, cur_life); - } + auto find_inplace_op = + inplace_ops.find(op_node->AsStmt().op_info()->Type()); + + if (find_inplace_op != inplace_ops.end() && count != 2) { + TargetType target_type = arg.type->target(); + if (is_host(target_type)) { + target_type = TARGET(kHost); + continue; + } + + if ((*lifecycles)[TargetToStr(target_type)].count(var_name)) { + if (is_host(target_type)) { + input_host_var_name = var_name; + } else { + input_xpu_var_name = var_name; + count++; + int cur_life = + (*lifecycles)[TargetToStr(target_type)][var_name].second; + (*lifecycles)[TargetToStr(target_type)][var_name].second = + (std::max)(max_lifecycle_, cur_life); + } + } else if (!(*lifecycles)[TargetToStr(target_type)].count(var_name)) { + count++; + if (is_host(target_type)) { + (*lifecycles)[TargetToStr(target_type)].emplace( + var_name, + (*lifecycles)[TargetToStr(target_type)][input_host_var_name]); + } else { + if (is_inplace) { + (*lifecycles)[TargetToStr(target_type)].emplace( + var_name, std::make_pair(max_lifecycle_, max_lifecycle_)); + squeeze_input2output->emplace(input_xpu_var_name, var_name); + squeeze_output2input->emplace(var_name, 
input_xpu_var_name); + } else { + (*lifecycles)[TargetToStr(target_type)].emplace( + var_name, std::make_pair(max_lifecycle_, max_lifecycle_)); + } + } + } + } else if (find_inplace_op == inplace_ops.end()) { + TargetType target_type = arg.type->target(); + if (is_host(target_type)) target_type = TARGET(kHost); + + if (!(*lifecycles)[TargetToStr(target_type)].count(var_name)) { + (*lifecycles)[TargetToStr(target_type)].emplace( + var_name, std::make_pair(max_lifecycle_, max_lifecycle_)); + } else { + int cur_life = + (*lifecycles)[TargetToStr(target_type)][var_name].second; + (*lifecycles)[TargetToStr(target_type)][var_name].second = + (std::max)(max_lifecycle_, cur_life); + } + } // if else } ++max_lifecycle_; } @@ -196,7 +263,9 @@ void XPUMemoryOptimizePass::CollectLifeCycleByDevice( void XPUMemoryOptimizePass::MakeReusePlan( const lifecycle_map_t& lifecycles, - std::map* node2cluster) { + std::map* node2cluster, + std::map* squeeze_input2output, + std::map* squeeze_output2input) { std::vector mem_nodes; std::vector cluster; for (auto& data : lifecycles) { @@ -204,6 +273,7 @@ void XPUMemoryOptimizePass::MakeReusePlan( temp_node.name = data.first; temp_node.cluster = -1; temp_node.lifetime = data.second; + temp_node.mapping = 0; temp_node.life_interval = data.second.second - data.second.first; mem_nodes.push_back(temp_node); } @@ -234,33 +304,129 @@ void XPUMemoryOptimizePass::MakeReusePlan( } } } + VLOG(4) << "Step1 get inplace node Cluster: "; + for (size_t i = 0; i < mem_nodes.size(); i++) { + if (squeeze_input2output->count(mem_nodes[i].name)) { + int cluster_index = cluster.size(); + mem_nodes[i].cluster = cluster_index; + (*node2cluster)[mem_nodes[i].name] = mem_nodes[i].name; + VLOG(4) << "Mapping Tensor Cluster: " << mem_nodes[i].name + << ", life time is " << mem_nodes[i].lifetime.first << " --> " + << mem_nodes[i].lifetime.second << ", cluster name is " + << (*node2cluster)[mem_nodes[i].name]; + std::set cluster_adj = mem_nodes[i].adj; + for (size_t j = 
0; j < mem_nodes.size(); j++) { + if (mem_nodes[j].name == (*squeeze_input2output)[mem_nodes[i].name]) { + (*node2cluster)[mem_nodes[j].name] = mem_nodes[i].name; + mem_nodes[j].cluster = cluster_index; + VLOG(4) << mem_nodes[j].name << ", life time is " + << mem_nodes[j].lifetime.first << " --> " + << mem_nodes[j].lifetime.second << ", cluster name is " + << (*node2cluster)[mem_nodes[j].name]; + for (auto& n : mem_nodes[j].adj) { + cluster_adj.insert(n); + } + } + } + } + } + VLOG(4) << "Step2 merge inplace node Cluster: "; + for (size_t i = 0; i < mem_nodes.size(); i++) { + if (squeeze_input2output->count(mem_nodes[i].name) && + mem_nodes[i].mapping != 1) { + int cluster_index = cluster.size(); + mem_nodes[i].cluster = cluster_index; + (*node2cluster)[mem_nodes[i].name] = mem_nodes[i].name; + mem_nodes[i].mapping = 1; + VLOG(4) << "Mapping Tensor Cluster: " << mem_nodes[i].name + << ", life time is " << mem_nodes[i].lifetime.first << " --> " + << mem_nodes[i].lifetime.second << ", cluster index is " + << mem_nodes[i].cluster << ", cluster name is " + << (*node2cluster)[mem_nodes[i].name]; + cluster.push_back(mem_nodes[i].name); + + std::set<std::string> cluster_adj = mem_nodes[i].adj; + for (size_t j = 0; j < mem_nodes.size(); j++) { + if (mem_nodes[j].name == (*squeeze_input2output)[mem_nodes[i].name]) { + mem_nodes[j].cluster = mem_nodes[i].cluster; + (*node2cluster)[mem_nodes[j].name] = mem_nodes[i].name; + VLOG(4) << mem_nodes[j].name << ", life time is " + << mem_nodes[j].lifetime.first << " --> " + << mem_nodes[j].lifetime.second << ", cluster index is " + << mem_nodes[j].cluster << ", cluster name is " + << (*node2cluster)[mem_nodes[j].name]; + + for (auto& m : mem_nodes[j].adj) { + cluster_adj.insert(m); + } + } else if (squeeze_input2output->count(mem_nodes[j].name) && + (cluster_adj.find(mem_nodes[j].name) == cluster_adj.end()) && + mem_nodes[j].mapping != 1) { + mem_nodes[j].mapping = 1; + mem_nodes[j].cluster = mem_nodes[i].cluster; + 
(*node2cluster)[mem_nodes[j].name] = mem_nodes[i].name; + VLOG(4) << mem_nodes[j].name << ", life time is " + << mem_nodes[j].lifetime.first << " --> " + << mem_nodes[j].lifetime.second << ", cluster index is " + << mem_nodes[j].cluster << ", cluster name is " + << (*node2cluster)[mem_nodes[j].name]; + + for (auto& n : mem_nodes[j].adj) { + cluster_adj.insert(n); + } + for (size_t n = 0; n < mem_nodes.size(); n++) { + if (mem_nodes[n].name == + (*squeeze_input2output)[mem_nodes[j].name]) { + mem_nodes[n].cluster = mem_nodes[i].cluster; + (*node2cluster)[mem_nodes[n].name] = mem_nodes[i].name; + VLOG(4) << mem_nodes[n].name << ", life time is " + << mem_nodes[n].lifetime.first << " --> " + << mem_nodes[n].lifetime.second << ", cluster index is " + << mem_nodes[n].cluster << ", cluster name is " + << (*node2cluster)[mem_nodes[n].name]; - // Generating XPUMemory Reuse Strategy Based on Greedy Way - // The vars can be reused if there is no overlap between them. + for (auto& m : mem_nodes[n].adj) { + cluster_adj.insert(m); + } + } + } + } + } + } + } + VLOG(4) << "Step3 get others node Cluster : "; for (size_t i = 0; i < mem_nodes.size(); i++) { - if (mem_nodes[i].cluster >= 0 || mem_nodes[i].life_interval == 0) continue; - int cluster_index = cluster.size(); - mem_nodes[i].cluster = cluster_index; - (*node2cluster)[mem_nodes[i].name] = mem_nodes[i].name; - VLOG(4) << "Mapping Tensor Cluster: " << mem_nodes[i].name - << ", life time is " << mem_nodes[i].lifetime.first << " --> " - << mem_nodes[i].lifetime.second; - cluster.push_back(mem_nodes[i].name); - std::set cluster_adj = mem_nodes[i].adj; - for (size_t j = i + 1; j < mem_nodes.size(); j++) { - if (mem_nodes[j].cluster < 0 && - (cluster_adj.find(mem_nodes[j].name) == cluster_adj.end())) { - (*node2cluster)[mem_nodes[j].name] = mem_nodes[i].name; - mem_nodes[j].cluster = cluster_index; - VLOG(4) << mem_nodes[j].name << ", life time is " - << mem_nodes[j].lifetime.first << " --> " - << mem_nodes[j].lifetime.second; - 
for (auto& n : mem_nodes[j].adj) { - cluster_adj.insert(n); + if (!(squeeze_input2output->count(mem_nodes[i].name)) && + mem_nodes[i].cluster < 0 && mem_nodes[i].life_interval != 0) { + int cluster_index = cluster.size(); + mem_nodes[i].cluster = cluster_index; + (*node2cluster)[mem_nodes[i].name] = mem_nodes[i].name; + VLOG(4) << "Mapping Tensor Cluster: " << mem_nodes[i].name + << ", life time is " << mem_nodes[i].lifetime.first << " --> " + << mem_nodes[i].lifetime.second << ", cluster index is " + << mem_nodes[i].cluster << ", cluster name is " + << (*node2cluster)[mem_nodes[i].name]; + cluster.push_back(mem_nodes[i].name); + std::set cluster_adj = mem_nodes[i].adj; + for (size_t j = i + 1; j < mem_nodes.size(); j++) { + if (!(squeeze_input2output->count(mem_nodes[j].name)) && + mem_nodes[j].cluster < 0 && + (cluster_adj.find(mem_nodes[j].name) == cluster_adj.end())) { + mem_nodes[j].cluster = mem_nodes[i].cluster; + (*node2cluster)[mem_nodes[j].name] = mem_nodes[i].name; + VLOG(4) << mem_nodes[j].name << ", life time is " + << mem_nodes[j].lifetime.first << " --> " + << mem_nodes[j].lifetime.second << ", cluster index is " + << mem_nodes[j].cluster << ", cluster name is " + << (*node2cluster)[mem_nodes[j].name]; + for (auto& n : mem_nodes[j].adj) { + cluster_adj.insert(n); + } } } } } + for (auto& name : cluster) { LOG(INFO) << "cluster: " << name; } @@ -272,6 +438,7 @@ void XPUMemoryOptimizePass::PerformReusePlan( for (auto& op_node : graph->StmtTopologicalOrder()) { if (!op_node->IsStmt()) continue; auto& stmt = op_node->AsStmt(); + auto* op_info = stmt.mutable_op_info(); std::map> in_args, out_args; // replace the op's input according the reuse table. @@ -354,13 +521,19 @@ void XPUMemoryOptimizePass::Apply(const std::unique_ptr& graph) { // 3. Perform reuse plan: Replace all var's name in the model according to the // mapping table. 
std::map lifecycles; - CollectLifeCycleByDevice(&lifecycles, graph.get()); + std::map squeeze_input2output; + std::map squeeze_output2input; + CollectLifeCycleByDevice( + &lifecycles, graph.get(), &squeeze_input2output, &squeeze_output2input); for (auto& ele : lifecycles) { if (ele.first != "xpu") { continue; } std::map node2cluster; - MakeReusePlan(ele.second, &node2cluster); + MakeReusePlan(ele.second, + &node2cluster, + &squeeze_input2output, + &squeeze_output2input); PerformReusePlan(graph.get(), node2cluster); } } diff --git a/lite/core/optimizer/mir/xpu_memory_optimize_pass.h b/lite/core/optimizer/mir/xpu_memory_optimize_pass.h index f0d920fadf3..053914371d9 100644 --- a/lite/core/optimizer/mir/xpu_memory_optimize_pass.h +++ b/lite/core/optimizer/mir/xpu_memory_optimize_pass.h @@ -31,9 +31,6 @@ namespace paddle { namespace lite { namespace mir { -/* - * XPUMemoryOptimizePass will - */ class XPUMemoryOptimizePass : public ProgramPass { public: using lifecycle_t = std::pair; @@ -42,9 +39,14 @@ class XPUMemoryOptimizePass : public ProgramPass { private: void CollectLifeCycleByDevice( - std::map* lifecycles, SSAGraph*); + std::map* lifecycles, + SSAGraph*, + std::map* squeeze_input2output, + std::map* squeeze_output2input); void MakeReusePlan(const lifecycle_map_t& lifecycles, - std::map* node2cluster); + std::map* node2cluster, + std::map* squeeze_input2output, + std::map* squeeze_output2input); void PerformReusePlan(SSAGraph* graph, const std::map& reuse_table); diff --git a/lite/core/optimizer/optimizer.cc b/lite/core/optimizer/optimizer.cc index f1b393cc2cb..06c3c918e5e 100644 --- a/lite/core/optimizer/optimizer.cc +++ b/lite/core/optimizer/optimizer.cc @@ -264,6 +264,7 @@ std::unique_ptr RunDefaultOptimizer( "runtime_context_assign_pass", "argument_type_display_pass", "lite_inplace_fuse_pass", + "xpu_inplace_fuse_pass", #if !(defined(LITE_WITH_FPGA) || defined(LITE_WITH_PRECISION_PROFILE)) "memory_optimize_pass", "xpu_memory_optimize_pass" diff --git 
a/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt b/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt new file mode 100644 index 00000000000..aaed1b50e02 --- /dev/null +++ b/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt @@ -0,0 +1,73 @@ +cmake_minimum_required(VERSION 2.8) +project(mobilenet_full_api) +set(TARGET mobilenet_full_api) + +# 1. path to Paddle-Lite lib and mklml lib +set(LITE_DIR "${PROJECT_SOURCE_DIR}/../../../cxx") +set(MKLML_DIR "${PROJECT_SOURCE_DIR}/../../../third_party/mklml/") + +if (WIN32) + add_definitions("/DGOOGLE_GLOG_DLL_DECL=") + set(MSVC_STATIC_CRT ) + if(MSVC_STATIC_CRT) + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") + else(MSVC_STATIC_CRT) + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MDd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MD") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MDd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MD") + endif(MSVC_STATIC_CRT) +endif() + +if (APPLE AND METAL) + message(STATUS "set METAL=ON") + add_definitions("-DMETAL") + find_library(METAL_LIBRARY Metal REQUIRED) + find_library(GRAPHIC CoreGraphics REQUIRED) + find_library(MPS_LIBRARY MetalPerformanceShaders REQUIRED) +endif() + +# 2. link mklml and Paddle-Lite directory +link_directories(${LITE_DIR}/lib ${MKLML_DIR}/lib) +include_directories(${LITE_DIR}/include/ ${MKLML_DIR}/include) + +# 3. 
compile options +if (NOT WIN32) + add_definitions(-std=c++11 -g -O3 -pthread) + set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}) +endif() + +# 4.add executable output +add_executable(${TARGET} ${TARGET}.cc) +if (WIN32) + set(WITH_STATIC_MKL OFF) + if(WITH_STATIC_MKL) + set(MATH_LIB ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) + else() + set(MATH_LIB ${MKLML_DIR}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX} + ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() + + target_link_libraries(${TARGET} libpaddle_api_full_bundled.lib) + target_link_libraries(${TARGET} shlwapi.lib) + target_link_libraries(${TARGET} ${MATH_LIB}) + + add_custom_command(TARGET ${TARGET} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/libiomp5md.dll ${CMAKE_BINARY_DIR}/Release + ) + if(NOT WITH_STATIC_MKL) + add_custom_command(TARGET ${TARGET} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/mklml.dll ${CMAKE_BINARY_DIR}/Release + ) + endif() +else() + if (APPLE AND METAL) + target_link_libraries(${TARGET} ${METAL_LIBRARY} ${GRAPHIC} ${MPS_LIBRARY}) + endif() + target_link_libraries(${TARGET} -lpaddle_full_api_shared) + target_link_libraries(${TARGET} -liomp5) + target_link_libraries(${TARGET} -ldl) +endif() diff --git a/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt b/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt new file mode 100644 index 00000000000..50b777e7520 --- /dev/null +++ b/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt @@ -0,0 +1,73 @@ +cmake_minimum_required(VERSION 2.8) +project(mobilenet_light_api) +set(TARGET mobilenet_light_api) + +# 1. 
path to Paddle-Lite lib and mklml lib +set(LITE_DIR "${PROJECT_SOURCE_DIR}/../../../cxx") +set(MKLML_DIR "${PROJECT_SOURCE_DIR}/../../../third_party/mklml/") + +if (WIN32) + add_definitions("/DGOOGLE_GLOG_DLL_DECL=") + set(MSVC_STATIC_CRT ) + if(MSVC_STATIC_CRT) + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") + else(MSVC_STATIC_CRT) + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MDd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MD") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MDd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MD") + endif(MSVC_STATIC_CRT) +endif() + +if (APPLE AND METAL) + message(STATUS "set METAL=ON") + add_definitions("-DMETAL") + find_library(METAL_LIBRARY Metal REQUIRED) + find_library(GRAPHIC CoreGraphics REQUIRED) + find_library(MPS_LIBRARY MetalPerformanceShaders REQUIRED) +endif() + +# 2. link mklml and Paddle-Lite directory +link_directories(${LITE_DIR}/lib ${MKLML_DIR}/lib) +include_directories(${LITE_DIR}/include/ ${MKLML_DIR}/include) + +# 3. 
compile options +if (NOT WIN32) + add_definitions(-std=c++11 -g -O3 -pthread) + set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}) +endif() + +# 4.add executable output +add_executable(${TARGET} ${TARGET}.cc) +if (WIN32) + set(WITH_STATIC_MKL OFF) + if(WITH_STATIC_MKL) + set(MATH_LIB ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) + else() + set(MATH_LIB ${MKLML_DIR}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX} + ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() + + target_link_libraries(${TARGET} libpaddle_api_light_bundled.lib) + target_link_libraries(${TARGET} shlwapi.lib) + target_link_libraries(${TARGET} ${MATH_LIB}) + + add_custom_command(TARGET ${TARGET} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/libiomp5md.dll ${CMAKE_BINARY_DIR}/Release + ) + if(NOT WITH_STATIC_MKL) + add_custom_command(TARGET ${TARGET} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/mklml.dll ${CMAKE_BINARY_DIR}/Release + ) + endif() +else() + if (APPLE AND METAL) + target_link_libraries(${TARGET} ${METAL_LIBRARY} ${GRAPHIC} ${MPS_LIBRARY}) + endif() + target_link_libraries(${TARGET} -lpaddle_light_api_shared) + target_link_libraries(${TARGET} -liomp5) + target_link_libraries(${TARGET} -ldl) +endif() diff --git a/lite/kernels/xpu/CMakeLists.txt b/lite/kernels/xpu/CMakeLists.txt index 3d3720ba7f0..4b2be99d0b6 100644 --- a/lite/kernels/xpu/CMakeLists.txt +++ b/lite/kernels/xpu/CMakeLists.txt @@ -92,6 +92,7 @@ else() add_kernel(anchor_generator_compute_xpu XPU extra SRCS anchor_generator_compute.cc) add_kernel(box_clip_compute_xpu XPU extra SRCS box_clip_compute.cc) add_kernel(pad2d_compute_xpu XPU extra SRCS pad2d_compute.cc) + add_kernel(pad3d_compute_xpu XPU extra SRCS pad3d_compute.cc) add_kernel(pixel_shuffle_compute_xpu XPU extra SRCS pixel_shuffle_compute.cc) add_kernel(correlation_compute_xpu XPU extra SRCS correlation_compute.cc) add_kernel(logical_compute_xpu XPU extra SRCS logical_compute.cc) diff --git 
a/lite/kernels/xpu/pad3d_compute.cc b/lite/kernels/xpu/pad3d_compute.cc new file mode 100644 index 00000000000..8be51425f10 --- /dev/null +++ b/lite/kernels/xpu/pad3d_compute.cc @@ -0,0 +1,101 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/pad3d_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +template +void Pad3dCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + auto pads = param.paddings; + auto mode = param.mode; + auto data_format = param.data_format; + T value = static_cast(param.pad_value); + + auto* x = param.X; + auto in_dims = x->dims(); + auto* in_data = x->template data(); + auto* out = param.Out; + T* out_data = out->template mutable_data(TARGET(kXPU)); + + if (mode == "reflect" || mode == "constant" || mode == "replicate" || + mode == "circular") { + if (data_format == "NCDHW") { + std::vector pad_left = {0, 0, pads[4], pads[2], pads[0]}; + std::vector pad_right = {0, 0, pads[5], pads[3], pads[1]}; + + int n_shape = in_dims[0]; + int c_shape = in_dims[1]; + int d_shape = in_dims[2]; + int h_shape = in_dims[3]; + int w_shape = in_dims[4]; + + std::vector xshape = {n_shape, c_shape, d_shape, h_shape, w_shape}; + + int r = xdnn::pad(ctx.GetRawContext(), + in_data, 
+ out_data, + xshape, + pad_left, + pad_right, + value); + CHECK_EQ(r, 0); + } else if (data_format == "NDHWC") { + std::vector pad_left = {0, pads[4], pads[2], pads[0], 0}; + std::vector pad_right = {0, pads[5], pads[3], pads[1], 0}; + + int n_shape = in_dims[0]; + int d_shape = in_dims[1]; + int h_shape = in_dims[2]; + int w_shape = in_dims[3]; + int c_shape = in_dims[4]; + std::vector xshape = {n_shape, d_shape, h_shape, w_shape, c_shape}; + + int r = xdnn::pad(ctx.GetRawContext(), + in_data, + out_data, + xshape, + pad_left, + pad_right, + value); + CHECK_EQ(r, 0); + } + + } else { + LOG(FATAL) << "xpu unsupport mode: " << mode; + } +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(pad3d, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::Pad3dCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/pad3d_compute.h b/lite/kernels/xpu/pad3d_compute.h new file mode 100644 index 00000000000..bd027a91c6e --- /dev/null +++ b/lite/kernels/xpu/pad3d_compute.h @@ -0,0 +1,37 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +template +class Pad3dCompute : public KernelLite { + public: + using param_t = operators::Pad2dParam; + + virtual void Run(); + + virtual ~Pad3dCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle From 9a5488bc829507534548b68f247e2e72851c2d66 Mon Sep 17 00:00:00 2001 From: laiou Date: Tue, 11 Jan 2022 17:17:34 +0800 Subject: [PATCH 2/3] add pad3d change xpu_memory_pass lite_inplace_pass --- lite/api/paddle_use_passes.h | 1 - .../mir/fusion/__xpu__inplace_fuse_pass.cc | 45 ---------------- .../mir/fusion/__xpu__inplace_fuse_pass.h | 32 ----------- .../mir/fusion/__xpu__inplace_fuser.cc | 53 ------------------- .../mir/fusion/__xpu__inplace_fuser.h | 40 -------------- .../optimizer/mir/fusion/inplace_fuse_pass.cc | 3 +- .../optimizer/mir/fusion/inplace_fuser.cc | 23 +++++--- .../optimizer/mir/xpu_memory_optimize_pass.cc | 42 ++++++++------- .../optimizer/mir/xpu_memory_optimize_pass.h | 13 ++--- lite/core/optimizer/optimizer.cc | 1 - 10 files changed, 46 insertions(+), 207 deletions(-) delete mode 100644 lite/core/optimizer/mir/fusion/__xpu__inplace_fuse_pass.cc delete mode 100644 lite/core/optimizer/mir/fusion/__xpu__inplace_fuse_pass.h delete mode 100644 lite/core/optimizer/mir/fusion/__xpu__inplace_fuser.cc delete mode 100644 lite/core/optimizer/mir/fusion/__xpu__inplace_fuser.h diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h index 5c7af415737..4fd1e24d09a 100644 --- a/lite/api/paddle_use_passes.h +++ b/lite/api/paddle_use_passes.h @@ -65,7 +65,6 @@ USE_MIR_PASS(type_layout_cast_preprocess_pass); USE_MIR_PASS(memory_optimize_pass); USE_MIR_PASS(xpu_memory_optimize_pass); USE_MIR_PASS(lite_inplace_fuse_pass); -USE_MIR_PASS(xpu_inplace_fuse_pass); USE_MIR_PASS(multi_stream_analysis_pass); USE_MIR_PASS(elementwise_mul_constant_eliminate_pass); 
USE_MIR_PASS(npu_subgraph_pass); diff --git a/lite/core/optimizer/mir/fusion/__xpu__inplace_fuse_pass.cc b/lite/core/optimizer/mir/fusion/__xpu__inplace_fuse_pass.cc deleted file mode 100644 index 38212137228..00000000000 --- a/lite/core/optimizer/mir/fusion/__xpu__inplace_fuse_pass.cc +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/optimizer/mir/fusion/__xpu__inplace_fuse_pass.h" -#include -#include -#include "lite/core/optimizer/mir/fusion/__xpu__inplace_fuser.h" -#include "lite/core/optimizer/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -void XPUInplaceFusePass::Apply(const std::unique_ptr& graph) { - std::vector inplace_type_cases{"reshape", - "reshape2", - "flatten", - "flatten2", - "squeeze", - "squeeze2", - "unsqueeze", - "unsqueeze2"}; - for (auto type : inplace_type_cases) { - fusion::XPUInplaceFuser inplace_fuser(type); - inplace_fuser(graph.get()); - } -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(xpu_inplace_fuse_pass, paddle::lite::mir::XPUInplaceFusePass) - .BindTargets({TARGET(kXPU)}); diff --git a/lite/core/optimizer/mir/fusion/__xpu__inplace_fuse_pass.h b/lite/core/optimizer/mir/fusion/__xpu__inplace_fuse_pass.h deleted file mode 100644 index 5fb421bfbbc..00000000000 --- a/lite/core/optimizer/mir/fusion/__xpu__inplace_fuse_pass.h +++ /dev/null 
@@ -1,32 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "lite/core/optimizer/mir/pass.h" - -namespace paddle { -namespace lite { -namespace mir { - -class XPUInplaceFusePass : public ProgramPass { - public: - void Apply(const std::unique_ptr& graph) override; -}; - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/optimizer/mir/fusion/__xpu__inplace_fuser.cc b/lite/core/optimizer/mir/fusion/__xpu__inplace_fuser.cc deleted file mode 100644 index d9740213d47..00000000000 --- a/lite/core/optimizer/mir/fusion/__xpu__inplace_fuser.cc +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/core/optimizer/mir/fusion/__xpu__inplace_fuser.h" -#include -#include -#include "lite/core/optimizer/mir/pattern_matcher_high_api.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -void XPUInplaceFuser::BuildPattern() { - auto* input = VarNode("input") - ->assert_is_op_input(type_, "X") - ->assert_only_one_output() - ->AsInput(); - - auto* op_node = OpNode("inplace", type_)->assert_is_op(type_); - - auto* output = - VarNode("output")->assert_is_op_output(type_, "Out")->AsOutput(); - - *input >> *op_node >> *output; -} - -void XPUInplaceFuser::InsertNewNode(SSAGraph* graph, - const key2nodes_t& matched) { - bool inplace = true; - auto* stmt = matched.at("inplace")->stmt(); - auto op = stmt->op(); - cpp::OpDesc* op_desc = op->mutable_op_info(); - op_desc->SetAttr("inplace", inplace); - stmt->op()->Attach(*op_desc, op->scope()); - stmt->op()->AttachKernel(&(stmt->picked_kernel())); -} - -} // namespace fusion -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/optimizer/mir/fusion/__xpu__inplace_fuser.h b/lite/core/optimizer/mir/fusion/__xpu__inplace_fuser.h deleted file mode 100644 index 75d2e44ad37..00000000000 --- a/lite/core/optimizer/mir/fusion/__xpu__inplace_fuser.h +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include "lite/core/optimizer/mir/pattern_matcher_high_api.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -class XPUInplaceFuser : public FuseBase { - public: - explicit XPUInplaceFuser(const std::string& type) : type_(type) {} - - void BuildPattern() override; - void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; - - private: - std::string type_; -}; - -} // namespace fusion -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/optimizer/mir/fusion/inplace_fuse_pass.cc b/lite/core/optimizer/mir/fusion/inplace_fuse_pass.cc index 25354fbee2f..4fc05bd051e 100644 --- a/lite/core/optimizer/mir/fusion/inplace_fuse_pass.cc +++ b/lite/core/optimizer/mir/fusion/inplace_fuse_pass.cc @@ -43,5 +43,4 @@ void InplaceFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_inplace_fuse_pass, paddle::lite::mir::InplaceFusePass) .BindTargets({TARGET(kAny)}) - .ExcludeTargets({TARGET(kNPU)}) - .ExcludeTargets({TARGET(kXPU)}); + .ExcludeTargets({TARGET(kNPU)}); diff --git a/lite/core/optimizer/mir/fusion/inplace_fuser.cc b/lite/core/optimizer/mir/fusion/inplace_fuser.cc index 89399af8af7..9cef60c09e7 100644 --- a/lite/core/optimizer/mir/fusion/inplace_fuser.cc +++ b/lite/core/optimizer/mir/fusion/inplace_fuser.cc @@ -15,22 +15,31 @@ #include "lite/core/optimizer/mir/fusion/inplace_fuser.h" #include #include +#include "lite/core/optimizer/mir/pattern_matcher_high_api.h" namespace paddle { namespace lite { namespace mir { namespace fusion { -void InplaceFuser::BuildPattern() { OpNode("inplace", type_); } +void InplaceFuser::BuildPattern() { + auto* input = VarNode("input") + ->assert_is_op_input(type_, "X") + ->assert_only_one_output() + ->AsInput(); + + auto* op_node = OpNode("inplace", type_)->assert_is_op(type_); + + auto* output = VarNode("output") + ->assert_is_op_output(type_, "Out") + ->assert_only_one_output() + ->AsOutput(); + + *input 
>> *op_node >> *output; +} void InplaceFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { - auto out_var_nodes = matched.at("inplace")->outlinks; bool inplace = true; - for (auto& out_var_node : out_var_nodes) { - if (out_var_node->outlinks.size() > 1) { - inplace = false; - } - } auto* stmt = matched.at("inplace")->stmt(); auto op = stmt->op(); cpp::OpDesc* op_desc = op->mutable_op_info(); diff --git a/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc b/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc index 3e151a9220e..e93c7fce24b 100644 --- a/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc +++ b/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc @@ -37,8 +37,8 @@ typedef struct { void XPUMemoryOptimizePass::CollectLifeCycleByDevice( std::map* lifecycles, SSAGraph* graph, - std::map* squeeze_input2output, - std::map* squeeze_output2input) { + std::map* inplaceop_input2output, + std::map* inplaceop_output2input) { max_lifecycle_ = 0; auto is_host = [](TargetType x) -> bool { @@ -232,8 +232,8 @@ void XPUMemoryOptimizePass::CollectLifeCycleByDevice( if (is_inplace) { (*lifecycles)[TargetToStr(target_type)].emplace( var_name, std::make_pair(max_lifecycle_, max_lifecycle_)); - squeeze_input2output->emplace(input_xpu_var_name, var_name); - squeeze_output2input->emplace(var_name, input_xpu_var_name); + inplaceop_input2output->emplace(input_xpu_var_name, var_name); + inplaceop_output2input->emplace(var_name, input_xpu_var_name); } else { (*lifecycles)[TargetToStr(target_type)].emplace( var_name, std::make_pair(max_lifecycle_, max_lifecycle_)); @@ -264,8 +264,8 @@ void XPUMemoryOptimizePass::CollectLifeCycleByDevice( void XPUMemoryOptimizePass::MakeReusePlan( const lifecycle_map_t& lifecycles, std::map* node2cluster, - std::map* squeeze_input2output, - std::map* squeeze_output2input) { + std::map* inplaceop_input2output, + std::map* inplaceop_output2input) { std::vector mem_nodes; std::vector cluster; for (auto& data : lifecycles) { @@ -306,7 
+306,7 @@ void XPUMemoryOptimizePass::MakeReusePlan( } VLOG(4) << "Step1 get inplace node Cluster: "; for (size_t i = 0; i < mem_nodes.size(); i++) { - if (squeeze_input2output->count(mem_nodes[i].name)) { + if (inplaceop_input2output->count(mem_nodes[i].name)) { int cluster_index = cluster.size(); mem_nodes[i].cluster = cluster_index; (*node2cluster)[mem_nodes[i].name] = mem_nodes[i].name; @@ -316,7 +316,7 @@ void XPUMemoryOptimizePass::MakeReusePlan( << (*node2cluster)[mem_nodes[i].name]; std::set cluster_adj = mem_nodes[i].adj; for (size_t j = 0; j < mem_nodes.size(); j++) { - if (mem_nodes[j].name == (*squeeze_input2output)[mem_nodes[i].name]) { + if (mem_nodes[j].name == (*inplaceop_input2output)[mem_nodes[i].name]) { (*node2cluster)[mem_nodes[j].name] == mem_nodes[i].name; mem_nodes[j].cluster = cluster_index; VLOG(4) << mem_nodes[j].name << ", life time is " @@ -332,7 +332,7 @@ void XPUMemoryOptimizePass::MakeReusePlan( } VLOG(4) << "Step2 merge inplace node Cluster: "; for (size_t i = 0; i < mem_nodes.size(); i++) { - if (squeeze_input2output->count(mem_nodes[i].name) && + if (inplaceop_input2output->count(mem_nodes[i].name) && mem_nodes[i].mapping != 1) { int cluster_index = cluster.size(); mem_nodes[i].cluster = cluster_index; @@ -347,7 +347,7 @@ void XPUMemoryOptimizePass::MakeReusePlan( std::set cluster_adj = mem_nodes[i].adj; for (size_t j = 0; j < mem_nodes.size(); j++) { - if (mem_nodes[j].name == (*squeeze_input2output)[mem_nodes[i].name]) { + if (mem_nodes[j].name == (*inplaceop_input2output)[mem_nodes[i].name]) { mem_nodes[j].cluster = mem_nodes[i].cluster; (*node2cluster)[mem_nodes[j].name] = mem_nodes[i].name; VLOG(4) << mem_nodes[j].name << ", life time is " @@ -359,7 +359,7 @@ void XPUMemoryOptimizePass::MakeReusePlan( for (auto& m : mem_nodes[j].adj) { cluster_adj.insert(m); } - } else if (squeeze_input2output->count(mem_nodes[j].name) && + } else if (inplaceop_input2output->count(mem_nodes[j].name) && (cluster_adj.find(mem_nodes[j].name) == 
cluster_adj.end()) && mem_nodes[j].mapping != 1) { mem_nodes[j].mapping = 1; @@ -376,7 +376,7 @@ void XPUMemoryOptimizePass::MakeReusePlan( } for (size_t n = 0; n < mem_nodes.size(); n++) { if (mem_nodes[n].name == - (*squeeze_input2output)[mem_nodes[j].name]) { + (*inplaceop_input2output)[mem_nodes[j].name]) { mem_nodes[n].cluster = mem_nodes[i].cluster; (*node2cluster)[mem_nodes[n].name] = mem_nodes[i].name; VLOG(4) << mem_nodes[n].name << ", life time is " @@ -396,7 +396,7 @@ void XPUMemoryOptimizePass::MakeReusePlan( } VLOG(4) << "Step3 get others node Cluster : "; for (size_t i = 0; i < mem_nodes.size(); i++) { - if (!(squeeze_input2output->count(mem_nodes[i].name)) && + if (!(inplaceop_input2output->count(mem_nodes[i].name)) && mem_nodes[i].cluster < 0 && mem_nodes[i].life_interval != 0) { int cluster_index = cluster.size(); mem_nodes[i].cluster = cluster_index; @@ -409,7 +409,7 @@ void XPUMemoryOptimizePass::MakeReusePlan( cluster.push_back(mem_nodes[i].name); std::set cluster_adj = mem_nodes[i].adj; for (size_t j = i + 1; j < mem_nodes.size(); j++) { - if (!(squeeze_input2output->count(mem_nodes[j].name)) && + if (!(inplaceop_input2output->count(mem_nodes[j].name)) && mem_nodes[j].cluster < 0 && (cluster_adj.find(mem_nodes[j].name) == cluster_adj.end())) { mem_nodes[j].cluster = mem_nodes[i].cluster; @@ -521,10 +521,12 @@ void XPUMemoryOptimizePass::Apply(const std::unique_ptr& graph) { // 3. Perform reuse plan: Replace all var's name in the model according to the // mapping table. 
std::map lifecycles; - std::map squeeze_input2output; - std::map squeeze_output2input; - CollectLifeCycleByDevice( - &lifecycles, graph.get(), &squeeze_input2output, &squeeze_output2input); + std::map inplaceop_input2output; + std::map inplaceop_output2input; + CollectLifeCycleByDevice(&lifecycles, + graph.get(), + &inplaceop_input2output, + &inplaceop_output2input); for (auto& ele : lifecycles) { if (ele.first != "xpu") { continue; @@ -532,8 +534,8 @@ void XPUMemoryOptimizePass::Apply(const std::unique_ptr& graph) { std::map node2cluster; MakeReusePlan(ele.second, &node2cluster, - &squeeze_input2output, - &squeeze_output2input); + &inplaceop_input2output, + &inplaceop_output2input); PerformReusePlan(graph.get(), node2cluster); } } diff --git a/lite/core/optimizer/mir/xpu_memory_optimize_pass.h b/lite/core/optimizer/mir/xpu_memory_optimize_pass.h index 053914371d9..d4bbf9e7f9d 100644 --- a/lite/core/optimizer/mir/xpu_memory_optimize_pass.h +++ b/lite/core/optimizer/mir/xpu_memory_optimize_pass.h @@ -41,12 +41,13 @@ class XPUMemoryOptimizePass : public ProgramPass { void CollectLifeCycleByDevice( std::map* lifecycles, SSAGraph*, - std::map* squeeze_input2output, - std::map* squeeze_output2input); - void MakeReusePlan(const lifecycle_map_t& lifecycles, - std::map* node2cluster, - std::map* squeeze_input2output, - std::map* squeeze_output2input); + std::map* inplaceop_input2output, + std::map* inplaceop_output2input); + void MakeReusePlan( + const lifecycle_map_t& lifecycles, + std::map* node2cluster, + std::map* inplaceop_input2output, + std::map* inplaceop_output2input); void PerformReusePlan(SSAGraph* graph, const std::map& reuse_table); diff --git a/lite/core/optimizer/optimizer.cc b/lite/core/optimizer/optimizer.cc index 06c3c918e5e..f1b393cc2cb 100644 --- a/lite/core/optimizer/optimizer.cc +++ b/lite/core/optimizer/optimizer.cc @@ -264,7 +264,6 @@ std::unique_ptr RunDefaultOptimizer( "runtime_context_assign_pass", "argument_type_display_pass", 
"lite_inplace_fuse_pass", - "xpu_inplace_fuse_pass", #if !(defined(LITE_WITH_FPGA) || defined(LITE_WITH_PRECISION_PROFILE)) "memory_optimize_pass", "xpu_memory_optimize_pass" From be34e3285135581bbcc8e8790da7d60713a9ff1c Mon Sep 17 00:00:00 2001 From: laiou Date: Tue, 11 Jan 2022 17:27:09 +0800 Subject: [PATCH 3/3] add pad3d and change lite_inplace_pass xpu_memory_pass --- .../optimizer/mir/xpu_memory_optimize_pass.cc | 31 -------- .../x86_mobilenetv1_full_demo/CMakeLists.txt | 73 ------------------- .../x86_mobilenetv1_light_demo/CMakeLists.txt | 73 ------------------- lite/kernels/xpu/pad3d_compute.cc | 2 +- lite/kernels/xpu/pad3d_compute.h | 2 +- 5 files changed, 2 insertions(+), 179 deletions(-) delete mode 100644 lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt delete mode 100644 lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt diff --git a/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc b/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc index e93c7fce24b..2070fdfa356 100644 --- a/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc +++ b/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc @@ -101,7 +101,6 @@ void XPUMemoryOptimizePass::CollectLifeCycleByDevice( // Collect the invalid input and output variables that will not be reused. 
std::set invalid_var_names; - int inplace_op_num = 0; for (auto& op_node : graph->StmtTopologicalOrder()) { // variables of invalid_op_nodes wil not be reused if (!op_node->IsStmt()) continue; @@ -119,36 +118,6 @@ void XPUMemoryOptimizePass::CollectLifeCycleByDevice( } continue; } - // The specified input and output variables of the Ops whose 'inplace' attr - // is true will not be reused, such as reshape/reshape2's X and Out - // variables - std::map, std::set>> - inplace_op_nodes = {{"reshape", {{"X"}, {"Out"}}}, - {"reshape2", {{"X"}, {"Out"}}}, - {"flatten", {{"X"}, {"Out"}}}, - {"flatten2", {{"X"}, {"Out"}}}, - {"squeeze", {{"X"}, {"Out"}}}, - {"squeeze2", {{"X"}, {"Out"}}}, - {"unsqueeze", {{"X"}, {"Out"}}}, - {"unsqueeze2", {{"X"}, {"Out"}}}}; - auto inplace_op_node = inplace_op_nodes.find(op_type); - - if (inplace_op_node != inplace_op_nodes.end()) { - bool inplace = false; - if (op_info->HasAttr("inplace")) { - inplace = op_info->GetAttr("inplace"); - } - if (inplace) { - inplace_op_num++; - for (auto& in_param_name : inplace_op_node->second.first) { - const auto& in_arg_names = op_info->Input(in_param_name); - } - for (auto& out_param_name : inplace_op_node->second.second) { - const auto& out_arg_names = op_info->Output(out_param_name); - } - } - } } // non-tensor(like tensor_array) variables will not be reused diff --git a/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt b/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt deleted file mode 100644 index aaed1b50e02..00000000000 --- a/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt +++ /dev/null @@ -1,73 +0,0 @@ -cmake_minimum_required(VERSION 2.8) -project(mobilenet_full_api) -set(TARGET mobilenet_full_api) - -# 1. 
path to Paddle-Lite lib and mklml lib -set(LITE_DIR "${PROJECT_SOURCE_DIR}/../../../cxx") -set(MKLML_DIR "${PROJECT_SOURCE_DIR}/../../../third_party/mklml/") - -if (WIN32) - add_definitions("/DGOOGLE_GLOG_DLL_DECL=") - set(MSVC_STATIC_CRT ) - if(MSVC_STATIC_CRT) - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") - else(MSVC_STATIC_CRT) - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MDd") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MD") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MDd") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MD") - endif(MSVC_STATIC_CRT) -endif() - -if (APPLE AND METAL) - message(STATUS "set METAL=ON") - add_definitions("-DMETAL") - find_library(METAL_LIBRARY Metal REQUIRED) - find_library(GRAPHIC CoreGraphics REQUIRED) - find_library(MPS_LIBRARY MetalPerformanceShaders REQUIRED) -endif() - -# 2. link mklml and Paddle-Lite directory -link_directories(${LITE_DIR}/lib ${MKLML_DIR}/lib) -include_directories(${LITE_DIR}/include/ ${MKLML_DIR}/include) - -# 3. 
compile options -if (NOT WIN32) - add_definitions(-std=c++11 -g -O3 -pthread) - set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}) -endif() - -# 4.add executable output -add_executable(${TARGET} ${TARGET}.cc) -if (WIN32) - set(WITH_STATIC_MKL OFF) - if(WITH_STATIC_MKL) - set(MATH_LIB ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) - else() - set(MATH_LIB ${MKLML_DIR}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX} - ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) - endif() - - target_link_libraries(${TARGET} libpaddle_api_full_bundled.lib) - target_link_libraries(${TARGET} shlwapi.lib) - target_link_libraries(${TARGET} ${MATH_LIB}) - - add_custom_command(TARGET ${TARGET} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/libiomp5md.dll ${CMAKE_BINARY_DIR}/Release - ) - if(NOT WITH_STATIC_MKL) - add_custom_command(TARGET ${TARGET} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/mklml.dll ${CMAKE_BINARY_DIR}/Release - ) - endif() -else() - if (APPLE AND METAL) - target_link_libraries(${TARGET} ${METAL_LIBRARY} ${GRAPHIC} ${MPS_LIBRARY}) - endif() - target_link_libraries(${TARGET} -lpaddle_full_api_shared) - target_link_libraries(${TARGET} -liomp5) - target_link_libraries(${TARGET} -ldl) -endif() diff --git a/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt b/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt deleted file mode 100644 index 50b777e7520..00000000000 --- a/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt +++ /dev/null @@ -1,73 +0,0 @@ -cmake_minimum_required(VERSION 2.8) -project(mobilenet_light_api) -set(TARGET mobilenet_light_api) - -# 1. 
path to Paddle-Lite lib and mklml lib -set(LITE_DIR "${PROJECT_SOURCE_DIR}/../../../cxx") -set(MKLML_DIR "${PROJECT_SOURCE_DIR}/../../../third_party/mklml/") - -if (WIN32) - add_definitions("/DGOOGLE_GLOG_DLL_DECL=") - set(MSVC_STATIC_CRT ) - if(MSVC_STATIC_CRT) - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") - else(MSVC_STATIC_CRT) - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MDd") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MD") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MDd") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MD") - endif(MSVC_STATIC_CRT) -endif() - -if (APPLE AND METAL) - message(STATUS "set METAL=ON") - add_definitions("-DMETAL") - find_library(METAL_LIBRARY Metal REQUIRED) - find_library(GRAPHIC CoreGraphics REQUIRED) - find_library(MPS_LIBRARY MetalPerformanceShaders REQUIRED) -endif() - -# 2. link mklml and Paddle-Lite directory -link_directories(${LITE_DIR}/lib ${MKLML_DIR}/lib) -include_directories(${LITE_DIR}/include/ ${MKLML_DIR}/include) - -# 3. 
compile options -if (NOT WIN32) - add_definitions(-std=c++11 -g -O3 -pthread) - set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}) -endif() - -# 4.add executable output -add_executable(${TARGET} ${TARGET}.cc) -if (WIN32) - set(WITH_STATIC_MKL OFF) - if(WITH_STATIC_MKL) - set(MATH_LIB ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) - else() - set(MATH_LIB ${MKLML_DIR}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX} - ${MKLML_DIR}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) - endif() - - target_link_libraries(${TARGET} libpaddle_api_light_bundled.lib) - target_link_libraries(${TARGET} shlwapi.lib) - target_link_libraries(${TARGET} ${MATH_LIB}) - - add_custom_command(TARGET ${TARGET} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/libiomp5md.dll ${CMAKE_BINARY_DIR}/Release - ) - if(NOT WITH_STATIC_MKL) - add_custom_command(TARGET ${TARGET} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_DIR}/lib/mklml.dll ${CMAKE_BINARY_DIR}/Release - ) - endif() -else() - if (APPLE AND METAL) - target_link_libraries(${TARGET} ${METAL_LIBRARY} ${GRAPHIC} ${MPS_LIBRARY}) - endif() - target_link_libraries(${TARGET} -lpaddle_light_api_shared) - target_link_libraries(${TARGET} -liomp5) - target_link_libraries(${TARGET} -ldl) -endif() diff --git a/lite/kernels/xpu/pad3d_compute.cc b/lite/kernels/xpu/pad3d_compute.cc index 8be51425f10..261090faded 100644 --- a/lite/kernels/xpu/pad3d_compute.cc +++ b/lite/kernels/xpu/pad3d_compute.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/lite/kernels/xpu/pad3d_compute.h b/lite/kernels/xpu/pad3d_compute.h index bd027a91c6e..734e01fde5b 100644 --- a/lite/kernels/xpu/pad3d_compute.h +++ b/lite/kernels/xpu/pad3d_compute.h @@ -1,4 +1,4 @@ -// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License.