Skip to content

Commit

Permalink
Merge branch 'develop' of github.com:gouzil/Paddle into ct_init_variables
Browse files Browse the repository at this point in the history

# Conflicts:
#	paddle/fluid/operators/detection/roi_perspective_transform_op.cc
  • Loading branch information
gouzil committed Oct 10, 2023
2 parents af02782 + 7e4b34a commit c9cab46
Show file tree
Hide file tree
Showing 401 changed files with 8,608 additions and 24,521 deletions.
4 changes: 3 additions & 1 deletion cmake/external/brpc.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
# limitations under the License.

include(ExternalProject)
set(OPENSSL_USE_STATIC_LIBS ON)
if(NOT WITH_ARM)
set(OPENSSL_USE_STATIC_LIBS ON)
endif()
find_package(OpenSSL REQUIRED)

message(STATUS "ssl:" ${OPENSSL_SSL_LIBRARY})
Expand Down
2 changes: 1 addition & 1 deletion cmake/generic.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@
# To build a unit test binary, which is an executable binary with libpaddle.so
# automatically linked:
#
# paddle_test(example SHARED)
# paddle_test(example SRCS example_test.cc)
#

# including binary directory for generated headers.
Expand Down
83 changes: 83 additions & 0 deletions paddle/cinn/ast_gen_ius/ast_gen.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "paddle/cinn/ir/operation.h"
#include "paddle/cinn/ir/tensor.h"
#include "paddle/cinn/lang/compute.h"
#include "paddle/cinn/optim/replace_var_with_expr.h"

namespace cinn {
namespace ast_gen_ius {
Expand Down Expand Up @@ -84,11 +85,75 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
tensor_group->MarkShareMemBuffer(tensor, init_tensor);
tensor_group->CtrlDepend(tensor, init_tensor);
Expr init_body = ir::Store::Make(init_tensor, init_value, axis_exprs);
// create schedule block itervars, i0,i1...
std::vector<ir::Var> block_vars;
std::vector<ir::Expr> iter_values;
// reduce body and reduce init schedule block should have different objects
// for same axis so we re-create objects
std::vector<Var> axis_vars = common::GenDefaultAxis(axis_len);
for (int i = 0; i < shape.size(); ++i) {
block_vars.push_back(Var(Expr(0),
shape[i],
cinn::UniqName("i" + std::to_string(i)),
/*is_reduce = */ false));
optim::ReplaceVarWithExpr(&init_body, axis[i], block_vars[i]);
axis_vars[i]->is_reduce_axis = false;
if (shape[i] == Expr(1)) {
iter_values.push_back(Expr(0));
} else {
iter_values.push_back(axis_vars[i]);
}
}
init_body = ir::ScheduleBlockRealize::Make(
iter_values,
ir::ScheduleBlock::Make(
block_vars, {}, {}, reduce_init_name, init_body));

// For the remaining reduce axis, make reduce body
const std::vector<ir::Var>& reduce_axis = tensor->reduce_axis;
ir::Expr reduce_body =
ConvertReduceBody(tensor->body(), tensor, axis_exprs);
// create schedule block itervars, i0,i1...
std::vector<ir::Var> reduce_block_vars;
std::vector<ir::Expr> reduce_iter_values;
// reduce body and reduce init schedule block should have different objects
// for same axis so we re-create objects
std::vector<Var> reduce_axis_vars = common::GenDefaultAxis(axis_len);
for (int i = 0; i < shape.size(); ++i) {
reduce_block_vars.push_back(Var(Expr(0),
shape[i],
cinn::UniqName("i" + std::to_string(i)),
/*is_reduce = */ false));
reduce_axis_vars[i]->is_reduce_axis = false;
if (shape[i] == Expr(1)) {
reduce_iter_values.push_back(Expr(0));
} else {
reduce_iter_values.push_back(axis_vars[i]);
}
}
for (int i = 0; i < reduce_axis.size(); ++i) {
int count = shape.size() + i;
reduce_block_vars.push_back(
Var(reduce_axis[i]->lower_bound,
reduce_axis[i]->upper_bound,
cinn::UniqName("i" + std::to_string(count)),
/*is_reduce = */ true));
ir::Var reduce_axis_var = reduce_axis[i];
reduce_axis_var->is_reduce_axis = true;
reduce_iter_values.push_back(reduce_axis_var);
}
for (int i = 0; i < axis.size(); ++i) {
optim::ReplaceVarWithExpr(&reduce_body, axis[i], reduce_block_vars[i]);
}
for (int i = axis.size(); i < reduce_block_vars.size(); ++i) {
optim::ReplaceVarWithExpr(
&reduce_body, reduce_axis[i - axis.size()], reduce_block_vars[i]);
}

reduce_body = ir::ScheduleBlockRealize::Make(
reduce_iter_values,
ir::ScheduleBlock::Make(
reduce_block_vars, {}, {}, tensor->name, reduce_body));
for (int i = static_cast<int>(reduce_axis.size()) - 1; i >= 0; --i) {
reduce_body = ir::For::Make(reduce_axis[i],
reduce_axis[i]->lower_bound,
Expand All @@ -114,6 +179,24 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
return body;
} else {
ir::Expr body = ir::Store::Make(tensor, tensor->body(), axis_exprs);
// create schedule block itervars, i0,i1...
std::vector<ir::Var> block_vars;
std::vector<ir::Expr> iter_values;
std::vector<Var> axis_vars = common::GenDefaultAxis(axis_len);
for (int i = 0; i < shape.size(); ++i) {
block_vars.push_back(Var(
Expr(0), shape[i], cinn::UniqName("i" + std::to_string(i)), false));
optim::ReplaceVarWithExpr(&body, axis[i], block_vars[i]);
axis_vars[i]->is_reduce_axis = false;
if (shape[i] == Expr(1)) {
iter_values.push_back(Expr(0));
} else {
iter_values.push_back(axis_vars[i]);
}
}
body = ir::ScheduleBlockRealize::Make(
iter_values,
ir::ScheduleBlock::Make(block_vars, {}, {}, tensor->name, body));
for (int i = static_cast<int>(axis_len) - 1; i >= 0; --i) {
ir::Var loop_var = axis[i];
ir::Expr loop_extent = shape[i];
Expand Down
94 changes: 81 additions & 13 deletions paddle/cinn/ast_gen_ius/tensor_group.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,26 +21,37 @@
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/tensor.h"
#include "paddle/cinn/ir/utils/ir_nodes_collector.h"
#include "paddle/cinn/poly/stage.h"

namespace cinn {
namespace ast_gen_ius {

TensorGroup::TensorGroup(const std::vector<ir::Tensor>& tensors) {
std::set<ir::Tensor> all_tensors(tensors.begin(), tensors.end());

for (auto& tensor : tensors) {
for (const ir::Tensor& tensor : tensors) {
output_tensor_names_.insert(tensor->name);
std::set<ir::Expr> used_tensors = ir::ir_utils::CollectIRNodes(
tensor->body(), [](const Expr* x) { return x->as_tensor(); });
for (const Expr& x : used_tensors) {
const ir::Tensor to_dep = x.as_tensor_ref();
all_tensors.insert(to_dep);
this->CtrlDepend(tensor, to_dep);
this->Insert(tensor);
}
}

// Dumps the group's dependency graph to VLOG(6): each tensor's name is
// printed followed by the set of tensor names it control-depends on.
void TensorGroup::ShowLog() const {
  VLOG(6) << "Showing log for TensorGroup";
  for (const auto& [name, tensor] : name_to_tensor_) {
    VLOG(6) << "Tensor name = " << name << " depends on {";
    const auto dep_it = ctrl_dep_.find(name);
    if (dep_it != ctrl_dep_.end()) {
      for (const auto& dep_name : dep_it->second) {
        VLOG(6) << dep_name;
      }
    }
    VLOG(6) << "}";
  }
}

for (const ir::Tensor& t : all_tensors) {
name_to_tensor_.insert({t->name, t});
// Builds a TensorGroup from a name->tensor map. Every tensor in the map is
// registered as an output tensor, and Insert() pulls in the tensors each
// body depends on.
TensorGroup::TensorGroup(
    const std::unordered_map<std::string, ir::Tensor>& tensor_map) {
  for (const auto& [key, tensor] : tensor_map) {
    // Record by the tensor's own name (assumed to match the map key).
    output_tensor_names_.insert(tensor->name);
    Insert(tensor);
  }
}

Expand All @@ -51,7 +62,23 @@ bool TensorGroup::Contain(const std::string& name) const {
}

// Inserts `tensor` into the group and, transitively, every tensor its body
// reads, recording a control dependency from `tensor` to each direct
// dependency.
void TensorGroup::Insert(const ir::Tensor& tensor) {
  // Visited guard: if this tensor was inserted before, its dependencies were
  // already collected then, so skip re-traversal. Without this check the
  // recursion below revisits shared dependencies repeatedly and never
  // terminates if the dependency graph contains a cycle.
  if (name_to_tensor_.count(tensor->name)) {
    return;
  }
  name_to_tensor_.insert({tensor->name, tensor});

  // Using set to de-duplicate the tensors referenced by the body.
  std::set<ir::Tensor> dep_tensors;
  std::set<ir::Expr> used_tensors = ir::ir_utils::CollectIRNodes(
      tensor->body(), [](const Expr* x) { return x->as_tensor(); });
  for (const Expr& x : used_tensors) {
    const ir::Tensor to_dep = x.as_tensor_ref();
    dep_tensors.insert(to_dep);
    this->CtrlDepend(tensor, to_dep);
  }

  // Recursively register each dependency's own dependencies.
  for (const ir::Tensor& t : dep_tensors) {
    this->Insert(t);
  }
}

ir::Tensor TensorGroup::Get(const std::string& name) {
Expand All @@ -72,6 +99,8 @@ std::vector<ir::Tensor> TensorGroup::GetGenFuncTopoOrder(
for (const auto& dep_pair : ctrl_dep_) {
const std::unordered_set<std::string>& dep_tensor_names = dep_pair.second;
in_degree[dep_pair.first] = dep_tensor_names.size();
VLOG(6) << "indegree[" << dep_pair.first
<< "] = " << dep_tensor_names.size();
}

std::vector<ir::Tensor> ret;
Expand All @@ -95,7 +124,6 @@ std::vector<ir::Tensor> TensorGroup::GetGenFuncTopoOrder(
while (!node_set.empty()) {
const std::string cur = *(node_set.begin());
node_set.erase(node_set.begin());

if (!input_arg_names.count(cur)) {
ret.push_back(name_to_tensor_[cur]);
}
Expand Down Expand Up @@ -187,5 +215,45 @@ absl::flat_hash_map<std::string, ir::Tensor> TensorGroup::AllocateBuffers() {
return name_to_tensor_;
}

// Propagates buffer sharing recorded on stages: every stage tensor that has
// no buffer bound yet but is marked to share memory gets bound to each
// share-target that already owns a defined buffer. The target buffer's shape
// is restored after Bind(), which may rewrite it.
void StageMapShareMemory(const poly::StageMap& stages) {
  // Index every stage tensor by name so shares can be resolved across stages.
  absl::flat_hash_map<std::string, ir::_Tensor_*> tensor_map;
  for (auto& stage : stages) {
    tensor_map[stage.second->tensor()->name] = stage.second->tensor();
  }
  for (auto& stage : stages) {
    ir::_Tensor_* cur_tensor = stage.second->tensor();
    const auto& to_share = stage.second->meta.tensors_to_share_buffer_with;
    // Only unbound tensors explicitly marked for sharing are processed.
    if (cur_tensor->buffer.defined() || to_share.empty()) {
      continue;
    }
    for (auto& str : to_share) {
      ir::_Tensor_* src = tensor_map[str];
      if (!src->buffer.defined()) {
        continue;
      }
      // Bind() may edit the shared buffer's shape; keep the original shape.
      auto edited_shape = src->buffer->shape;
      cur_tensor->Bind(src->buffer);
      src->buffer->shape = edited_shape;
      VLOG(3) << "Stage Tensor " << cur_tensor->name << " bind buffer to "
              << src->name << " , " << src->buffer->name;
    }
  }
}

// Converts a poly::StageMap into a TensorGroup built from all stages that
// carry an expression, then propagates stage-level buffer sharing.
TensorGroup ConvertStageMapToTensorGroup(const poly::StageMap& stage_map) {
  std::vector<ir::Tensor> stage_tensors;
  // NOTE(review): collected but not read afterwards — presumably kept for a
  // follow-up change; verify before removing.
  std::set<ir::Tensor> reshape_tensors;
  for (const auto& [tensor_name, stage] : stage_map) {
    if (!stage->has_expression()) {
      continue;
    }
    stage_tensors.push_back(ir::Tensor(stage->tensor()));
    if (utils::Endswith(tensor_name, "_reshape")) {
      reshape_tensors.insert(ir::Tensor(stage->tensor()));
    }
  }

  ast_gen_ius::TensorGroup tensor_group(stage_tensors);
  StageMapShareMemory(stage_map);
  return tensor_group;
}

} // namespace ast_gen_ius
} // namespace cinn
15 changes: 15 additions & 0 deletions paddle/cinn/ast_gen_ius/tensor_group.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/tensor.h"
#include "paddle/cinn/poly/stage.h"

namespace cinn {
namespace ast_gen_ius {
Expand All @@ -41,11 +42,21 @@ class TensorGroup {
*/
explicit TensorGroup(const std::vector<ir::Tensor>& tensors);

/**
* Constructor for a TensorGroup, the argument tensors should be output tensor
* arguments of the AST body to be generated. The dependent tensors of the
* output tensors will be collected during construction.
*/
explicit TensorGroup(
const std::unordered_map<std::string, ir::Tensor>& tensor_map);

/**
* Destructor.
*/
~TensorGroup();

void ShowLog() const;

/**
* Returns true if TensorGroup collection contains a tensor with input name.
*/
Expand Down Expand Up @@ -119,5 +130,9 @@ class TensorGroup {
std::unordered_map<std::string, std::string> share_memory_tensor_;
};

// TODO(zhhsplendid): remove stage_map need to change all fcompute CINNValuePack
// we will change it in the next PR
TensorGroup ConvertStageMapToTensorGroup(const poly::StageMap& stage_map);

} // namespace ast_gen_ius
} // namespace cinn
35 changes: 35 additions & 0 deletions paddle/cinn/auto_schedule/analysis/analyze_ir.cc
Original file line number Diff line number Diff line change
Expand Up @@ -190,5 +190,40 @@ ir::LoweredFunc UpdateFuncWithNewBody(const common::Target& target,
return new_func;
}

// Collects the names of all loop vars that appear in the iter_values bound
// to reduce-axis itervars of a ScheduleBlockRealize.
//
// @param block  an ir::Expr that must hold a ScheduleBlockRealize (checked).
// @return       set of loop-var names feeding reduce-axis block itervars.
std::unordered_set<std::string> GetReduceLoopVarNames(const ir::Expr block) {
  const ir::ScheduleBlockRealize* block_realize =
      block.As<ir::ScheduleBlockRealize>();
  CHECK_NOTNULL(block_realize);
  const ir::ScheduleBlock* block_node =
      block_realize->schedule_block.As<ir::ScheduleBlock>();
  CHECK_NOTNULL(block_node);
  // Bind const references instead of deep-copying the vectors: both are only
  // read below.
  const std::vector<ir::Expr>& iter_values = block_realize->iter_values;
  const std::vector<ir::Var>& iter_vars = block_node->iter_vars;

  std::unordered_set<std::string> reduce_loop_var;
  for (size_t i = 0; i < iter_vars.size(); ++i) {
    if (iter_vars[i]->is_reduce_axis) {
      // The collector is used purely for traversal: the predicate records
      // every Var name it sees and always returns false (collects nothing).
      ir::ir_utils::CollectIRNodesWithoutTensor(
          iter_values[i], [&](const ir::Expr* x) {
            if (x->as_var()) {
              reduce_loop_var.insert(x->as_var_ref()->name);
            }
            return false;
          });
    }
  }
  return reduce_loop_var;
}

// Returns the name of the ScheduleBlock wrapped by a ScheduleBlockRealize.
// `block` must hold a ScheduleBlockRealize (checked via CHECK_NOTNULL).
std::string GetBlockName(const ir::Expr block) {
  const ir::ScheduleBlockRealize* realize =
      block.As<ir::ScheduleBlockRealize>();
  CHECK_NOTNULL(realize);
  const ir::ScheduleBlock* schedule_block =
      realize->schedule_block.As<ir::ScheduleBlock>();
  CHECK_NOTNULL(schedule_block);
  return schedule_block->name;
}

} // namespace auto_schedule
} // namespace cinn
10 changes: 10 additions & 0 deletions paddle/cinn/auto_schedule/analysis/analyze_ir.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,5 +48,15 @@ ir::LoweredFunc UpdateFuncWithNewBody(const common::Target& target,
const ir::LoweredFunc& old_func,
ir::Expr& body); // NOLINT

/**
* Get loop var names of reduce axis
*/
std::unordered_set<std::string> GetReduceLoopVarNames(const ir::Expr block);

/**
* Get name of a ScheduleBlock
*/
std::string GetBlockName(const ir::Expr block);

} // namespace auto_schedule
} // namespace cinn
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ TEST_F(TestCooperativeProcess, Matmul) {
{
i0, i1 = axis.bind(((16 * i) + ((2 * i_0) + i_1)), ((16 * j) + ((8 * j_0) + j_1)))
{
temp_matmul_out__reduce_init[((16 * i) + ((2 * i_0) + i_1)), ((16 * j) + ((8 * j_0) + j_1))] = 0.00000000f
temp_matmul_out__reduce_init[i0, i1] = 0.00000000f
}
}
}
Expand Down Expand Up @@ -181,7 +181,7 @@ TEST_F(TestCooperativeProcess, Matmul) {
{
i0_0, i1_0, i2 = axis.bind(((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1)), ((4 * reduce_k_0) + reduce_k_1))
{
temp_matmul_out[((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1))] = (temp_matmul_out[((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1))] + (X_reshape_shared_temp_buffer[((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((4 * reduce_k_0) + reduce_k_1)] * Y_reshape_shared_temp_buffer[((4 * reduce_k_0) + reduce_k_1), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1))]))
temp_matmul_out[i0_0, i1_0] = (temp_matmul_out[i0_0, i1_0] + (X_reshape_shared_temp_buffer[i0_0, i2] * Y_reshape_shared_temp_buffer[i2, i1_0]))
}
}
}
Expand Down
Loading

0 comments on commit c9cab46

Please sign in to comment.