From dd50d81d4ce7da6ad3d66f9590569a5712e1f082 Mon Sep 17 00:00:00 2001 From: winter-wang <78149749+winter-wang@users.noreply.github.com> Date: Sat, 7 Oct 2023 19:40:30 +0800 Subject: [PATCH 01/62] [PIR] remove unuseful AsOpaquePointer func. (#57832) --- paddle/fluid/pir/dialect/operator/ir/manual_op.cc | 5 +---- paddle/pir/core/interface_support.h | 5 ++--- paddle/pir/core/ir_context.cc | 4 ++-- paddle/pir/core/op_info.h | 5 ++--- paddle/pir/core/type_id.h | 5 ++--- paddle/pir/core/value.cc | 8 ++------ paddle/pir/pattern_rewrite/pattern_match.cc | 10 +++------- test/cpp/pir/core/op_info_test.cc | 2 +- 8 files changed, 15 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc index 66a52f99c6b44..8a4e4cda9f50b 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc +++ b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc @@ -1045,7 +1045,6 @@ void IfOp::Build(pir::Builder &builder, // NOLINT pir::Value cond, std::vector &&output_types) { VLOG(4) << "Start build IfOp"; - argument.AddRegions(2u); argument.AddInput(cond); argument.output_types.swap(output_types); @@ -1086,11 +1085,9 @@ void WhileOp::Build(pir::Builder &builder, // NOLINT pir::OperationArgument &argument, // NOLINT const std::vector &inputs, const std::vector &output_types) { - // auto insert_point = builder.insert_point(); argument.AddInputs(inputs); argument.AddOutputs(output_types); - argument.AddRegion(nullptr); - argument.AddRegion(nullptr); + argument.AddRegions(2u); } pir::Block *WhileOp::cond_block() { pir::Region &cond_region = (*this)->region(0); diff --git a/paddle/pir/core/interface_support.h b/paddle/pir/core/interface_support.h index 3814570eb4f3b..de8e09403765c 100644 --- a/paddle/pir/core/interface_support.h +++ b/paddle/pir/core/interface_support.h @@ -42,8 +42,7 @@ class ConstructInterfacesOrTraits { static void PlacementConstrctInterface( InterfaceValue *&p_interface) { // NOLINT p_interface->swap(InterfaceValue::get()); - VLOG(6) << "New a interface: id[" - << (p_interface->type_id()).AsOpaquePointer() << "]."; + VLOG(6) << "New a interface: id[" << p_interface->type_id() << "]."; ++p_interface; } @@ -51,7 +50,7 @@ class ConstructInterfacesOrTraits { template static void PlacementConstrctTrait(pir::TypeId *&p_trait) { // NOLINT *p_trait = TypeId::get(); - VLOG(6) << "New a trait: id[" << p_trait->AsOpaquePointer() << "]."; + VLOG(6) << "New a trait: id[" << *p_trait << "]."; ++p_trait; } }; diff --git a/paddle/pir/core/ir_context.cc b/paddle/pir/core/ir_context.cc index b7aca14e8f60b..cab574c68d1f6 100644 --- a/paddle/pir/core/ir_context.cc +++ b/paddle/pir/core/ir_context.cc @@ -106,7 +106,7 @@ class IrContextImpl { void RegisterOpInfo(const std::string &name, OpInfo info) { std::lock_guard guard(registed_op_infos_lock_); VLOG(6) << "Register an operation of: [Name=" << name - << ", OpInfo ptr=" << info.AsOpaquePointer() << "]."; + << ", OpInfo ptr=" << info << "]."; registed_op_infos_.emplace(name, info); } @@ -115,7 +115,7 @@ class IrContextImpl { auto iter = registed_op_infos_.find(name); if (iter != registed_op_infos_.end()) { VLOG(8) << "Found a cached OpInfo of: [name=" << name - << ", OpInfo: ptr=" << iter->second.AsOpaquePointer() << "]."; + << ", OpInfo: ptr=" << iter->second << "]."; return iter->second; } VLOG(8) << "No cache found operation of: [Name=" << name << "]."; diff --git a/paddle/pir/core/op_info.h b/paddle/pir/core/op_info.h index 7065a295be082..fcc039a8a51d4 100644 --- 
a/paddle/pir/core/op_info.h +++ b/paddle/pir/core/op_info.h @@ -71,8 +71,7 @@ class IR_API OpInfo { template typename InterfaceT::Concept *GetInterfaceImpl() const; - operator const void *() const { return impl_; } - void *AsOpaquePointer() const { return impl_; } + operator void *() const { return impl_; } static OpInfo RecoverFromOpaquePointer(void *pointer) { return OpInfo(static_cast(pointer)); } @@ -105,7 +104,7 @@ namespace std { template <> struct hash { std::size_t operator()(const pir::OpInfo &obj) const { - return std::hash()(obj); + return std::hash()(obj); } }; } // namespace std diff --git a/paddle/pir/core/type_id.h b/paddle/pir/core/type_id.h index d2511be7fe9a6..53bae06d9d912 100644 --- a/paddle/pir/core/type_id.h +++ b/paddle/pir/core/type_id.h @@ -53,8 +53,7 @@ class TypeId { /// /// \brief Support PointerLikeTypeTraits. /// - operator const void *() const { return storage_; } - void *AsOpaquePointer() const { return storage_; } + operator void *() const { return storage_; } static TypeId RecoverFromOpaquePointer(void *pointer) { return TypeId(static_cast(pointer)); } @@ -146,7 +145,7 @@ namespace std { template <> struct hash { std::size_t operator()(const pir::TypeId &obj) const { - return std::hash()(obj); + return std::hash()(obj); } }; } // namespace std diff --git a/paddle/pir/core/value.cc b/paddle/pir/core/value.cc index a4bd4430507af..4e364e916c6c3 100644 --- a/paddle/pir/core/value.cc +++ b/paddle/pir/core/value.cc @@ -46,10 +46,7 @@ bool Value::operator<(const Value &other) const { Value::operator bool() const { return impl_; } -pir::Type Value::type() const { - CHECK_VALUE_NULL_IMPL(type); - return impl_->type(); -} +pir::Type Value::type() const { return impl_ ? impl_->type() : nullptr; } void Value::set_type(pir::Type type) { CHECK_VALUE_NULL_IMPL(set_type); @@ -66,8 +63,7 @@ Value::UseIterator Value::use_begin() const { return OpOperand(first_use()); } Value::UseIterator Value::use_end() const { return Value::UseIterator(); } OpOperand Value::first_use() const { - CHECK_VALUE_NULL_IMPL(first_use); - return impl_->first_use(); + return impl_ ? 
impl_->first_use() : nullptr; } bool Value::use_empty() const { return !first_use(); } diff --git a/paddle/pir/pattern_rewrite/pattern_match.cc b/paddle/pir/pattern_rewrite/pattern_match.cc index eccaf66cca9ce..028d0779dbf94 100644 --- a/paddle/pir/pattern_rewrite/pattern_match.cc +++ b/paddle/pir/pattern_rewrite/pattern_match.cc @@ -29,7 +29,7 @@ Pattern::Pattern(const std::string& root_name, PatternBenefit benefit, IrContext* context, const std::vector& generated_names) - : Pattern(context->GetRegisteredOpInfo(root_name).AsOpaquePointer(), + : Pattern(context->GetRegisteredOpInfo(root_name), RootKind::OperationInfo, generated_names, benefit, @@ -46,7 +46,7 @@ Pattern::Pattern(MatchInterfaceOpTypeTag tag, PatternBenefit benefit, IrContext* context, const std::vector& generated_names) - : Pattern(interface_id.AsOpaquePointer(), + : Pattern(interface_id, RootKind::InterfaceId, generated_names, benefit, @@ -57,11 +57,7 @@ Pattern::Pattern(MatchTraitOpTypeTag tag, PatternBenefit benefit, IrContext* context, const std::vector& generated_names) - : Pattern(trait_id.AsOpaquePointer(), - RootKind::TraitId, - generated_names, - benefit, - context) {} + : Pattern(trait_id, RootKind::TraitId, generated_names, benefit, context) {} Pattern::Pattern(void* root_val, RootKind root_kind, diff --git a/test/cpp/pir/core/op_info_test.cc b/test/cpp/pir/core/op_info_test.cc index fec5b71396095..0eb365805ebf5 100644 --- a/test/cpp/pir/core/op_info_test.cc +++ b/test/cpp/pir/core/op_info_test.cc @@ -39,7 +39,7 @@ TEST(ir_op_info_test, op_op_info_test) { auto& info_map = context->registered_op_info_map(); EXPECT_FALSE(info_map.empty()); - void* info_1 = op->info().AsOpaquePointer(); + void* info_1 = op->info(); auto info_2 = pir::OpInfo::RecoverFromOpaquePointer(info_1); EXPECT_EQ(op->info(), info_2); pir::Verify(program.module_op()); From 8f1b6d31379a1cec885dbae2a4cce21efaa69e7b Mon Sep 17 00:00:00 2001 From: XiaociZhang Date: Sun, 8 Oct 2023 10:09:39 +0800 Subject: [PATCH 02/62] [xpu] bugfix in paddle dependency packing (#57836) xccl/xdnn/xre do not re-download tar ball in re-building when date/version is changed in cmake. This is due to the extra '-c' option in wget command since the target tarball name always stays the same. 
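For context on PATCH 01 above: dropping AsOpaquePointer() leans on a standard pointer-like-handle idiom — once the handle exposes an implicit conversion to void*, an explicit accessor is redundant, and the std::hash specializations follow suit by hashing through std::hash<void*>. A minimal self-contained sketch of the idiom, using a hypothetical Handle class rather than Paddle's actual OpInfo/TypeId:

    #include <cstddef>
    #include <functional>

    class Handle {
     public:
      explicit Handle(void* impl = nullptr) : impl_(impl) {}
      // Implicit conversion subsumes the old explicit AsOpaquePointer().
      operator void*() const { return impl_; }
      static Handle RecoverFromOpaquePointer(void* p) { return Handle(p); }

     private:
      void* impl_;
    };

    namespace std {
    template <>
    struct hash<Handle> {
      std::size_t operator()(const Handle& h) const {
        // The implicit operator void*() kicks in here.
        return std::hash<void*>()(h);
      }
    };
    }  // namespace std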
--- tools/xpu/pack_paddle_depence.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/xpu/pack_paddle_depence.sh b/tools/xpu/pack_paddle_depence.sh index d683d082051bb..0538bf192695b 100644 --- a/tools/xpu/pack_paddle_depence.sh +++ b/tools/xpu/pack_paddle_depence.sh @@ -26,13 +26,13 @@ XDNN_DIR_NAME=$4 XCCL_URL=$5 XCCL_DIR_NAME=$6 -wget --no-check-certificate ${XRE_URL} -c -q -O xre.tar.gz +wget --no-check-certificate ${XRE_URL} -q -O xre.tar.gz tar xvf xre.tar.gz -wget --no-check-certificate ${XDNN_URL} -c -q -O xdnn.tar.gz +wget --no-check-certificate ${XDNN_URL} -q -O xdnn.tar.gz tar xvf xdnn.tar.gz -wget --no-check-certificate ${XCCL_URL} -c -q -O xccl.tar.gz +wget --no-check-certificate ${XCCL_URL} -q -O xccl.tar.gz tar xvf xccl.tar.gz mkdir -p xpu/include/xpu From ce04c5efeb4738f3d47b3ab508cb423c25c60d70 Mon Sep 17 00:00:00 2001 From: gouzil <66515297+gouzil@users.noreply.github.com> Date: Sun, 8 Oct 2023 15:04:23 +0800 Subject: [PATCH 03/62] [tesst] fix cuda (#57834) --- test/cpp/eager/performance_tests/CMakeLists.txt | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/test/cpp/eager/performance_tests/CMakeLists.txt b/test/cpp/eager/performance_tests/CMakeLists.txt index 1f5a15b3e1ea8..18821be4f630a 100644 --- a/test/cpp/eager/performance_tests/CMakeLists.txt +++ b/test/cpp/eager/performance_tests/CMakeLists.txt @@ -20,10 +20,12 @@ if(NOT (NOT WITH_PYTHON AND ON_INFER)) paddle_test(test_egr_performance_benchmark_fluid_cpu SRCS benchmark_fluid_cpu.cc DEPS performance_benchmark_utils) - paddle_test(test_egr_performance_benchmark_eager_cuda SRCS - benchmark_eager_cuda.cc DEPS performance_benchmark_utils) - paddle_test(test_egr_performance_benchmark_fluid_cuda SRCS - benchmark_fluid_cuda.cc DEPS performance_benchmark_utils) + if(WITH_GPU) + paddle_test(test_egr_performance_benchmark_eager_cuda SRCS + benchmark_eager_cuda.cc DEPS performance_benchmark_utils) + paddle_test(test_egr_performance_benchmark_fluid_cuda SRCS + benchmark_fluid_cuda.cc DEPS performance_benchmark_utils) + endif() if(WITH_ONNXRUNTIME AND WIN32) # Copy onnxruntime for some c++ test in Windows, since the test will From c13609a391b0f47ae2108eb59a0a404745bc92b7 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Sun, 8 Oct 2023 17:15:45 +0800 Subject: [PATCH 04/62] del stride test (#57833) --- test/autograd/CMakeLists.txt | 8 ------ test/dygraph_to_static/CMakeLists.txt | 9 ------ test/legacy_test/CMakeLists.txt | 34 ----------------------- test/white_list/new_ir_op_test_white_list | 4 --- 4 files changed, 55 deletions(-) diff --git a/test/autograd/CMakeLists.txt b/test/autograd/CMakeLists.txt index d4f03dc9c548c..592517cb8e3da 100644 --- a/test/autograd/CMakeLists.txt +++ b/test/autograd/CMakeLists.txt @@ -15,16 +15,8 @@ foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS}) endforeach() -set(STRIED_TESTS test_autograd_dynamic) - -foreach(STRIED_TEST ${STRIED_TESTS}) - py_test_modules(${STRIED_TEST}_with_stride MODULES ${STRIED_TEST} ENVS - FLAGS_use_stride_kernel=true) -endforeach() - set_tests_properties(test_autograd_dynamic PROPERTIES TIMEOUT 100) set_tests_properties(test_autograd_functional_dynamic PROPERTIES TIMEOUT 200) -set_tests_properties(test_autograd_dynamic_with_stride PROPERTIES TIMEOUT 100) set_tests_properties(test_autograd_functional_static PROPERTIES TIMEOUT 160) set_tests_properties(test_minimize PROPERTIES TIMEOUT 60) if(NOT WIN32) diff --git a/test/dygraph_to_static/CMakeLists.txt 
b/test/dygraph_to_static/CMakeLists.txt index d8aca1e3f5671..4231938cf1ee6 100644 --- a/test/dygraph_to_static/CMakeLists.txt +++ b/test/dygraph_to_static/CMakeLists.txt @@ -60,14 +60,6 @@ foreach(TEST_OP ${TEST_OPS}) endif() endforeach() -set(STRIED_TESTS test_bert test_lstm test_ptb_lm_v2 test_slice) - -foreach(STRIED_TEST ${STRIED_TESTS}) - py_test_modules(${STRIED_TEST}_with_stride MODULES ${STRIED_TEST} ENVS - ${GC_ENVS} FLAGS_use_stride_kernel=true) - set_tests_properties(${STRIED_TEST}_with_stride PROPERTIES TIMEOUT 120) -endforeach() - set_tests_properties(test_se_resnet PROPERTIES TIMEOUT 900) set_tests_properties(test_yolov3 PROPERTIES TIMEOUT 900 LABELS "RUN_TYPE=EXCLUSIVE") @@ -75,7 +67,6 @@ set_tests_properties(test_mobile_net PROPERTIES TIMEOUT 120) set_tests_properties(test_seq2seq PROPERTIES TIMEOUT 150) set_tests_properties(test_cycle_gan PROPERTIES TIMEOUT 150) set_tests_properties(test_bert PROPERTIES TIMEOUT 180) -set_tests_properties(test_bert_with_stride PROPERTIES TIMEOUT 120) set_tests_properties(test_basic_api_transformation PROPERTIES TIMEOUT 120) set_tests_properties(test_reinforcement_learning PROPERTIES TIMEOUT 120) set_tests_properties(test_transformer PROPERTIES TIMEOUT 200) diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt index 7aa25386076e5..6c5bb2801a6fc 100644 --- a/test/legacy_test/CMakeLists.txt +++ b/test/legacy_test/CMakeLists.txt @@ -1410,43 +1410,9 @@ foreach(IR_OP_TEST ${NEW_IR_OP_RELAXED_TESTS}) endif() endforeach() -set(STRIED_TESTS - test_complex_getitem - test_complex_grad_accumulated - test_complex_simplenet - test_conv1d_layer - test_conv1d_transpose_layer - test_conv2d_layer - test_diagonal_op - test_imperative_ocr_attention_model - test_imperative_ptb_rnn - test_imperative_ptb_rnn_sorted_gradient - test_initializer - test_inplace - test_real_imag_op - test_reshape_op - test_set_value_op - test_signal - test_slice_op - test_solve_op - test_squeeze_op - test_squeeze2_op - test_unbind_op - test_unsqueeze_op - test_unsqueeze2_op - test_var_base) - -foreach(STRIED_TEST ${STRIED_TESTS}) - py_test_modules(${STRIED_TEST}_with_stride MODULES ${STRIED_TEST} ENVS - FLAGS_use_stride_kernel=true) - set_tests_properties(${STRIED_TEST}_with_stride PROPERTIES TIMEOUT 120) -endforeach() - py_test_modules(test_stride MODULES test_stride ENVS FLAGS_use_stride_kernel=true) -set_tests_properties(test_slice_op_with_stride PROPERTIES TIMEOUT 300) - if((WITH_ROCM OR WITH_GPU) AND NOT WIN32) # These UTs are specially designed for FleetExecutor set_tests_properties( diff --git a/test/white_list/new_ir_op_test_white_list b/test/white_list/new_ir_op_test_white_list index ddd7abe19becd..cb33edca9dfbe 100644 --- a/test/white_list/new_ir_op_test_white_list +++ b/test/white_list/new_ir_op_test_white_list @@ -161,7 +161,6 @@ test_prior_box_op test_psroi_pool_op test_put_along_axis_op test_range -test_real_imag_op_with_stride test_reduce_op test_reduce_op_static_build test_reshape_op @@ -185,7 +184,6 @@ test_solve_op test_spectral_norm_op test_spectral_op test_squared_l2_norm_op -test_squeeze_op_with_stride test_svd_op test_take_along_axis_op test_temporal_shift_op @@ -197,13 +195,11 @@ test_tril_indices_op test_trilinear_interp_v2_op test_triu_indices_op test_trunc_op -test_unbind_op_with_stride test_unfold_op test_unique_consecutive_op test_unpool3d_op test_unpool_op test_unsqueeze2_op -test_unsqueeze_op_with_stride test_update_loss_scaling_op test_update_loss_scaling_op_static_build test_viterbi_decode_op From 
a498e0ba578a2154db3f8b5fb6544cd1e399760d Mon Sep 17 00:00:00 2001 From: Huihuang Zheng Date: Sun, 8 Oct 2023 18:32:36 +0800 Subject: [PATCH 05/62] [CINN] Replace Old Stage Style Lower and Polyhedral ASTGen in GraphCompiler to the New Lower and ASTGen (#57454) Replace old stage style lower and Polyhedral ASTGen in graph_compiler and op_lowering_impl to the new lower and ASTGen. TODO: if this PR successfully run recently, we will remove old style stage & schedule completely and clean code in the next PR. In the next PR, we will rename LowerToAst to Lower, rename LowerToAstVec to LowerVec, and replace the test codes where use them. --- paddle/cinn/ast_gen_ius/ast_gen.cc | 83 ++++++++++++++++ paddle/cinn/ast_gen_ius/tensor_group.cc | 94 ++++++++++++++++--- paddle/cinn/ast_gen_ius/tensor_group.h | 15 +++ .../auto_gen_rule/multi_level_tiling_test.cc | 2 +- paddle/cinn/backends/codegen_c_test.cc | 2 +- paddle/cinn/hlir/framework/graph_compiler.cc | 24 +++-- .../cinn/hlir/framework/op_lowering_impl.cc | 17 ++-- .../cinn/ir/test/schedule_block_graph_test.cc | 4 +- paddle/cinn/lang/lower.cc | 32 +++---- paddle/cinn/lang/lower.h | 2 +- paddle/cinn/lang/lower_tensor_group.cc | 48 +++++++--- paddle/cinn/lang/lower_tensor_group.h | 10 +- paddle/cinn/lang/lower_test.cc | 49 +++++++--- 13 files changed, 298 insertions(+), 84 deletions(-) diff --git a/paddle/cinn/ast_gen_ius/ast_gen.cc b/paddle/cinn/ast_gen_ius/ast_gen.cc index 3d94f1fc8b7a0..c8be20ae3afa6 100644 --- a/paddle/cinn/ast_gen_ius/ast_gen.cc +++ b/paddle/cinn/ast_gen_ius/ast_gen.cc @@ -19,6 +19,7 @@ #include "paddle/cinn/ir/operation.h" #include "paddle/cinn/ir/tensor.h" #include "paddle/cinn/lang/compute.h" +#include "paddle/cinn/optim/replace_var_with_expr.h" namespace cinn { namespace ast_gen_ius { @@ -84,11 +85,75 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) { tensor_group->MarkShareMemBuffer(tensor, init_tensor); tensor_group->CtrlDepend(tensor, init_tensor); Expr init_body = ir::Store::Make(init_tensor, init_value, axis_exprs); + // create schedule block itervars, i0,i1... + std::vector block_vars; + std::vector iter_values; + // reduce body and reduce init schedule block should have different objects + // for same axis so we re-create objects + std::vector axis_vars = common::GenDefaultAxis(axis_len); + for (int i = 0; i < shape.size(); ++i) { + block_vars.push_back(Var(Expr(0), + shape[i], + cinn::UniqName("i" + std::to_string(i)), + /*is_reduce = */ false)); + optim::ReplaceVarWithExpr(&init_body, axis[i], block_vars[i]); + axis_vars[i]->is_reduce_axis = false; + if (shape[i] == Expr(1)) { + iter_values.push_back(Expr(0)); + } else { + iter_values.push_back(axis_vars[i]); + } + } + init_body = ir::ScheduleBlockRealize::Make( + iter_values, + ir::ScheduleBlock::Make( + block_vars, {}, {}, reduce_init_name, init_body)); // For the remaining reduce axis, make reduce body const std::vector& reduce_axis = tensor->reduce_axis; ir::Expr reduce_body = ConvertReduceBody(tensor->body(), tensor, axis_exprs); + // create schedule block itervars, i0,i1... 
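// (Fresh ir::Var objects are created per ScheduleBlock on purpose: the
// reduce-init block above and the reduce block here must not share itervar
// objects for the same axis, since each body is rewritten in terms of its own
// block vars via ReplaceVarWithExpr.)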
+ std::vector reduce_block_vars; + std::vector reduce_iter_values; + // reduce body and reduce init schedule block should have different objects + // for same axis so we re-create objects + std::vector reduce_axis_vars = common::GenDefaultAxis(axis_len); + for (int i = 0; i < shape.size(); ++i) { + reduce_block_vars.push_back(Var(Expr(0), + shape[i], + cinn::UniqName("i" + std::to_string(i)), + /*is_reduce = */ false)); + reduce_axis_vars[i]->is_reduce_axis = false; + if (shape[i] == Expr(1)) { + reduce_iter_values.push_back(Expr(0)); + } else { + reduce_iter_values.push_back(axis_vars[i]); + } + } + for (int i = 0; i < reduce_axis.size(); ++i) { + int count = shape.size() + i; + reduce_block_vars.push_back( + Var(reduce_axis[i]->lower_bound, + reduce_axis[i]->upper_bound, + cinn::UniqName("i" + std::to_string(count)), + /*is_reduce = */ true)); + ir::Var reduce_axis_var = reduce_axis[i]; + reduce_axis_var->is_reduce_axis = true; + reduce_iter_values.push_back(reduce_axis_var); + } + for (int i = 0; i < axis.size(); ++i) { + optim::ReplaceVarWithExpr(&reduce_body, axis[i], reduce_block_vars[i]); + } + for (int i = axis.size(); i < reduce_block_vars.size(); ++i) { + optim::ReplaceVarWithExpr( + &reduce_body, reduce_axis[i - axis.size()], reduce_block_vars[i]); + } + + reduce_body = ir::ScheduleBlockRealize::Make( + reduce_iter_values, + ir::ScheduleBlock::Make( + reduce_block_vars, {}, {}, tensor->name, reduce_body)); for (int i = static_cast(reduce_axis.size()) - 1; i >= 0; --i) { reduce_body = ir::For::Make(reduce_axis[i], reduce_axis[i]->lower_bound, @@ -114,6 +179,24 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) { return body; } else { ir::Expr body = ir::Store::Make(tensor, tensor->body(), axis_exprs); + // create schedule block itervars, i0,i1... 
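// (Same pattern as above for the plain, non-reduce case: wrap the Store in a
// ScheduleBlockRealize with its own block vars, binding extent-1 axes to the
// constant 0 instead of an itervar.)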
+ std::vector block_vars; + std::vector iter_values; + std::vector axis_vars = common::GenDefaultAxis(axis_len); + for (int i = 0; i < shape.size(); ++i) { + block_vars.push_back(Var( + Expr(0), shape[i], cinn::UniqName("i" + std::to_string(i)), false)); + optim::ReplaceVarWithExpr(&body, axis[i], block_vars[i]); + axis_vars[i]->is_reduce_axis = false; + if (shape[i] == Expr(1)) { + iter_values.push_back(Expr(0)); + } else { + iter_values.push_back(axis_vars[i]); + } + } + body = ir::ScheduleBlockRealize::Make( + iter_values, + ir::ScheduleBlock::Make(block_vars, {}, {}, tensor->name, body)); for (int i = static_cast(axis_len) - 1; i >= 0; --i) { ir::Var loop_var = axis[i]; ir::Expr loop_extent = shape[i]; diff --git a/paddle/cinn/ast_gen_ius/tensor_group.cc b/paddle/cinn/ast_gen_ius/tensor_group.cc index e8b9c6a345e72..34e6e5beb0f9d 100644 --- a/paddle/cinn/ast_gen_ius/tensor_group.cc +++ b/paddle/cinn/ast_gen_ius/tensor_group.cc @@ -21,26 +21,37 @@ #include "paddle/cinn/ir/ir_base.h" #include "paddle/cinn/ir/tensor.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" +#include "paddle/cinn/poly/stage.h" namespace cinn { namespace ast_gen_ius { TensorGroup::TensorGroup(const std::vector& tensors) { - std::set all_tensors(tensors.begin(), tensors.end()); - - for (auto& tensor : tensors) { + for (const ir::Tensor& tensor : tensors) { output_tensor_names_.insert(tensor->name); - std::set used_tensors = ir::ir_utils::CollectIRNodes( - tensor->body(), [](const Expr* x) { return x->as_tensor(); }); - for (const Expr& x : used_tensors) { - const ir::Tensor to_dep = x.as_tensor_ref(); - all_tensors.insert(to_dep); - this->CtrlDepend(tensor, to_dep); + this->Insert(tensor); + } +} + +void TensorGroup::ShowLog() const { + VLOG(6) << "Showing log for TensorGroup"; + for (auto& p : name_to_tensor_) { + VLOG(6) << "Tensor name = " << p.first << " depends on {"; + if (ctrl_dep_.count(p.first)) { + for (auto& dep_name : ctrl_dep_.at(p.first)) { + VLOG(6) << dep_name; + } } + VLOG(6) << "}"; } +} - for (const ir::Tensor& t : all_tensors) { - name_to_tensor_.insert({t->name, t}); +TensorGroup::TensorGroup( + const std::unordered_map& tensor_map) { + for (const auto& map_pair : tensor_map) { + const ir::Tensor& tensor = map_pair.second; + output_tensor_names_.insert(tensor->name); + this->Insert(tensor); } } @@ -51,7 +62,23 @@ bool TensorGroup::Contain(const std::string& name) const { } void TensorGroup::Insert(const ir::Tensor& tensor) { - name_to_tensor_.insert({tensor->name, tensor}); + if (!name_to_tensor_.count(tensor->name)) { + name_to_tensor_.insert({tensor->name, tensor}); + } + + // Using set to de-duplicate + std::set dep_tensors; + std::set used_tensors = ir::ir_utils::CollectIRNodes( + tensor->body(), [](const Expr* x) { return x->as_tensor(); }); + for (const Expr& x : used_tensors) { + const ir::Tensor to_dep = x.as_tensor_ref(); + dep_tensors.insert(to_dep); + this->CtrlDepend(tensor, to_dep); + } + + for (const ir::Tensor& t : dep_tensors) { + this->Insert(t); + } } ir::Tensor TensorGroup::Get(const std::string& name) { @@ -72,6 +99,8 @@ std::vector TensorGroup::GetGenFuncTopoOrder( for (const auto& dep_pair : ctrl_dep_) { const std::unordered_set& dep_tensor_names = dep_pair.second; in_degree[dep_pair.first] = dep_tensor_names.size(); + VLOG(6) << "indegree[" << dep_pair.first + << "] = " << dep_tensor_names.size(); } std::vector ret; @@ -95,7 +124,6 @@ std::vector TensorGroup::GetGenFuncTopoOrder( while (!node_set.empty()) { const std::string cur = *(node_set.begin()); 
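// (Kahn-style topological sort over ctrl_dep_: in_degree tracks each tensor's
// remaining dependency count, and node_set holds the names currently free to
// emit.)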
node_set.erase(node_set.begin()); - if (!input_arg_names.count(cur)) { ret.push_back(name_to_tensor_[cur]); } @@ -187,5 +215,45 @@ absl::flat_hash_map TensorGroup::AllocateBuffers() { return name_to_tensor_; } +void StageMapShareMemory(const poly::StageMap& stages) { + absl::flat_hash_map tensor_map; + for (auto& stage : stages) { + tensor_map[stage.second->tensor()->name] = stage.second->tensor(); + } + for (auto& stage : stages) { + if (!stage.second->tensor()->buffer.defined() && + !stage.second->meta.tensors_to_share_buffer_with.empty()) { + for (auto& str : stage.second->meta.tensors_to_share_buffer_with) { + if (tensor_map[str]->buffer.defined()) { + auto edited_shape = tensor_map[str]->buffer->shape; + stage.second->tensor()->Bind(tensor_map[str]->buffer); + tensor_map[str]->buffer->shape = edited_shape; + VLOG(3) << "Stage Tensor " << stage.second->tensor()->name + << " bind buffer to " << tensor_map[str]->name << " , " + << tensor_map[str]->buffer->name; + } + } + } + } +} + +TensorGroup ConvertStageMapToTensorGroup(const poly::StageMap& stage_map) { + std::vector stage_tensors; + std::set reshape_tensors; + for (auto iter = stage_map.begin(); iter != stage_map.end(); ++iter) { + if (iter->second->has_expression()) { + const std::string& tensor_name = iter->first; + stage_tensors.push_back(ir::Tensor(iter->second->tensor())); + if (utils::Endswith(tensor_name, "_reshape")) { + reshape_tensors.insert(ir::Tensor(iter->second->tensor())); + } + } + } + + ast_gen_ius::TensorGroup tensor_group(stage_tensors); + StageMapShareMemory(stage_map); + return tensor_group; +} + } // namespace ast_gen_ius } // namespace cinn diff --git a/paddle/cinn/ast_gen_ius/tensor_group.h b/paddle/cinn/ast_gen_ius/tensor_group.h index c6e12690e9dcc..d981b0f674f09 100644 --- a/paddle/cinn/ast_gen_ius/tensor_group.h +++ b/paddle/cinn/ast_gen_ius/tensor_group.h @@ -24,6 +24,7 @@ #include "paddle/cinn/ir/ir.h" #include "paddle/cinn/ir/ir_base.h" #include "paddle/cinn/ir/tensor.h" +#include "paddle/cinn/poly/stage.h" namespace cinn { namespace ast_gen_ius { @@ -41,11 +42,21 @@ class TensorGroup { */ explicit TensorGroup(const std::vector& tensors); + /** + * Constructor for a TensorGroup, the argument tensors should be output tensor + * arguments of the AST body to be generated. The dependent tensors of the + * output tensors will be collected during construction. + */ + explicit TensorGroup( + const std::unordered_map& tensor_map); + /** * Destructor. */ ~TensorGroup(); + void ShowLog() const; + /** * Returns true if TensorGroup collection contains a tensor with input name. 
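   * (A tensor becomes known to the group once Insert() has recorded it; Insert
   * also pulls in every tensor referenced by the body, as tensor_group.cc
   * above shows.)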
*/ @@ -119,5 +130,9 @@ class TensorGroup { std::unordered_map share_memory_tensor_; }; +// TODO(zhhsplendid): remove stage_map need to change all fcompute CINNValuePack +// we will change it in the next PR +TensorGroup ConvertStageMapToTensorGroup(const poly::StageMap& stage_map); + } // namespace ast_gen_ius } // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling_test.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling_test.cc index a1be2399ce6e9..fa7206bdae7dd 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling_test.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling_test.cc @@ -308,7 +308,7 @@ TEST_F(TestMultiLevelTiling, Matmul) { ScheduleBlock(temp_matmul_out_local_temp_buffer) { i0_0, i1_0, i2 = axis.bind(((8 * i_0_j_0_fused) + ((8 * i_1) + ((8 * i_2) + ((8 * i_j_fused) + i_3)))), ((32 * j_1) + ((32 * j_2) + j_3)), ((8 * reduce_k_0) + ((8 * reduce_k_1) + reduce_k_2))) - read_buffers(_temp_matmul_out[i(undefined:undefined), j(undefined:undefined)], _X[i(undefined:undefined), reduce_k(undefined:undefined)], _Y[reduce_k(undefined:undefined), j(undefined:undefined)]) + read_buffers(_temp_matmul_out[i(undefined:undefined), j(undefined:undefined)], _X[i(undefined:undefined), reduce_k(0:32)], _Y[reduce_k(0:32), j(undefined:undefined)]) write_buffers(_temp_matmul_out[i(undefined:undefined), j(undefined:undefined)]) { temp_matmul_out_local_temp_buffer[((8 * i_0_j_0_fused) + ((8 * i_1) + ((8 * i_2) + ((8 * i_j_fused) + i_3)))), ((32 * j_1) + ((32 * j_2) + j_3))] = (temp_matmul_out_local_temp_buffer[((8 * i_0_j_0_fused) + ((8 * i_1) + ((8 * i_2) + ((8 * i_j_fused) + i_3)))), ((32 * j_1) + ((32 * j_2) + j_3))] + (X_reshape_shared_temp_buffer[((8 * i_0_j_0_fused) + ((8 * i_1) + ((8 * i_2) + ((8 * i_j_fused) + i_3)))), ((8 * reduce_k_0) + ((8 * reduce_k_1) + reduce_k_2))] * Y_reshape_shared_temp_buffer[((8 * reduce_k_0) + ((8 * reduce_k_1) + reduce_k_2)), ((32 * j_1) + ((32 * j_2) + j_3))])) diff --git a/paddle/cinn/backends/codegen_c_test.cc b/paddle/cinn/backends/codegen_c_test.cc index 8db31b6c6007f..caf4950cdfe8c 100644 --- a/paddle/cinn/backends/codegen_c_test.cc +++ b/paddle/cinn/backends/codegen_c_test.cc @@ -69,7 +69,7 @@ TEST(CodeGenC, module) { ast_gen_ius::TensorGroup tensor_group({A, B, C}); auto func = lang::LowerToAst("add1", {A, B, C}, &tensor_group); - LOG(INFO) << "Huihuang debug: " << func << std::endl; + LOG(INFO) << "Func to codegen: " << func << std::endl; builder.AddFunction(func); diff --git a/paddle/cinn/hlir/framework/graph_compiler.cc b/paddle/cinn/hlir/framework/graph_compiler.cc index 2720388e7e22c..acd4387efb712 100644 --- a/paddle/cinn/hlir/framework/graph_compiler.cc +++ b/paddle/cinn/hlir/framework/graph_compiler.cc @@ -32,6 +32,8 @@ #include "paddle/cinn/utils/enum_string.h" #include "paddle/cinn/utils/profiler.h" +#include "paddle/cinn/ast_gen_ius/tensor_group.h" + namespace cinn { namespace hlir { namespace framework { @@ -372,14 +374,17 @@ std::vector GetFuncFromImpl( poly::StageMap stages = C.back(); std::string func_name_prefix = "fn_"; - auto funcs = lang::LowerVec(func_name_prefix + node_id, - stages, - all_arg_tensors, - {}, - {}, - nullptr, - target, - true); + + ast_gen_ius::TensorGroup tensor_group = + ast_gen_ius::ConvertStageMapToTensorGroup(stages); + auto funcs = lang::LowerToAstVec( + func_name_prefix + node_id, all_arg_tensors, &tensor_group, target); + + VLOG(4) << "Lower op: " << node_id << ", get " << 
funcs.size() + << " LoweredFunc:\n"; + for (auto fun : funcs) { + VLOG(4) << fun; + } std::vector schedule_inputs; for (int i = 0; i < C.size() - 1; ++i) { @@ -426,7 +431,8 @@ std::vector GetFuncFromImpl( optim::OptimizeExprGPU(&(funcs_after_schedule[i]->body)); #endif auto temp_buffers = lang::GetTempBuffers( - all_arg_tensors, stages, funcs_after_schedule[i]->body); + all_arg_tensors, tensor_group, funcs_after_schedule[i]->body); + funcs_after_schedule[i]->temp_bufs = temp_buffers; funcs_after_schedule[i] = ir::_LoweredFunc_::Make(funcs_after_schedule[i]->name, diff --git a/paddle/cinn/hlir/framework/op_lowering_impl.cc b/paddle/cinn/hlir/framework/op_lowering_impl.cc index 156ad756a50af..ad5a903bedadc 100644 --- a/paddle/cinn/hlir/framework/op_lowering_impl.cc +++ b/paddle/cinn/hlir/framework/op_lowering_impl.cc @@ -14,6 +14,7 @@ #include "paddle/cinn/hlir/framework/op_lowering_impl.h" +#include "paddle/cinn/ast_gen_ius/tensor_group.h" #include "paddle/cinn/hlir/framework/compile_error.h" #include "paddle/cinn/hlir/framework/graph_compiler_util.h" #include "paddle/cinn/hlir/framework/op_lowering_util.h" @@ -391,16 +392,16 @@ std::vector OpLowererImpl::DoOpLower( } // 2.Do lower - std::vector funcs = lang::LowerVec("fn_" + node->id(), - tmp_stages, - *op_func_arg_tensors, - {}, - {}, - nullptr, - this->target_, - true); + ast_gen_ius::TensorGroup tensor_group = + ast_gen_ius::ConvertStageMapToTensorGroup(tmp_stages); + std::vector funcs = lang::LowerToAstVec( + "fn_" + node->id(), *op_func_arg_tensors, {&tensor_group}, this->target_); + VLOG(4) << "Lower op: " << node->op()->name << ", get " << funcs.size() << " LoweredFunc:\n"; + for (auto fun : funcs) { + VLOG(4) << fun; + } op_func_arg_tensors->clear(); for (int idx = 0; idx < pack.size() - 1; ++idx) { diff --git a/paddle/cinn/ir/test/schedule_block_graph_test.cc b/paddle/cinn/ir/test/schedule_block_graph_test.cc index 80c39f493be41..20c7f03b4d235 100644 --- a/paddle/cinn/ir/test/schedule_block_graph_test.cc +++ b/paddle/cinn/ir/test/schedule_block_graph_test.cc @@ -97,8 +97,8 @@ frontend::Program CreateReduceProgram() { TEST(ScheduleBlockGraph, elementwise) { frontend::Program program = CreateElementwiseProgram(); IRSchedule ir_sch = MakeIRSchedule(&program); - ScheduleBlockGraph sbg(ir_sch); LOG(INFO) << GetIR(ir_sch); + ScheduleBlockGraph sbg(ir_sch); LOG(INFO) << sbg.Visualize(); CHECK_EQ(sbg.BlockIdsInOrder().size(), 6); CHECK_EQ(sbg.nodes().size(), 6); @@ -138,8 +138,8 @@ TEST(ScheduleBlockGraph, elementwise) { TEST(ScheduleBlockGraph, reduce) { frontend::Program program = CreateReduceProgram(); IRSchedule ir_sch = MakeIRSchedule(&program); - ScheduleBlockGraph sbg(ir_sch); LOG(INFO) << GetIR(ir_sch); + ScheduleBlockGraph sbg(ir_sch); LOG(INFO) << sbg.Visualize(); CHECK_EQ(sbg.BlockIdsInOrder().size(), 8); CHECK_EQ(sbg.nodes().size(), 8); diff --git a/paddle/cinn/lang/lower.cc b/paddle/cinn/lang/lower.cc index 92812e65f412d..c509a1977555f 100644 --- a/paddle/cinn/lang/lower.cc +++ b/paddle/cinn/lang/lower.cc @@ -104,7 +104,7 @@ std::vector GetTempBuffers(const std::vector& tensor_args, auto all_temp_tensors = ir::ir_utils::CollectIRNodesWithoutTensor(body, [&](const Expr* x) { return x->as_tensor() && x->as_tensor()->buffer.defined() && - (!tensor_group.Contain(x->as_tensor()->name) && + (!tensor_group.Contain(x->as_tensor()->name) || ((!buffer_arg_names.count(x->as_tensor()->buffer->name) && !tensor_arg_names.count(x->as_tensor()->name)) || utils::Endswith(x->as_tensor()->buffer->name, "temp_buffer"))); @@ -284,15 +284,25 @@ 
ir::LoweredFunc LowerToAst(const std::string& name, const std::vector& tensor_args, ast_gen_ius::TensorGroup* tensor_group, const Target& target) { - // Merge the ctrl_deps with the given temp_tensors ang get a new temp_tensors + std::vector result = + LowerToAstVec(name, tensor_args, tensor_group, target); + CHECK_EQ(result.size(), 1UL) << "LowerToAst contains not only 1 LoweredFunc, " + "use LowerToAstVec instead."; + return result[0]; +} + +std::vector LowerToAstVec( + const std::string& name, + const std::vector& tensor_args, + ast_gen_ius::TensorGroup* tensor_group, + const Target& target) { std::set ctrl_deps = CollectTempTensorsFromCtrlDepends(tensor_group, tensor_args); - std::vector group_vec = {tensor_group}; auto lower_instance = detail::LowerTensorGroup( name, tensor_args, {}, - group_vec, + tensor_group, std::vector(ctrl_deps.begin(), ctrl_deps.end()), target); std::vector result = lower_instance(); @@ -301,19 +311,7 @@ ir::LoweredFunc LowerToAst(const std::string& name, res->device_api = ir::DeviceAPI::GPU; } } - return result[0]; -} - -std::vector LowerToAstVec( - const std::string& name, - const std::vector& tensor_args, - std::vector tensor_groups, - const Target& target) { - std::vector ret; - for (ast_gen_ius::TensorGroup* tg : tensor_groups) { - ret.push_back(LowerToAst(name, tensor_args, tg, target)); - } - return ret; + return result; } ir::LoweredFunc Lower(const std::string& name, diff --git a/paddle/cinn/lang/lower.h b/paddle/cinn/lang/lower.h index c80d4bc769cdf..b3f27129778b9 100644 --- a/paddle/cinn/lang/lower.h +++ b/paddle/cinn/lang/lower.h @@ -82,7 +82,7 @@ ir::LoweredFunc LowerToAst(const std::string &name, std::vector LowerToAstVec( const std::string &name, const std::vector &tensor_args, - std::vector tensor_groups, + ast_gen_ius::TensorGroup *tensor_group, const Target &target = common::DefaultHostTarget()); std::vector GetTempBuffers( diff --git a/paddle/cinn/lang/lower_tensor_group.cc b/paddle/cinn/lang/lower_tensor_group.cc index 6bbe101791362..f59ac4ceff52f 100644 --- a/paddle/cinn/lang/lower_tensor_group.cc +++ b/paddle/cinn/lang/lower_tensor_group.cc @@ -41,24 +41,29 @@ LowerTensorGroup::LowerTensorGroup( const std::string& fn_name, const std::vector& tensor_args, const std::vector& scalar_args, - const std::vector& tensor_groups, + ast_gen_ius::TensorGroup* tensor_group, const std::vector& temp_tensor_args, const Target& target) : fn_name_(fn_name), tensor_args_(tensor_args), scalar_args_(scalar_args), - tensor_groups_(tensor_groups), + tensor_group_(tensor_group), temp_tensor_args_(temp_tensor_args), target_(target) {} std::vector LowerTensorGroup::operator()() { std::vector result; int num_func = 0; - for (ast_gen_ius::TensorGroup* tensor_group : tensor_groups_) { - // 1. Generate function body - ir::Expr func_body = GenerateFunctionBody(tensor_group); + + // 1. Generate function body + std::vector func_bodies = GenerateFunctionBody(tensor_group_); + for (ir::Expr& func_body : func_bodies) { + func_body = ir::ScheduleBlockRealize::Make( + {}, + ir::ScheduleBlock::Make( + {}, {}, {}, common::UniqName("root"), func_body)); // 2. Assign buffer to tensors - auto tensor_map = tensor_group->AllocateBuffers(); + auto tensor_map = tensor_group_->AllocateBuffers(); // copy the tensor(with buffer assigned) back to func's args. 
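// (The buffers chosen by AllocateBuffers() must be visible on the argument
// tensors as well, so that the generated function signature references the
// same buffers as its body.)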
for (auto& arg : tensor_args_) { if (arg->is_placeholder_node() || arg->buffer.defined()) { @@ -195,21 +200,36 @@ std::vector LowerTensorGroup::GenerateFunctionArgumentList( return args; } -ir::Expr LowerTensorGroup::GenerateFunctionBody( +std::vector LowerTensorGroup::GenerateFunctionBody( ast_gen_ius::TensorGroup* tensor_group) { - std::vector ordered_tensors = - tensor_group->GetGenFuncTopoOrder(tensor_args_); + // TODO(zhhsplendid): GetGenFuncTopoOrder() may remove args + std::vector ordered_tensors = tensor_group->GetGenFuncTopoOrder(); + + std::vector result; std::vector bodies; for (const ir::Tensor& tensor : ordered_tensors) { - if (!tensor->is_placeholder_node()) { + VLOG(6) << "tensor_name = " << tensor->name; + if (!tensor->is_placeholder_node() && tensor->has_expression()) { + VLOG(6) << "ast_gen_ius::AstGen::Build for Tensor " << tensor; bodies.emplace_back(ast_gen_ius::AstGen::Build(tensor, tensor_group)); + + bool gpu_local = + tensor->buffer.defined() && + (tensor->buffer->memory_type == ir::MemoryType::GPUShared || + tensor->buffer->memory_type == ir::MemoryType::GPULocal); + if (target_ == common::DefaultNVGPUTarget() && !gpu_local) { + result.push_back(bodies.size() == 1 ? bodies[0] + : ir::Block::Make(bodies)); + bodies.clear(); + } } } - if (bodies.size() == 1) { - return bodies[0]; - } - return ir::Block::Make(bodies); + if (!bodies.empty()) { + result.push_back(bodies.size() == 1 ? bodies[0] : ir::Block::Make(bodies)); + bodies.clear(); + } + return result; } } // namespace detail diff --git a/paddle/cinn/lang/lower_tensor_group.h b/paddle/cinn/lang/lower_tensor_group.h index c66dc014d0f9a..358e2d9ec953d 100644 --- a/paddle/cinn/lang/lower_tensor_group.h +++ b/paddle/cinn/lang/lower_tensor_group.h @@ -47,13 +47,14 @@ class LowerTensorGroup { LowerTensorGroup(const std::string& fn_name, const std::vector& tensor_args, const std::vector& scalar_args, - const std::vector& tensor_groups, + ast_gen_ius::TensorGroup* tensor_group, const std::vector& temp_tensor_args = {}, const Target& target = common::DefaultHostTarget()); std::vector operator()(); - ir::Expr GenerateFunctionBody(ast_gen_ius::TensorGroup* tensor_group); + std::vector GenerateFunctionBody( + ast_gen_ius::TensorGroup* tensor_group); std::vector GenerateFunctionArgumentList(ir::Expr fn_body); @@ -62,11 +63,8 @@ class LowerTensorGroup { const std::vector& tensor_args_; const std::vector& scalar_args_; std::vector temp_tensor_args_; - std::vector tensor_groups_; + ast_gen_ius::TensorGroup* tensor_group_; Target target_; - - //! CUDA axis info for this function. 
- std::vector cuda_axis_info_; }; } // namespace detail diff --git a/paddle/cinn/lang/lower_test.cc b/paddle/cinn/lang/lower_test.cc index e97d0f596a7ea..452b9e7afb772 100644 --- a/paddle/cinn/lang/lower_test.cc +++ b/paddle/cinn/lang/lower_test.cc @@ -177,11 +177,18 @@ TEST(lower_to_ast, basic) { auto out = R"ROC( function cal_B (_A, _B) { - serial for (i, 0, 100) + ScheduleBlock(root) { - serial for (j, 0, 15) + serial for (i, 0, 100) { - B[i, j] = (A[i, j] + 1.00000000f) + serial for (j, 0, 15) + { + ScheduleBlock(B) + { + i0, i1 = axis.bind(i, j) + B[i0, i1] = (A[i0, i1] + 1.00000000f) + } + } } } } @@ -212,13 +219,20 @@ TEST(lower_to_ast, three_dim) { auto out = R"ROC( function cal_C (_A, _B, _C) { - serial for (i, 0, 100) + ScheduleBlock(root) { - serial for (j, 0, 15) + serial for (i, 0, 100) { - serial for (k, 0, 200) + serial for (j, 0, 15) { - C[i, j, k] = (A[i, j] * B[j, k]) + serial for (k, 0, 200) + { + ScheduleBlock(C) + { + i0, i1, i2 = axis.bind(i, j, k) + C[i0, i1, i2] = (A[i0, i1] * B[i1, i2]) + } + } } } } @@ -247,14 +261,25 @@ TEST(lower_to_ast, matmul_with_reduce_sum) { auto out = R"ROC( function matmul (_A, _B, _C) { - serial for (i, 0, 100) + ScheduleBlock(root) { - serial for (j, 0, 50) + serial for (i, 0, 100) { - C__reduce_init[i, j] = 0.00000000f - serial for (k0, 0, 20) + serial for (j, 0, 50) { - C[i, j] = (C[i, j] + (A[i, k0] * B[k0, j])) + ScheduleBlock(C__reduce_init) + { + i0, i1 = axis.bind(i, j) + C__reduce_init[i0, i1] = 0.00000000f + } + serial for (k0, 0, 20) + { + ScheduleBlock(C) + { + i0_0, i1_0, i2 = axis.bind(i, j, k0) + C[i0_0, i1_0] = (C[i0_0, i1_0] + (A[i0_0, i2] * B[i2, i1_0])) + } + } } } } From e5bdde1ccf09af628eccfa2a5f2ca2e99f7eae28 Mon Sep 17 00:00:00 2001 From: winter-wang <78149749+winter-wang@users.noreply.github.com> Date: Sun, 8 Oct 2023 22:22:21 +0800 Subject: [PATCH 06/62] [PIR] add print function for pd_op.while (#57917) --- cmake/generic.cmake | 2 +- .../instruction/cond_instruction.cc | 1 + .../pir_adaptor/pir_adaptor_util.cc | 1 + .../translator/program_translator.cc | 1 + .../pir/dialect/operator/ir/CMakeLists.txt | 2 +- .../dialect/operator/ir/control_flow_op.cc | 113 ++++++++++++++++++ .../pir/dialect/operator/ir/control_flow_op.h | 68 +++++++++++ .../pir/dialect/operator/ir/manual_op.cc | 61 ---------- .../fluid/pir/dialect/operator/ir/manual_op.h | 35 ------ .../pir/dialect/operator/ir/op_dialect.cc | 11 +- .../pir/transforms/pd_op_to_kernel_pass.cc | 1 + paddle/pir/core/op_base.h | 2 + .../standalone_executor_new_ir_test.cc | 1 + .../pir/control_flow_dialect/if_op_test.cc | 2 +- .../pir/control_flow_dialect/while_op_test.cc | 2 +- test/cpp/pir/core/program_translator_test.cc | 1 + .../ir_kernel_dialect_pass_test.cc | 1 + 17 files changed, 202 insertions(+), 103 deletions(-) create mode 100644 paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc create mode 100644 paddle/fluid/pir/dialect/operator/ir/control_flow_op.h diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 13aaf0d760f16..9ed3d53ccdc2b 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -88,7 +88,7 @@ # To build a unit test binary, which is an executable binary with libpaddle.so # automatically linked: # -# paddle_test(example SHARED) +# paddle_test(example SRCS example_test.cc) # # including binary directory for generated headers. 
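The bulk of this patch moves IfOp/WhileOp out of manual_op.{h,cc} into a dedicated control_flow_op.{h,cc} and gives WhileOp a Print method symmetric to IfOp's. A hedged usage sketch based only on the interfaces added below (builder and type setup elided, so this is illustrative rather than runnable standalone):

    // Both ops own two regions; the blocks are created lazily on first access.
    auto while_op =
        builder.Build<paddle::dialect::WhileOp>(inputs, output_types);
    pir::Block* cond = while_op.cond_block();  // region(0)
    pir::Block* body = while_op.body_block();  // region(1)
    // OperatorDialect::PrintOperation now dispatches to WhileOp::Print,
    // which renders "{ <cond ops> } do { <body ops> }".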
diff --git a/paddle/fluid/framework/new_executor/instruction/cond_instruction.cc b/paddle/fluid/framework/new_executor/instruction/cond_instruction.cc index 5d958d7266505..8c89800dd2d95 100644 --- a/paddle/fluid/framework/new_executor/instruction/cond_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/cond_instruction.cc @@ -34,6 +34,7 @@ #include "paddle/pir/core/value.h" #include "paddle/fluid/framework/new_executor/instruction/instruction_util.h" +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" #include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc index 2789c7b62bff5..f8400b1c289a5 100644 --- a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc +++ b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc @@ -43,6 +43,7 @@ #include "glog/logging.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" #include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" namespace paddle { diff --git a/paddle/fluid/ir_adaptor/translator/program_translator.cc b/paddle/fluid/ir_adaptor/translator/program_translator.cc index 2ebece4fbfef7..313a78da1aab9 100644 --- a/paddle/fluid/ir_adaptor/translator/program_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/program_translator.cc @@ -23,6 +23,7 @@ #include "paddle/fluid/ir_adaptor/translator/op_translator.h" #include "paddle/fluid/ir_adaptor/translator/type_translator.h" #include "paddle/fluid/ir_adaptor/translator/utils.h" +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" #include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" #include "paddle/phi/core/enforce.h" #include "paddle/pir/core/attribute.h" diff --git a/paddle/fluid/pir/dialect/operator/ir/CMakeLists.txt b/paddle/fluid/pir/dialect/operator/ir/CMakeLists.txt index befbb84a7117d..3026da6200254 100644 --- a/paddle/fluid/pir/dialect/operator/ir/CMakeLists.txt +++ b/paddle/fluid/pir/dialect/operator/ir/CMakeLists.txt @@ -190,7 +190,7 @@ cc_library( DEPS phi pd_interface pd_trait type_info) cc_library( pd_op_dialect_op - SRCS ${op_source_file} manual_op.cc + SRCS ${op_source_file} manual_op.cc control_flow_op.cc DEPS pd_op_dialect_core) cc_library( api_builder diff --git a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc new file mode 100644 index 0000000000000..94ba9a2e2e37f --- /dev/null +++ b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
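// (This .cc deliberately doubles as an op list: when op_dialect.cc includes it
// with GET_OP_LIST defined, inside RegisterOps< ... >(), the #ifdef branch
// below expands to just "paddle::dialect::IfOp, paddle::dialect::WhileOp";
// a normal compile takes the #else branch with the full definitions.)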
+#ifdef GET_OP_LIST +#undef GET_OP_LIST +paddle::dialect::IfOp, paddle::dialect::WhileOp +#else +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" + +#include "paddle/pir/core/builder.h" +#include "paddle/pir/core/ir_printer.h" +#include "paddle/pir/core/operation_utils.h" + +namespace paddle { +namespace dialect { + +void IfOp::Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value cond, + std::vector &&output_types) { + VLOG(4) << "Start build IfOp"; + argument.AddRegions(2u); + argument.AddInput(cond); + argument.output_types.swap(output_types); +} +pir::Block *IfOp::true_block() { + pir::Region &true_region = (*this)->region(0); + if (true_region.empty()) true_region.emplace_back(); + return true_region.front(); +} +pir::Block *IfOp::false_block() { + pir::Region &false_region = (*this)->region(1); + if (false_region.empty()) false_region.emplace_back(); + return false_region.front(); +} +void IfOp::Print(pir::IrPrinter &printer) { + auto &os = printer.os; + auto op = operation(); + printer.PrintOpResult(op); + os << " = pd_op.if"; + printer.PrintOpOperands(op); + os << " -> "; + printer.PrintOpReturnType(op); + os << "{"; + for (auto item : *true_block()) { + os << "\n "; + printer.PrintOperation(item); + } + os << "\n } else {"; + for (auto item : *false_block()) { + os << "\n "; + printer.PrintOperation(item); + } + os << "\n }"; +} +void IfOp::Verify() {} + +void WhileOp::Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + const std::vector &inputs, + const std::vector &output_types) { + argument.AddInputs(inputs); + argument.AddOutputs(output_types); + argument.AddRegions(2u); +} +pir::Block *WhileOp::cond_block() { + pir::Region &cond_region = (*this)->region(0); + if (cond_region.empty()) cond_region.emplace_back(); + return cond_region.front(); +} +pir::Block *WhileOp::body_block() { + pir::Region &body_region = (*this)->region(1); + if (body_region.empty()) body_region.emplace_back(); + return body_region.front(); +} + +void WhileOp::Print(pir::IrPrinter &printer) { + auto &os = printer.os; + auto op = operation(); + printer.PrintOpResult(op); + os << " \"" << name() << "\""; + printer.PrintOpOperands(op); + os << " -> "; + printer.PrintOpReturnType(op); + os << "{"; + for (auto item : *cond_block()) { + os << "\n "; + printer.PrintOperation(item); + } + os << "\n } do {"; + for (auto item : *body_block()) { + os << "\n "; + printer.PrintOperation(item); + } + os << "\n }"; +} +} // namespace dialect +} // namespace paddle + +IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::IfOp) +IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::WhileOp) + +#endif diff --git a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.h b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.h new file mode 100644 index 0000000000000..3f93c51a534e9 --- /dev/null +++ b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.h @@ -0,0 +1,68 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include "paddle/pir/core/op_base.h" + +namespace paddle { +namespace dialect { + +class IfOp : public pir::Op { + public: + using Op::Op; + static const char *name() { return "pd_op.if"; } + static constexpr const char **attributes_name = nullptr; + static constexpr uint32_t attributes_num = 0; + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value cond, + std::vector &&output_types); + + // static void Build(pir::Builder &builder, // NOLINT + // pir::OperationArgument &argument, // NOLINT + // pir::Value cond, + // std::unique_ptr&& true_block, + // std::unique_ptr&& false_block); + + pir::Value cond() { return operand_source(0); } + pir::Block *true_block(); + pir::Block *false_block(); + void Print(pir::IrPrinter &printer); // NOLINT + void Verify(); +}; + +class WhileOp : public pir::Op { + public: + using Op::Op; + static const char *name() { return "pd_op.while"; } + static constexpr uint32_t attributes_num = 0; + static constexpr const char **attributes_name = nullptr; + + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + const std::vector &inputs, + const std::vector &output_types); + pir::Block *cond_block(); + pir::Block *body_block(); + void Print(pir::IrPrinter &printer); // NOLINT + void Verify() {} +}; + +} // namespace dialect +} // namespace paddle + +IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::IfOp) +IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::WhileOp) diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc index 8a4e4cda9f50b..eb5f1f5a53670 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc +++ b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc @@ -1040,65 +1040,6 @@ void SplitGradOp::InferMeta(phi::InferMetaContext *infer_meta) { fn(infer_meta); } -void IfOp::Build(pir::Builder &builder, // NOLINT - pir::OperationArgument &argument, // NOLINT - pir::Value cond, - std::vector &&output_types) { - VLOG(4) << "Start build IfOp"; - argument.AddRegions(2u); - argument.AddInput(cond); - argument.output_types.swap(output_types); -} -pir::Block *IfOp::true_block() { - pir::Region &true_region = (*this)->region(0); - if (true_region.empty()) true_region.emplace_back(); - return true_region.front(); -} -pir::Block *IfOp::false_block() { - pir::Region &false_region = (*this)->region(1); - if (false_region.empty()) false_region.emplace_back(); - return false_region.front(); -} -void IfOp::Print(pir::IrPrinter &printer) { - auto &os = printer.os; - auto op = operation(); - printer.PrintOpResult(op); - os << " = pd_op.if"; - printer.PrintOpOperands(op); - os << " -> "; - printer.PrintOpReturnType(op); - os << "{"; - for (auto item : *true_block()) { - os << "\n "; - printer.PrintOperation(item); - } - os << "\n } else {"; - for (auto item : *false_block()) { - os << "\n "; - printer.PrintOperation(item); - } - os << "\n }"; -} -void IfOp::Verify() {} - -void WhileOp::Build(pir::Builder &builder, // NOLINT - pir::OperationArgument &argument, // NOLINT - const std::vector &inputs, - const std::vector &output_types) { - argument.AddInputs(inputs); - argument.AddOutputs(output_types); - argument.AddRegions(2u); -} -pir::Block *WhileOp::cond_block() { - pir::Region &cond_region = (*this)->region(0); - if (cond_region.empty()) cond_region.emplace_back(); - return 
cond_region.front(); -} -pir::Block *WhileOp::body_block() { - pir::Region &body_region = (*this)->region(1); - if (body_region.empty()) body_region.emplace_back(); - return body_region.front(); -} } // namespace dialect } // namespace paddle @@ -1108,5 +1049,3 @@ IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::AddN_Op) IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::AddNWithKernelOp) IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::FusedGemmEpilogueOp) IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::FusedGemmEpilogueGradOp) -IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::IfOp) -IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::WhileOp) diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op.h b/paddle/fluid/pir/dialect/operator/ir/manual_op.h index 93f24e80cb524..c6fc7cb32b316 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_op.h +++ b/paddle/fluid/pir/dialect/operator/ir/manual_op.h @@ -176,39 +176,6 @@ class SplitGradOp : public pir::Op { static void InferMeta(phi::InferMetaContext *infer_meta); }; -class IfOp : public pir::Op { - public: - using Op::Op; - static const char *name() { return "pd_op.if"; } - static constexpr const char **attributes_name = nullptr; - static constexpr uint32_t attributes_num = 0; - static void Build(pir::Builder &builder, // NOLINT - pir::OperationArgument &argument, // NOLINT - pir::Value cond, - std::vector &&output_types); - pir::Value cond() { return operand_source(0); } - pir::Block *true_block(); - pir::Block *false_block(); - void Print(pir::IrPrinter &printer); // NOLINT - void Verify(); -}; - -class WhileOp : public pir::Op { - public: - using Op::Op; - static const char *name() { return "pd.while"; } - static constexpr uint32_t attributes_num = 0; - static constexpr const char **attributes_name = nullptr; - - static void Build(pir::Builder &builder, // NOLINT - pir::OperationArgument &argument, // NOLINT - const std::vector &inputs, - const std::vector &output_types); - void Verify() {} - pir::Block *cond_block(); - pir::Block *body_block(); -}; - } // namespace dialect } // namespace paddle @@ -218,5 +185,3 @@ IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::AddN_Op) IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::AddNWithKernelOp) IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::FusedGemmEpilogueOp) IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::FusedGemmEpilogueGradOp) -IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::IfOp) -IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::WhileOp) diff --git a/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc b/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc index ac62747026ed0..9a7c6b9de2ea2 100644 --- a/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc +++ b/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" // NOTE(zhangbo9674): File pd_op.h is generated by op_gen.py, see details in // paddle/fluid/pir/dialect/CMakeLists.txt. 
+#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" #include "paddle/fluid/pir/dialect/operator/ir/op_type.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/fluid/pir/dialect/operator/ir/type_storage.h" @@ -50,14 +51,16 @@ void OperatorDialect::initialize() { #define GET_OP_LIST #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" // NOLINT >(); + RegisterOps< +#define GET_OP_LIST +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc" // NOLINT + >(); RegisterOps(); + paddle::dialect::SplitGradOp>(); RegisterInterfaces(); } @@ -163,6 +166,8 @@ void OperatorDialect::PrintOperation(pir::Operation *op, pir::IrPrinter &printer) const { if (auto if_op = op->dyn_cast()) { if_op.Print(printer); + } else if (auto while_op = op->dyn_cast()) { + while_op.Print(printer); } else { printer.PrintGeneralOperation(op); } diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc index 0059731809108..c322f71893ff7 100644 --- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc +++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc @@ -19,6 +19,7 @@ #include "paddle/fluid/pir/dialect/kernel/ir/kernel_op.h" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h" #include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" #include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" diff --git a/paddle/pir/core/op_base.h b/paddle/pir/core/op_base.h index f9de8dfc6cf8d..8e67a392c51cf 100644 --- a/paddle/pir/core/op_base.h +++ b/paddle/pir/core/op_base.h @@ -22,6 +22,8 @@ #include "paddle/pir/core/utils.h" namespace pir { +class Builder; +class IrPrinter; class IR_API OpBase { public: diff --git a/test/cpp/new_executor/standalone_executor_new_ir_test.cc b/test/cpp/new_executor/standalone_executor_new_ir_test.cc index 02ca49d180baa..9bdc5c3d3c718 100644 --- a/test/cpp/new_executor/standalone_executor_new_ir_test.cc +++ b/test/cpp/new_executor/standalone_executor_new_ir_test.cc @@ -23,6 +23,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/fluid/framework/new_executor/new_ir_interpreter.h" +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/fluid/pir/transforms/pd_op_to_kernel_pass.h" diff --git a/test/cpp/pir/control_flow_dialect/if_op_test.cc b/test/cpp/pir/control_flow_dialect/if_op_test.cc index f2e49b150b7bc..218a67e1acc5b 100644 --- a/test/cpp/pir/control_flow_dialect/if_op_test.cc +++ b/test/cpp/pir/control_flow_dialect/if_op_test.cc @@ -14,7 +14,7 @@ #include #include -#include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/pir/core/builder.h" diff --git a/test/cpp/pir/control_flow_dialect/while_op_test.cc b/test/cpp/pir/control_flow_dialect/while_op_test.cc index 6c558cc982926..609f1f8eb8d2e 100644 --- a/test/cpp/pir/control_flow_dialect/while_op_test.cc +++ b/test/cpp/pir/control_flow_dialect/while_op_test.cc @@ -14,7 +14,7 @@ #include #include -#include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" +#include 
"paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/pir/core/builder.h" diff --git a/test/cpp/pir/core/program_translator_test.cc b/test/cpp/pir/core/program_translator_test.cc index c95d5952577ba..483299c206129 100644 --- a/test/cpp/pir/core/program_translator_test.cc +++ b/test/cpp/pir/core/program_translator_test.cc @@ -27,6 +27,7 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/ir_adaptor/translator/translate.h" #include "paddle/fluid/ir_adaptor/translator/utils.h" +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" #include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" diff --git a/test/cpp/pir/kernel_dialect/ir_kernel_dialect_pass_test.cc b/test/cpp/pir/kernel_dialect/ir_kernel_dialect_pass_test.cc index bb99e86dfc21c..6812e7a9ed194 100644 --- a/test/cpp/pir/kernel_dialect/ir_kernel_dialect_pass_test.cc +++ b/test/cpp/pir/kernel_dialect/ir_kernel_dialect_pass_test.cc @@ -23,6 +23,7 @@ #include "paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_op.h" #include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/fluid/pir/dialect/operator/ir/op_type.h" From ed4395e49cb28f9311987b0b7df7438e61afd547 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Mon, 9 Oct 2023 09:23:57 +0800 Subject: [PATCH 07/62] del unuseful op4 (#57739) * del unuseful op --- .../framework/ir/auto_mixed_precision_pass.cc | 1 - paddle/fluid/framework/op_compatible_info.cc | 1 - paddle/fluid/operators/center_loss_op.cc | 166 ------------ paddle/fluid/operators/center_loss_op.cu | 161 ----------- paddle/fluid/operators/center_loss_op.h | 163 ----------- paddle/fluid/operators/cos_sim_op.cc | 252 ------------------ paddle/fluid/operators/cos_sim_op.cu | 20 -- paddle/fluid/operators/cos_sim_op.h | 167 ------------ python/paddle/amp/amp_lists.py | 1 - test/legacy_test/test_center_loss.py | 96 ------- test/legacy_test/test_cos_sim_op.py | 122 --------- test/white_list/no_grad_set_white_list.py | 1 - test/white_list/op_accuracy_white_list.py | 1 - 13 files changed, 1152 deletions(-) delete mode 100644 paddle/fluid/operators/center_loss_op.cc delete mode 100644 paddle/fluid/operators/center_loss_op.cu delete mode 100644 paddle/fluid/operators/center_loss_op.h delete mode 100644 paddle/fluid/operators/cos_sim_op.cc delete mode 100644 paddle/fluid/operators/cos_sim_op.cu delete mode 100644 paddle/fluid/operators/cos_sim_op.h delete mode 100644 test/legacy_test/test_center_loss.py delete mode 100644 test/legacy_test/test_cos_sim_op.py diff --git a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc index 698de5d90c256..14f42b129effa 100644 --- a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc +++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc @@ -184,7 +184,6 @@ void AutoMixedPrecisionPass::SetDefaultBlacklist() const { "log", "mean", "sum", - "cos_sim", "softmax_with_cross_entropy", "sigmoid_cross_entropy_with_logits", "c_softmax_with_cross_entropy", diff --git 
a/paddle/fluid/framework/op_compatible_info.cc b/paddle/fluid/framework/op_compatible_info.cc index 37c8dd22c174a..fe7180dd373bb 100644 --- a/paddle/fluid/framework/op_compatible_info.cc +++ b/paddle/fluid/framework/op_compatible_info.cc @@ -68,7 +68,6 @@ void OpCompatibleMap::InitOpCompatibleMap() { op_compatible_map_["sequence_pad"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; op_compatible_map_["sequence_unpad"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; - op_compatible_map_["center_loss"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; op_compatible_map_["coalesce_tensor"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; op_compatible_map_["crop_tensor"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; diff --git a/paddle/fluid/operators/center_loss_op.cc b/paddle/fluid/operators/center_loss_op.cc deleted file mode 100644 index 28226d5d94d5a..0000000000000 --- a/paddle/fluid/operators/center_loss_op.cc +++ /dev/null @@ -1,166 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/center_loss_op.h" - -#include -#include - -namespace paddle { -namespace operators { -class CenterLossOp : public framework::OperatorWithKernel { - public: - CenterLossOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorWithKernel(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "CenterLoss"); - auto x_dims = ctx->GetInputDim("X"); - - OP_INOUT_CHECK(ctx->HasInput("CenterUpdateRate"), - "Input", - "CenterUpdateRate", - "CenterLoss"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "CenterLoss"); - OP_INOUT_CHECK(ctx->HasInput("Centers"), "Input", "Centers", "CenterLoss"); - OP_INOUT_CHECK(ctx->HasOutput("SampleCenterDiff"), - "Output", - "SampleCenterDiff", - "CenterLoss"); - OP_INOUT_CHECK(ctx->HasOutput("Loss"), "Output", "Loss", "CenterLoss"); - OP_INOUT_CHECK( - ctx->HasOutput("CentersOut"), "Output", "CentersOut", "CenterLoss"); - - ctx->SetOutputDim("SampleCenterDiff", - {x_dims[0], product(x_dims) / x_dims[0]}); - ctx->SetOutputDim("CentersOut", ctx->GetInputDim("Centers")); - ctx->SetOutputDim("Loss", {x_dims[0], 1}); - ctx->ShareLoD("X", /*->*/ "Loss"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.device_context().GetPlace()); - } -}; - -class CenterLossOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor) Input tensor of center_loss operator."); - AddInput("Label", "(Tensor) Input tensor of center_loss operator."); - AddInput("Centers", "(Tensor) Input tensor of center_loss operator."); - AddInput("CenterUpdateRate", - "(Tensor) Input tensor of center_loss operator."); - - AddOutput("CentersOut", 
"(Tensor) Input tensor of center_loss operator."); - AddOutput("SampleCenterDiff", - "(Tensor) output tensor of center_loss operator."); - AddOutput("Loss", "(Tensor) Output tensor of center_loss operator."); - - AddAttr("cluster_num", - "The output cluster num of the center_loss operator."); - AddAttr("need_update", "whether need to update center info."); - AddComment(R"DOC( -**CenterLoss operator** -implemention of the center loss function in the papper<>, equations in this implement -is:loss = 1/2 * (x-y)^2 ,where x(X) means the deep feature(output of last hidden layer ) -and y(Label) the target label -)DOC"); - } -}; - -class CenterLossGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("SampleCenterDiff"), - "Input", - "SampleCenterDiff", - "CenterLossGrad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Loss")), - "Input", - framework::GradVarName("Loss"), - "CenterLossGrad"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), - "Output", - framework::GradVarName("X"), - "CenterLossGrad"); - - auto x_dims = ctx->GetInputDim("X"); - auto x_grad_name = framework::GradVarName("X"); - - if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey( - OperatorWithKernel::IndicateVarDataType(ctx, "SampleCenterDiff"), - ctx.device_context().GetPlace()); - } -}; - -template -class CenterLossOpGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr retv) const override { - retv->SetType("center_loss_grad"); - retv->SetInput(framework::GradVarName("Loss"), this->OutputGrad("Loss")); - retv->SetInput("SampleCenterDiff", this->Output("SampleCenterDiff")); - retv->SetInput("X", this->Input("X")); - retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - - retv->SetAttrMap(this->Attrs()); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(CenterLossGradNoNeedBufVarsInferer, "X"); - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(center_loss, - ops::CenterLossOp, - ops::CenterLossOpMaker, - ops::CenterLossOpGradMaker, - ops::CenterLossOpGradMaker); - -REGISTER_OPERATOR(center_loss_grad, - ops::CenterLossGradOp, - ops::CenterLossGradNoNeedBufVarsInferer); - -PD_REGISTER_STRUCT_KERNEL( - center_loss, CPU, ALL_LAYOUT, ops::CenterLossKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL(center_loss_grad, - CPU, - ALL_LAYOUT, - ops::CenterLossGradKernel, - float, - double) {} diff --git a/paddle/fluid/operators/center_loss_op.cu b/paddle/fluid/operators/center_loss_op.cu deleted file mode 100644 index 73567c195d97f..0000000000000 --- a/paddle/fluid/operators/center_loss_op.cu +++ /dev/null @@ -1,161 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/operators/center_loss_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/phi/backends/gpu/gpu_primitives.h" -namespace paddle { -namespace operators { - -using phi::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void ComputeDifferent(T *centers_diff, - const T *X, - const T *centers, - const int64_t *ids, - const int64_t N, - const int64_t K, - const int64_t D) { - int idx = threadIdx.x; - int idy = blockIdx.x + threadIdx.y * GridDimX; - - while (idy < K) { - int64_t id = ids[idy]; - PADDLE_ENFORCE(id >= 0, "Id should larger than 0 but received id: %d.", id); - PADDLE_ENFORCE( - id < N, "Id should smaller than %d but received id: %d.", N, id); - - T *out = centers_diff + idy * D; - const T *x = X + idy * D; - const T *cent = centers + id * D; - for (int i = idx; i < D; i += BlockDimX) { - out[i] = x[i] - cent[i]; - } - idy += BlockDimY * GridDimX; - } -} - -template -__global__ void UpdateCenters(T *centers, - T *centers_diff, - const int64_t *ids, - const int64_t N, - const int64_t K, - const int64_t D, - const T *alpha) { - int idx = threadIdx.x; - int idy = blockIdx.x + threadIdx.y * GridDimX; - int count; - while (idy < K) { - int count = 1; - int64_t id = ids[idy]; - PADDLE_ENFORCE(id >= 0, "Id should larger than 0 but received id: %d.", id); - PADDLE_ENFORCE( - id < N, "Id should smaller than %d but received id: %d.", N, id); - - for (int i = 0; i < K; i++) { - if (ids[i] == id) { - count++; - } - } - const T *diff = centers_diff + idy * D; - T *cent = centers + id * D; - for (int i = idx; i < D; i += BlockDimX) { - phi::CudaAtomicAdd(¢[i], alpha[0] * diff[i] / count); - } - idy += BlockDimY * GridDimX; - } -} - -template -class CenterLossCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto &device_context = ctx.template device_context(); - auto stream = device_context.stream(); - auto *X = ctx.Input("X"); // deep feature - auto *labels = ctx.Input("Label"); - auto *centers = ctx.Input("Centers"); - auto *update_rate = ctx.Input("CenterUpdateRate"); - int cluster_num = ctx.Attr("cluster_num"); - auto *lr_center = update_rate->data(); - bool need_update = static_cast(ctx.Attr("need_update")); - - auto x_data = X->data(); - auto label_data = labels->data(); - - auto x_dims = X->dims(); - int batch_size = x_dims[0]; - const int deep_feat_dim = x_dims[1]; - - auto *centers_diff = ctx.Output("SampleCenterDiff"); - auto centers_diff_data = centers_diff->mutable_data(ctx.GetPlace()); - - auto centers_data = centers->data(); - auto centers_dim = centers->dims(); - auto *out_loss = ctx.Output("Loss"); - auto loss_data = out_loss->mutable_data(ctx.GetPlace()); - - auto *centers_out = ctx.Output("CentersOut"); - auto *centers_out_data = centers_out->mutable_data(ctx.GetPlace()); - - auto ctx_place = ctx.GetPlace(); - if (centers != centers_out) { - framework::TensorCopy( - *static_cast(centers), - ctx_place, - *platform::DeviceContextPool::Instance().Get(ctx_place), - static_cast(centers_out)); - } - - int64_t numel = X->numel(); - - size_t N = centers->dims()[0]; - size_t D = centers->dims()[1]; - size_t K = labels->numel(); - - dim3 threads(128, 8); - dim3 grids(8, 1); - - ComputeDifferent<<>>( - centers_diff_data, x_data, centers_data, label_data, N, K, D); - - auto &place = *ctx.template device_context().eigen_device(); - auto sub_result = 
EigenMatrix::From(*centers_diff); - - auto sub_res_pow2 = (sub_result * sub_result) / T(2.0); - auto z = EigenVector::Flatten(*out_loss); - z.device(place) = sub_res_pow2.sum(Eigen::array({{1}})); - if (need_update) { - UpdateCenters<<>>( - centers_out_data, centers_diff_data, label_data, N, K, D, lr_center); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL( - center_loss, GPU, ALL_LAYOUT, ops::CenterLossCUDAKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL(center_loss_grad, - GPU, - ALL_LAYOUT, - ops::CenterLossGradKernel, - float, - double) {} diff --git a/paddle/fluid/operators/center_loss_op.h b/paddle/fluid/operators/center_loss_op.h deleted file mode 100644 index 5e5575c68cb0b..0000000000000 --- a/paddle/fluid/operators/center_loss_op.h +++ /dev/null @@ -1,163 +0,0 @@ -/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/common/transform.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { - -template -using EigenVector = framework::EigenVector; -template -using EigenMatrix = framework::EigenMatrix; - -template -struct SubFunctor { - inline HOSTDEVICE T operator()(T a, T b) const { return a - b; } -}; - -template -class CenterLossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *X = ctx.Input("X"); // deep feature - auto *labels = ctx.Input("Label"); - auto *centers = ctx.Input("Centers"); - auto *update_rate = ctx.Input("CenterUpdateRate"); - int cluster_num = ctx.Attr("cluster_num"); - auto *lr_center = update_rate->data(); - T alpha = lr_center[0]; - bool need_update = static_cast(ctx.Attr("need_update")); - - auto x_data = X->data(); - auto label_data = labels->data(); - - auto centers_dim = centers->dims(); - auto centers_data = centers->data(); - - auto x_dims = X->dims(); - int batch_size = x_dims[0]; - int deep_feat_dim = x_dims[1]; - - auto centers_diff = ctx.Output("SampleCenterDiff"); - auto centers_diff_data = centers_diff->mutable_data(ctx.GetPlace()); - auto *out_loss = ctx.Output("Loss"); - - auto *centers_out = ctx.Output("CentersOut"); - auto *centers_out_data = centers_out->mutable_data(ctx.GetPlace()); - - if (centers_out_data != centers_data) { - int size = centers_out->numel() * sizeof(T); - memcpy(centers_out_data, centers_data, size); - } - - std::vector center_update_count(cluster_num, 1); - auto &dev_ctx = ctx.template device_context(); - - auto loss_data = out_loss->mutable_data(ctx.GetPlace()); - - phi::DenseTensor centers_diffacc; // used to accumulate all diff - auto centers_diffacc_data = - centers_diffacc.mutable_data(centers_dim, ctx.GetPlace()); - int numel = centers_diffacc.numel(); - std::memset(centers_diffacc_data, 0, sizeof(T) * numel); - - 
auto blas = phi::funcs::GetBlas(dev_ctx); - int tLabel; - - const T *x_index; - const T *center_index; - T *center_out_index; - T *center_loss_diff_index; - T *acc_index; - phi::Transform trans; - - for (int i = 0; i < batch_size; ++i) { - tLabel = label_data[i]; - center_update_count[tLabel]++; - x_index = x_data + i * deep_feat_dim; // xi index - center_index = centers_data + tLabel * deep_feat_dim; // center index - center_loss_diff_index = centers_diff_data + i * deep_feat_dim; - trans(dev_ctx, - x_index, - x_index + deep_feat_dim, - center_index, - center_loss_diff_index, - SubFunctor()); - - acc_index = centers_diffacc_data + tLabel * deep_feat_dim; - blas.VADD(deep_feat_dim, - center_loss_diff_index, - acc_index, - acc_index); // accumulate - loss_data[i] = - blas.DOT( - deep_feat_dim, center_loss_diff_index, center_loss_diff_index) / - T(2.0); - } - - // update centers data - if (need_update == true) { - for (int i = 0; i < cluster_num; i++) { - acc_index = centers_diffacc_data + i * deep_feat_dim; - center_out_index = centers_out_data + i * deep_feat_dim; - T scale = alpha / center_update_count[i]; - blas.SCAL(deep_feat_dim, scale, acc_index); - blas.VADD(deep_feat_dim, acc_index, center_out_index, center_out_index); - } - } - } -}; - -template -class CenterLossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *in0 = context.Input("SampleCenterDiff"); - auto *in1 = context.Input(framework::GradVarName("Loss")); - auto *x_g = context.Output(framework::GradVarName("X")); - auto sub_result = EigenMatrix::From(*in0); - auto out_grad = EigenMatrix::From(*in1); - - auto x_dims = x_g->dims(); - int cols = x_g->numel() / x_dims[0]; - // calculate gradient - auto grad_mat = - (out_grad.broadcast(Eigen::array({{1, cols}}))) * sub_result; - - // propagate back to input - auto &eigen_place = - *context.template device_context().eigen_device(); - x_g->mutable_data(context.GetPlace()); - // eigen matrix - auto x_grad = EigenMatrix::From(*x_g, phi::make_ddim({x_dims[0], cols})); - x_grad.device(eigen_place) = grad_mat; - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/cos_sim_op.cc b/paddle/fluid/operators/cos_sim_op.cc deleted file mode 100644 index 6dd84d58ae9a5..0000000000000 --- a/paddle/fluid/operators/cos_sim_op.cc +++ /dev/null @@ -1,252 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
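// ---- Editorial note ------------------------------------------------------
// Reference semantics of the center_loss operator deleted above: for sample i
// with label y_i, Loss_i = 1/2 * ||X_i - C_{y_i}||^2, and with need_update set
// each used center is shifted by the accumulated sample-center differences,
// scaled by CenterUpdateRate and divided by 1 + count(y_i). A minimal sketch
// of the forward loss only (illustrative names, not a Paddle API):
//
//   // feat: [batch, dim]; centers: [cluster_num, dim]; labels: [batch]
//   void CenterLossRef(const float* feat, const float* centers,
//                      const int64_t* labels, int batch, int dim,
//                      float* loss /* [batch] */) {
//     for (int i = 0; i < batch; ++i) {
//       const float* c = centers + labels[i] * dim;
//       float sq = 0.0f;
//       for (int d = 0; d < dim; ++d) {
//         const float diff = feat[i * dim + d] - c[d];
//         sq += diff * diff;
//       }
//       loss[i] = 0.5f * sq;  // same as blas.DOT(diff, diff) / 2 above
//     }
//   }
// ---------------------------------------------------------------------------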
*/ - -#include "paddle/fluid/operators/cos_sim_op.h" - -#include - -namespace paddle { -namespace operators { - -class CosSimOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - // notnull check - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "CosSim"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "CosSim"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "CosSim"); - OP_INOUT_CHECK(ctx->HasOutput("XNorm"), "Output", "XNorm", "CosSim"); - OP_INOUT_CHECK(ctx->HasOutput("YNorm"), "Output", "YNorm", "CosSim"); - - // shape check - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - - bool check = true; - if ((!ctx->IsRuntime()) && - (phi::product(x_dims) <= 0 || phi::product(y_dims) <= 0)) { - check = false; - } - - if (check) { - PADDLE_ENFORCE_EQ( - x_dims.size(), - y_dims.size(), - platform::errors::InvalidArgument( - "ShapeError: Ranks of Input(X) and Input(Y) must be equal." - "But received: Ranks of Input(X) is [%d], Ranks of Input(Y) is " - "[%d]", - x_dims.size(), - y_dims.size())); - PADDLE_ENFORCE_GE( - x_dims.size(), - 2, - platform::errors::InvalidArgument( - "ShapeError: Rank of Input(X) must not be less than 2." - "But received: Ranks of Input(X) is [%d]", - x_dims.size())); - PADDLE_ENFORCE_EQ( - phi::slice_ddim(x_dims, 1, x_dims.size()), - phi::slice_ddim(y_dims, 1, y_dims.size()), - platform::errors::InvalidArgument( - "All dimensions except the 1st of Input(X) and Input(Y) " - "must be equal.")); - PADDLE_ENFORCE_EQ( - x_dims[0] == y_dims[0] || y_dims[0] == 1, - true, - platform::errors::InvalidArgument( - "The 1st dimension of Input(Y) %d must be equal to Input(X) %d or" - " just 1 (which will be broadcasted to match Input(X)).", - y_dims[0], - x_dims[0])); - } - - // resize tensor - ctx->SetOutputDim("Out", {x_dims[0], 1}); - ctx->SetOutputDim("XNorm", {x_dims[0], 1}); - ctx->SetOutputDim("YNorm", {y_dims[0], 1}); - ctx->ShareLoD("X", /*->*/ "Out"); - } -}; - -class CosSimOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "The 1st input of cos_sim op, Tensor with shape ``[N_1, N_2, " - "..., N_k]``, the data type is float32."); - AddInput("Y", - "The 2nd input of cos_sim op, Tensor with shape ``[N_1 or 1, N_2, " - "..., N_k]``, the data type is float32."); - AddOutput("Out", "The output of cos_sim op."); - AddOutput("XNorm", - "Norm of the first input, reduced along the 1st " - "dimension.") - .AsIntermediate(); - AddOutput("YNorm", - "Norm of the second input, reduced along the 1st " - "dimension.") - .AsIntermediate(); - AddAttr(framework::kAllKernelsMustComputeRuntimeShape, - "Skip calling InferShape() function in the runtime.") - .SetDefault(true); - - AddComment(R"DOC( -**Cosine Similarity Operator** - -$Out = \frac{X^T * Y}{(\sqrt{X^T * X} * \sqrt{Y^T * Y})}$ - -The input X and Y must have the same shape, except that the 1st dimension -of input Y could be just 1 (different from input X), which will be -broadcasted to match the shape of input X before computing their cosine -similarity. 
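// ---- Editorial note ------------------------------------------------------
// Reference semantics of the cos_sim operator being deleted: row-wise cosine
// similarity, broadcasting Y's first dimension when it is 1. A minimal sketch
// (illustrative names, not a Paddle API):
//
//   #include <cmath>
//
//   // x: [rows_x, cols]; y: [rows_y, cols], rows_y == rows_x or rows_y == 1
//   void CosSimRef(const float* x, const float* y, int rows_x, int rows_y,
//                  int cols, float* out /* [rows_x] */) {
//     for (int i = 0; i < rows_x; ++i) {
//       const float* xi = x + i * cols;
//       const float* yi = y + (rows_y == 1 ? 0 : i) * cols;
//       float dot = 0.0f, xx = 0.0f, yy = 0.0f;
//       for (int j = 0; j < cols; ++j) {
//         dot += xi[j] * yi[j];
//         xx += xi[j] * xi[j];
//         yy += yi[j] * yi[j];
//       }
//       out[i] = dot / (std::sqrt(xx) * std::sqrt(yy));
//     }
//   }
// ---------------------------------------------------------------------------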
- -)DOC"); - } -}; - -class CosSimOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - // notnull check - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "CosSimGrad"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "CosSimGrad"); - OP_INOUT_CHECK(ctx->HasInput("XNorm"), "Input", "XNorm", "CosSimGrad"); - OP_INOUT_CHECK(ctx->HasInput("YNorm"), "Input", "YNorm", "CosSimGrad"); - OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "CosSimGrad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "CosSimGrad"); - - // shape check - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - auto xnorm_dims = ctx->GetInputDim("XNorm"); - auto ynorm_dims = ctx->GetInputDim("YNorm"); - auto out_dims = ctx->GetInputDim("Out"); - auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out")); - - PADDLE_ENFORCE_GE( - x_dims.size(), - y_dims.size(), - platform::errors::InvalidArgument( - "ShapeError: Ranks of Input(X) and Input(Y) must be equal." - "But received: Ranks of Input(X) is [%d], Ranks of Input(Y) is " - "[%d]", - x_dims.size(), - y_dims.size())); - PADDLE_ENFORCE_GE( - x_dims.size(), - 2, - platform::errors::InvalidArgument( - "ShapeError: Rank of Input(X) must not be less than 2." - "But received: Ranks of Input(X) is [%d]", - x_dims.size())); - PADDLE_ENFORCE_EQ( - phi::slice_ddim(x_dims, 1, x_dims.size()), - phi::slice_ddim(y_dims, 1, y_dims.size()), - platform::errors::InvalidArgument( - "All dimensions except the 1st of Input(X) [%s] and Input(Y) [%s] " - "must be equal.", - x_dims, - y_dims)); - PADDLE_ENFORCE_EQ( - true, - x_dims[0] == y_dims[0] || y_dims[0] == 1, - platform::errors::InvalidArgument( - "The 1st dimension of Input(Y) %d must be equal to Input(X) %d or" - " just 1 (which will be broadcasted to match Input(X)).", - y_dims[0], - x_dims[0])); - auto target_xnorm_dims = phi::make_ddim({x_dims[0], 1}); - auto target_ynorm_dims = phi::make_ddim({y_dims[0], 1}); - PADDLE_ENFORCE_EQ( - xnorm_dims, - target_xnorm_dims, - platform::errors::InvalidArgument( - "Shape of Input(XNorm) [%s] must be (X.Dim(0), 1) - [%s]", - xnorm_dims, - target_xnorm_dims)); - PADDLE_ENFORCE_EQ( - ynorm_dims, - target_ynorm_dims, - platform::errors::InvalidArgument( - "Shape of Input(YNorm) [%s] must be (Y.Dim(0), 1) - [%s]", - ynorm_dims, - target_ynorm_dims)); - PADDLE_ENFORCE_EQ( - out_dims, - target_xnorm_dims, - platform::errors::InvalidArgument( - "Shape of Input(Out) [%s] must be (X.Dim(0), 1) - [%s]", - out_dims, - target_xnorm_dims)); - PADDLE_ENFORCE_EQ( - out_grad_dims, - target_xnorm_dims, - platform::errors::InvalidArgument( - "Shape of Input(Out@Grad) [%s] must be (X.Dim(0), 1) - [%s]", - out_grad_dims, - target_xnorm_dims)); - - // resize tensor - auto x_grad_name = framework::GradVarName("X"); - auto y_grad_name = framework::GradVarName("Y"); - if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); - } - if (ctx->HasOutput(y_grad_name)) { - ctx->SetOutputDim(y_grad_name, y_dims); - } - } -}; - -template -class CosSimGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("cos_sim_grad"); - grad_op->SetInput("X", this->Input("X")); - grad_op->SetInput("Y", this->Input("Y")); - 
grad_op->SetInput("XNorm", this->Output("XNorm")); - grad_op->SetInput("YNorm", this->Output("YNorm")); - grad_op->SetInput("Out", this->Output("Out")); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - grad_op->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(cos_sim, - ops::CosSimOp, - ops::CosSimOpMaker, - ops::CosSimGradOpMaker, - ops::CosSimGradOpMaker); -REGISTER_OPERATOR(cos_sim_grad, ops::CosSimOpGrad); -PD_REGISTER_STRUCT_KERNEL(cos_sim, CPU, ALL_LAYOUT, ops::CosSimKernel, float) {} -PD_REGISTER_STRUCT_KERNEL( - cos_sim_grad, CPU, ALL_LAYOUT, ops::CosSimGradKernel, float) {} diff --git a/paddle/fluid/operators/cos_sim_op.cu b/paddle/fluid/operators/cos_sim_op.cu deleted file mode 100644 index 82174a246757e..0000000000000 --- a/paddle/fluid/operators/cos_sim_op.cu +++ /dev/null @@ -1,20 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/cos_sim_op.h" - -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL(cos_sim, GPU, ALL_LAYOUT, ops::CosSimKernel, float) {} -PD_REGISTER_STRUCT_KERNEL( - cos_sim_grad, GPU, ALL_LAYOUT, ops::CosSimGradKernel, float) {} diff --git a/paddle/fluid/operators/cos_sim_op.h b/paddle/fluid/operators/cos_sim_op.h deleted file mode 100644 index 115bfa0a42e56..0000000000000 --- a/paddle/fluid/operators/cos_sim_op.h +++ /dev/null @@ -1,167 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/cos_sim_functor.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class CosSimKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // get phi::DenseTensor - auto* in_x = context.Input("X"); - auto* in_y = context.Input("Y"); - auto* out_z = context.Output("Out"); - auto* out_x_norm = context.Output("XNorm"); - auto* out_y_norm = context.Output("YNorm"); - - int rows_x = in_x->dims()[0]; - int rows_y = in_y->dims()[0]; - out_z->Resize({rows_x, 1}); - out_x_norm->Resize({rows_x, 1}); - out_y_norm->Resize({rows_y, 1}); - out_z->mutable_data(context.GetPlace()); - out_x_norm->mutable_data(context.GetPlace()); - out_y_norm->mutable_data(context.GetPlace()); - out_z->set_lod(in_x->lod()); - - int cols = phi::product(in_x->dims()) / rows_x; - - if (rows_x == rows_y) { - math::CosSimFunctor functor(in_x->data(), - in_y->data(), - out_x_norm->data(), - out_y_norm->data(), - out_z->data(), - cols); - platform::ForRange for_range( - static_cast(context.device_context()), rows_x); - for_range(functor); - } else { - math::CosSimFunctor functor(in_x->data(), - in_y->data(), - out_x_norm->data(), - out_y_norm->data(), - out_z->data(), - cols); - platform::ForRange for_range( - static_cast(context.device_context()), rows_x); - for_range(functor); - } - } -}; - -template -class CosSimGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // get phi::DenseTensor - auto* in_x = context.Input("X"); - auto* in_y = context.Input("Y"); - auto* in_z = context.Input("Out"); - auto* in_x_norm = context.Input("XNorm"); - auto* in_y_norm = context.Input("YNorm"); - auto* out_grad_x = - context.Output(framework::GradVarName("X")); - auto* out_grad_y = - context.Output(framework::GradVarName("Y")); - auto* in_grad_z = - context.Input(framework::GradVarName("Out")); - - // compute gradident - int rows_x = in_x->dims()[0]; - int rows_y = in_y->dims()[0]; - int cols = phi::product(in_x->dims()) / rows_x; - - if (rows_x == rows_y) { - if (out_grad_x) { - out_grad_x->Resize(in_x->dims()); - math::CosSimGradFunctor functor( - in_x_norm->data(), - in_y_norm->data(), - in_x->data(), - in_y->data(), - in_z->data(), - in_grad_z->data(), - out_grad_x->mutable_data(context.GetPlace()), - cols); - platform::ForRange for_range( - static_cast(context.device_context()), - rows_x); - for_range(functor); - } - if (out_grad_y) { - out_grad_y->Resize(in_y->dims()); - math::CosSimGradFunctor functor( - in_y_norm->data(), - in_x_norm->data(), - in_y->data(), - in_x->data(), - in_z->data(), - in_grad_z->data(), - out_grad_y->mutable_data(context.GetPlace()), - cols); - platform::ForRange for_range( - static_cast(context.device_context()), - rows_x); - for_range(functor); - } - } else { - if (out_grad_x) { - out_grad_x->Resize(in_x->dims()); - math::CosSimDxFunctor functor( - in_x_norm->data(), - in_y_norm->data(), - in_x->data(), - in_y->data(), - in_z->data(), - in_grad_z->data(), - out_grad_x->mutable_data(context.GetPlace()), - cols); - platform::ForRange for_range( - static_cast(context.device_context()), - rows_x); - for_range(functor); - } - if (out_grad_y) { - out_grad_y->Resize(in_y->dims()); - out_grad_y->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; 
- auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, out_grad_y, static_cast(0)); - - math::CosSimDyFunctor functor; - functor(dev_ctx, - in_x_norm->data(), - in_y_norm->data(), - in_x->data(), - in_y->data(), - in_z->data(), - in_grad_z->data(), - static_cast(rows_x), - static_cast(cols), - out_grad_y->data()); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/python/paddle/amp/amp_lists.py b/python/paddle/amp/amp_lists.py index 7d014b1bf14f9..b4b4fc95cb049 100644 --- a/python/paddle/amp/amp_lists.py +++ b/python/paddle/amp/amp_lists.py @@ -44,7 +44,6 @@ 'cosh', 'atanh', 'tanh_shrink', - 'cos_sim', 'erfinv', 'exp', 'expm1', diff --git a/test/legacy_test/test_center_loss.py b/test/legacy_test/test_center_loss.py deleted file mode 100644 index 31863cd93f767..0000000000000 --- a/test/legacy_test/test_center_loss.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest - -import paddle - - -class TestCenterLossOp(OpTest): - def setUp(self): - self.op_type = "center_loss" - self.dtype = np.float64 - self.init_dtype_type() - batch_size = 12 - feet_dim = 10 - cluster_num = 8 - self.attrs = {} - self.attrs['cluster_num'] = cluster_num - self.attrs['lambda'] = 0.1 - self.config() - self.attrs['need_update'] = self.need_update - labels = np.random.randint(cluster_num, size=batch_size, dtype='int64') - feat = np.random.random((batch_size, feet_dim)).astype(np.float64) - centers = np.random.random((cluster_num, feet_dim)).astype(np.float64) - var_sum = np.zeros((cluster_num, feet_dim), dtype=np.float64) - centers_select = centers[labels] - output = feat - centers_select - diff_square = np.square(output).reshape(batch_size, feet_dim) - loss = 0.5 * np.sum(diff_square, axis=1).reshape(batch_size, 1) - cout = [] - for i in range(cluster_num): - cout.append(0) - for i in range(batch_size): - cout[labels[i]] += 1 - var_sum[labels[i]] += output[i] - for i in range(cluster_num): - var_sum[i] /= 1 + cout[i] - var_sum *= 0.1 - result = centers + var_sum - rate = np.array([0.1]).astype(np.float64) - - self.inputs = { - 'X': feat, - 'Label': labels, - 'Centers': centers, - 'CenterUpdateRate': rate, - } - - if self.need_update: - self.outputs = { - 'SampleCenterDiff': output, - 'Loss': loss, - 'CentersOut': result, - } - else: - self.outputs = { - 'SampleCenterDiff': output, - 'Loss': loss, - 'CentersOut': centers, - } - - def config(self): - self.need_update = True - - def init_dtype_type(self): - pass - - def test_check_output(self): - self.check_output(check_dygraph=False) - - def test_check_grad(self): - self.check_grad(['X'], 'Loss', check_dygraph=False) - - -class TestCenterLossOpNoUpdate(TestCenterLossOp): - def config(self): - self.need_update = False - - -if __name__ == "__main__": - paddle.enable_static() - unittest.main() diff --git a/test/legacy_test/test_cos_sim_op.py 
b/test/legacy_test/test_cos_sim_op.py deleted file mode 100644 index f9c761c9eedf3..0000000000000 --- a/test/legacy_test/test_cos_sim_op.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest - - -class TestCosSimOp(OpTest): - def setUp(self): - self.op_type = "cos_sim" - self.inputs = { - 'X': np.random.random((6, 20)).astype("float32"), - 'Y': np.random.random((6, 20)).astype("float32"), - } - expect_x_norm = np.linalg.norm(self.inputs['X'], axis=1) - expect_y_norm = np.linalg.norm(self.inputs['Y'], axis=1) - expect_out = ( - (self.inputs['X'] * self.inputs['Y']).sum(axis=1) - / expect_x_norm - / expect_y_norm - ) - self.outputs = { - 'XNorm': np.expand_dims(expect_x_norm, 1), - 'YNorm': np.expand_dims(expect_y_norm, 1), - 'Out': np.expand_dims(expect_out, 1), - } - - def test_check_output(self): - self.check_output() - - def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.06) - - def test_check_grad_ingore_x(self): - self.check_grad( - ['Y'], 'Out', max_relative_error=0.06, no_grad_set=set("X") - ) - - def test_check_grad_ingore_y(self): - self.check_grad( - ['X'], 'Out', max_relative_error=0.06, no_grad_set=set('Y') - ) - - -class TestCosSimOp2(TestCosSimOp): - def setUp(self): - self.op_type = "cos_sim" - self.inputs = { - 'X': np.random.random((6, 100)).astype("float32"), - 'Y': np.random.random((1, 100)).astype("float32"), - } - expect_x_norm = np.linalg.norm(self.inputs['X'], axis=1) - expect_y_norm = np.linalg.norm(self.inputs['Y'], axis=1) - expect_out = ( - (self.inputs['X'] * self.inputs['Y']).sum(axis=1) - / expect_x_norm - / expect_y_norm - ) - self.outputs = { - 'XNorm': np.expand_dims(expect_x_norm, 1), - 'YNorm': np.expand_dims(expect_y_norm, 1), - 'Out': np.expand_dims(expect_out, 1), - } - - -class TestCosSimOp3(TestCosSimOp): - def setUp(self): - self.op_type = "cos_sim" - self.inputs = { - 'X': np.random.random((6, 5, 4)).astype("float32"), - 'Y': np.random.random((6, 5, 4)).astype("float32"), - } - expect_x_norm = np.linalg.norm(self.inputs['X'], axis=(1, 2)) - expect_y_norm = np.linalg.norm(self.inputs['Y'], axis=(1, 2)) - expect_out = ( - (self.inputs['X'] * self.inputs['Y']).sum(axis=(1, 2)) - / expect_x_norm - / expect_y_norm - ) - self.outputs = { - 'XNorm': np.expand_dims(expect_x_norm, 1), - 'YNorm': np.expand_dims(expect_y_norm, 1), - 'Out': np.expand_dims(expect_out, 1), - } - - -class TestCosSimOp4(TestCosSimOp): - def setUp(self): - self.op_type = "cos_sim" - self.inputs = { - 'X': np.random.random((6, 5, 20)).astype("float32"), - 'Y': np.random.random((1, 5, 20)).astype("float32"), - } - expect_x_norm = np.linalg.norm(self.inputs['X'], axis=(1, 2)) - expect_y_norm = np.linalg.norm(self.inputs['Y'], axis=(1, 2)) - expect_out = ( - (self.inputs['X'] * self.inputs['Y']).sum(axis=(1, 2)) - / expect_x_norm - / expect_y_norm - ) - self.outputs = { - 'XNorm': 
np.expand_dims(expect_x_norm, 1), - 'YNorm': np.expand_dims(expect_y_norm, 1), - 'Out': np.expand_dims(expect_out, 1), - } - - -if __name__ == '__main__': - unittest.main() diff --git a/test/white_list/no_grad_set_white_list.py b/test/white_list/no_grad_set_white_list.py index 33960cf4c64d3..525cce49df3dc 100644 --- a/test/white_list/no_grad_set_white_list.py +++ b/test/white_list/no_grad_set_white_list.py @@ -26,7 +26,6 @@ 'conv2d_transpose', 'conv3d', 'conv3d_transpose', - 'cos_sim', 'cross_entropy', 'cross_entropy2', 'data_norm', diff --git a/test/white_list/op_accuracy_white_list.py b/test/white_list/op_accuracy_white_list.py index 49b501e765b54..8565f6c7892e5 100644 --- a/test/white_list/op_accuracy_white_list.py +++ b/test/white_list/op_accuracy_white_list.py @@ -22,7 +22,6 @@ 'conv3d', 'conv3d_transpose', 'conv_shift', - 'cos_sim', 'cudnn_lstm', 'cvm', 'data_norm', From bdba65c2c9d4f9317ecff4eb5711401d65210533 Mon Sep 17 00:00:00 2001 From: Bo Zhang <105368690+zhangbopd@users.noreply.github.com> Date: Mon, 9 Oct 2023 09:44:54 +0800 Subject: [PATCH 08/62] Standard naming Part 1 (#57912) * change name * split symboltable --- paddle/pir/dialect/shape/ir/shape_dialect.h | 6 - paddle/pir/dialect/shape/ir/shape_op.cc | 158 +++--- paddle/pir/dialect/shape/ir/shape_op.h | 57 +- .../shape/utils/shape_optimization_utils.cc | 15 + .../shape/utils/shape_optimization_utils.h | 15 + paddle/pir/dialect/shape/utils/shape_utils.cc | 490 +++++++++--------- paddle/pir/dialect/shape/utils/shape_utils.h | 127 +++-- .../pir/dialect/shape/utils/symbol_table.cc | 34 ++ paddle/pir/dialect/shape/utils/symbol_table.h | 68 +++ .../cpp/pir/shape_dialect/symbolic_op_test.cc | 99 ++-- 10 files changed, 596 insertions(+), 473 deletions(-) create mode 100644 paddle/pir/dialect/shape/utils/shape_optimization_utils.cc create mode 100644 paddle/pir/dialect/shape/utils/shape_optimization_utils.h create mode 100644 paddle/pir/dialect/shape/utils/symbol_table.cc create mode 100644 paddle/pir/dialect/shape/utils/symbol_table.h diff --git a/paddle/pir/dialect/shape/ir/shape_dialect.h b/paddle/pir/dialect/shape/ir/shape_dialect.h index b4ae3aa617210..b8fe39bd8d500 100644 --- a/paddle/pir/dialect/shape/ir/shape_dialect.h +++ b/paddle/pir/dialect/shape/ir/shape_dialect.h @@ -24,12 +24,6 @@ namespace dialect { class IR_API ShapeDialect : public Dialect { public: explicit ShapeDialect(IrContext* context); - /// - /// \brief Each Dialect needs to provide a name function to return the name of - /// the Dialect. - /// - /// \return The name of this Dialect. 
- /// static const char* name() { return "shape"; } void PrintOperation(Operation* op, IrPrinter& printer) const override; // NOLINT diff --git a/paddle/pir/dialect/shape/ir/shape_op.cc b/paddle/pir/dialect/shape/ir/shape_op.cc index aa2e9c2e26e4c..885f50d080143 100644 --- a/paddle/pir/dialect/shape/ir/shape_op.cc +++ b/paddle/pir/dialect/shape/ir/shape_op.cc @@ -16,115 +16,122 @@ #include "paddle/pir/core/builtin_attribute.h" #include "paddle/pir/core/builtin_op.h" #include "paddle/pir/core/builtin_type.h" +#include "paddle/pir/core/enforce.h" -namespace pir { -namespace dialect { +namespace pir::dialect { -const char *SymbolicDim::attributes_name[attributes_num] = {"knownNegativeOne", - "knownNonNegative", - "knownNonSizeOne", - "knownNonSizeZero", - "sym_name", - "value"}; // NOLINT +const char *SymbolicDim::attributes_name[attributes_num] = { + "known_negative_one", // value = -1 + "known_non_negative", // value >= 0 + "known_non_size_one", // value != 1 + "known_non_size_zero", // value != 0 + "sym_name", + "value"}; // NOLINT void SymbolicDim::Build(Builder &builder, OperationArgument &argument, const std::string &sym_name, int64_t value, - bool knownNonNegative, - bool knownNegativeOne, - bool knownNonSizeOne, - bool knownNonSizeZero) { - Attribute attr_sym_name = StrAttribute::get(IrContext::Instance(), sym_name); + bool known_non_negative, + bool known_negative_one, + bool known_non_size_one, + bool known_non_size_zero) { + IrContext *ctx = IrContext::Instance(); + auto attr_sym_name = StrAttribute::get(ctx, sym_name); + auto attr_value = Int64Attribute::get(ctx, value); + auto attr_known_none_negative = BoolAttribute::get(ctx, known_non_negative); + auto attr_known_negative_one = BoolAttribute::get(ctx, known_negative_one); + auto attr_known_non_size_one = BoolAttribute::get(ctx, known_non_size_one); + auto attr_known_non_size_zero = BoolAttribute::get(ctx, known_non_size_zero); + argument.AddAttribute("sym_name", attr_sym_name); - Attribute attr_value = Int64Attribute::get(IrContext::Instance(), value); argument.AddAttribute("value", attr_value); - Attribute attr_knownNonNegative = - BoolAttribute::get(IrContext::Instance(), knownNonNegative); - argument.AddAttribute("knownNonNegative", attr_knownNonNegative); - Attribute attr_knownNegativeOne = - BoolAttribute::get(IrContext::Instance(), knownNegativeOne); - argument.AddAttribute("knownNegativeOne", attr_knownNegativeOne); - Attribute attr_knownNonSizeOne = - BoolAttribute::get(IrContext::Instance(), knownNonSizeOne); - argument.AddAttribute("knownNonSizeOne", attr_knownNonSizeOne); - Attribute attr_knownNonSizeZero = - BoolAttribute::get(IrContext::Instance(), knownNonSizeZero); - argument.AddAttribute("knownNonSizeZero", attr_knownNonSizeZero); + argument.AddAttribute("known_non_negative", attr_known_none_negative); + argument.AddAttribute("known_negative_one", attr_known_negative_one); + argument.AddAttribute("known_non_size_one", attr_known_non_size_one); + argument.AddAttribute("known_non_size_zero", attr_known_non_size_zero); } -const std::string SymbolicDim::getSymName() { +const std::string SymbolicDim::GetSymName() { return attribute("sym_name").AsString(); } -int64_t SymbolicDim::getValue() { +int64_t SymbolicDim::GetDimSize() { return attribute("value").data(); } -bool SymbolicDim::getKnownNonNegative() { - return attribute("knownNonNegative").data(); +bool SymbolicDim::GetKnownNonNegative() { + return attribute("known_non_negative").data(); } -bool SymbolicDim::getKnownNegativeOne() { - return 
attribute("knownNegativeOne").data(); +bool SymbolicDim::GetKnownNegativeOne() { + return attribute("known_negative_one").data(); } -bool SymbolicDim::getKnownNonSizeOne() { - return attribute("knownNonSizeOne").data(); +bool SymbolicDim::GetKnownNonSizeOne() { + return attribute("known_non_size_one").data(); } -bool SymbolicDim::getKnownNonSizeZero() { - return attribute("knownNonSizeZero").data(); +bool SymbolicDim::GetKnownNonSizeZero() { + return attribute("known_non_size_zero").data(); } -void SymbolicDim::updateSymName(std::string attrValue) { +void SymbolicDim::SetSymName(const std::string &attr_value) { operation()->set_attribute( - "sym_name", StrAttribute::get(IrContext::Instance(), attrValue)); + "sym_name", StrAttribute::get(IrContext::Instance(), attr_value)); } -void SymbolicDim::updateValue(int64_t attrValue) { +void SymbolicDim::SetDimSize(int64_t attr_value) { operation()->set_attribute( - "value", Int64Attribute::get(IrContext::Instance(), attrValue)); + "value", Int64Attribute::get(IrContext::Instance(), attr_value)); } -void SymbolicDim::updateKnownNonNegative(bool attrValue) { - operation()->set_attribute( - "knownNonNegative", BoolAttribute::get(IrContext::Instance(), attrValue)); +void SymbolicDim::UpdateKnownNonNegative(bool flag) { + operation()->set_attribute("known_non_negative", + BoolAttribute::get(IrContext::Instance(), flag)); } -void SymbolicDim::updateKnownNegativeOne(bool attrValue) { - operation()->set_attribute( - "knownNegativeOne", BoolAttribute::get(IrContext::Instance(), attrValue)); +void SymbolicDim::UpdateKnownNegativeOne(bool flag) { + operation()->set_attribute("known_negative_one", + BoolAttribute::get(IrContext::Instance(), flag)); } -void SymbolicDim::updateKnownNonSizeOne(bool attrValue) { - operation()->set_attribute( - "knownNonSizeOne", BoolAttribute::get(IrContext::Instance(), attrValue)); +void SymbolicDim::UpdateKnownNonSizeOne(bool flag) { + operation()->set_attribute("known_non_size_one", + BoolAttribute::get(IrContext::Instance(), flag)); } -void SymbolicDim::updateKnownNonSizeZero(bool attrValue) { - operation()->set_attribute( - "knownNonSizeZero", BoolAttribute::get(IrContext::Instance(), attrValue)); +void SymbolicDim::UpdateKnownNonSizeZero(bool flag) { + operation()->set_attribute("known_non_size_zero", + BoolAttribute::get(IrContext::Instance(), flag)); } bool SymbolicDim::IsDynamic() { - return getValue() == ShapedTypeInterface::kDynamic; + return GetDimSize() == ShapedTypeInterface::kDynamic; } bool SymbolicDim::Merge(SymbolicDim other) { - if (!IsDynamic() && !other.IsDynamic() && getValue() != other.getValue()) + VLOG(4) << "Try to merge two SymbolicDim ops."; + + if (!IsDynamic() && !other.IsDynamic() && GetDimSize() != other.GetDimSize()) return false; - if (IsDynamic() && !other.IsDynamic()) updateValue(other.getValue()); - if (!IsDynamic() && other.IsDynamic()) other.updateValue(getValue()); - - bool knownNonNegativeFlag = - getKnownNonNegative() || other.getKnownNonNegative(); - bool knownNegativeOneFlag = - getKnownNegativeOne() || other.getKnownNegativeOne(); - bool knownNonSizeOneFlag = getKnownNonSizeOne() || - other.getKnownNonSizeOne() || knownNegativeOneFlag; - bool knownNonSizeZeroFlag = getKnownNonSizeZero() || - other.getKnownNonSizeZero() || - knownNegativeOneFlag; - - if (knownNonNegativeFlag && knownNegativeOneFlag) return false; - - updateKnownNonSizeZero(knownNonSizeZeroFlag); - updateKnownNonSizeOne(knownNonSizeOneFlag); - updateKnownNegativeOne(knownNegativeOneFlag); - 
updateKnownNonNegative(knownNonNegativeFlag);
+  if (IsDynamic() && !other.IsDynamic()) SetDimSize(other.GetDimSize());
+  if (!IsDynamic() && other.IsDynamic()) other.SetDimSize(GetDimSize());
+
+  // either value >= 0
+  bool known_non_negative_flag =
+      GetKnownNonNegative() || other.GetKnownNonNegative();
+
+  // either value == -1
+  bool known_negative_one_flag =
+      GetKnownNegativeOne() || other.GetKnownNegativeOne();
+
+  if (known_non_negative_flag && known_negative_one_flag) return false;
+
+  bool known_non_size_one_flag = GetKnownNonSizeOne() ||
+                                 other.GetKnownNonSizeOne() ||
+                                 known_negative_one_flag;
+
+  bool known_non_size_zero_flag = GetKnownNonSizeZero() ||
+                                  other.GetKnownNonSizeZero() ||
+                                  known_negative_one_flag;
+
+  UpdateKnownNonSizeZero(known_non_size_zero_flag);
+  UpdateKnownNonSizeOne(known_non_size_one_flag);
+  UpdateKnownNegativeOne(known_negative_one_flag);
+  UpdateKnownNonNegative(known_non_negative_flag);
 
   return true;
 }
@@ -196,7 +203,7 @@ std::vector TieProductEqualOp::rhs() {
 }
 
 const char *TieShapeOp::attributes_name[attributes_num] = {
-    SymbolicDim::getSymbolicDimAttrName().c_str()};  // NOLINT
+    SymbolicDim::GetSymbolicDimAttrName().c_str()};  // NOLINT
 
 void TieShapeOp::Build(Builder &builder,
                        OperationArgument &argument,
@@ -266,8 +273,7 @@ void TensorDimOp::Build(Builder &builder,
 
 Value TensorDimOp::source() { return operand_source(0); }
 Value TensorDimOp::index() { return operand_source(1); }
-}  // namespace dialect
-}  // namespace pir
+}  // namespace pir::dialect
 
 IR_DEFINE_EXPLICIT_TYPE_ID(pir::dialect::SymbolicDim)
 IR_DEFINE_EXPLICIT_TYPE_ID(pir::dialect::DimOp)
diff --git a/paddle/pir/dialect/shape/ir/shape_op.h b/paddle/pir/dialect/shape/ir/shape_op.h
index 3163d404a61ee..c8ec2df012341 100644
--- a/paddle/pir/dialect/shape/ir/shape_op.h
+++ b/paddle/pir/dialect/shape/ir/shape_op.h
@@ -19,13 +19,12 @@
 #include "paddle/pir/core/ir_printer.h"
 #include "paddle/pir/core/op_base.h"
 
-namespace pir {
-namespace dialect {
+namespace pir::dialect {
 
 class IR_API SymbolicDim : public Op {
  public:
   using Op::Op;
-  static const char *name() { return "shape.SymbolicDim"; }
+  static const char *name() { return "shape.symbolic_dim"; }
   static constexpr uint32_t attributes_num = 6;
   static const char *attributes_name[attributes_num];
@@ -34,28 +33,41 @@ class IR_API SymbolicDim : public Op {
                     OperationArgument &argument,  // NOLINT
                     const std::string &sym_name,
                     int64_t value = ShapedTypeInterface::kDynamic,
-                    bool knownNonNegative = false,
-                    bool knownNegativeOne = false,
-                    bool knownNonSizeOne = false,
-                    bool knownNonSizeZero = false);
-  const std::string getSymName();
-  int64_t getValue();
-  bool getKnownNonNegative();
-  bool getKnownNegativeOne();
-  bool getKnownNonSizeOne();
-  bool getKnownNonSizeZero();
-
-  void updateSymName(std::string attrValue);
-  void updateValue(int64_t attrValue);
-  void updateKnownNonNegative(bool attrValue);
-  void updateKnownNegativeOne(bool attrValue);
-  void updateKnownNonSizeOne(bool attrValue);
-  void updateKnownNonSizeZero(bool attrValue);
+                    bool known_non_negative = false,
+                    bool known_negative_one = false,
+                    bool known_non_size_one = false,
+                    bool known_non_size_zero = false);
+  const std::string GetSymName();
+  int64_t GetDimSize();
+
+  bool GetKnownNonNegative();
+  bool GetKnownNegativeOne();
+  bool GetKnownNonSizeOne();
+  bool GetKnownNonSizeZero();
+
+  void SetSymName(const std::string &attr_value);
+  void SetDimSize(int64_t attr_value);
+
+  // Sets `known_non_negative` to the value of `flag`
+  void UpdateKnownNonNegative(bool flag);
+
+  //
Sets `known_negative_one` to the value of `flag` + void UpdateKnownNegativeOne(bool flag); + + // Sets `known_non_size_one` to the value of `flag` + void UpdateKnownNonSizeOne(bool flag); + + // Sets `known_non_size_zero` to the value of `flag` + void UpdateKnownNonSizeZero(bool flag); + + // Returns true if this SymbolicDim is not known at compile-time. bool IsDynamic(); + + // Try to merge two SymbolicDim ops. bool Merge(SymbolicDim other); - static const std::string getSymbolicDimAttrName() { + static const std::string GetSymbolicDimAttrName() { return "kSymbolicDimAttr"; } @@ -160,8 +172,7 @@ class IR_API TensorDimOp : public Op { void Verify() {} }; -} // namespace dialect -} // namespace pir +} // namespace pir::dialect IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::dialect::SymbolicDim); IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::dialect::DimOp); diff --git a/paddle/pir/dialect/shape/utils/shape_optimization_utils.cc b/paddle/pir/dialect/shape/utils/shape_optimization_utils.cc new file mode 100644 index 0000000000000..35776be4f5325 --- /dev/null +++ b/paddle/pir/dialect/shape/utils/shape_optimization_utils.cc @@ -0,0 +1,15 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pir/dialect/shape/utils/shape_optimization_utils.h" diff --git a/paddle/pir/dialect/shape/utils/shape_optimization_utils.h b/paddle/pir/dialect/shape/utils/shape_optimization_utils.h new file mode 100644 index 0000000000000..7f31a4fb55cf1 --- /dev/null +++ b/paddle/pir/dialect/shape/utils/shape_optimization_utils.h @@ -0,0 +1,15 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
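// ---- Editorial note ------------------------------------------------------
// Net effect of the renames in this patch: SymbolicDim accessors move to
// Get*/Set*/Update* form and attribute keys to snake_case. A hypothetical
// round trip under the new names (sketch only; assumes the usual
// pir::Builder::Build<OpT> helper and a context with ShapeDialect
// registered):
//
//   auto sym = builder.Build<pir::dialect::SymbolicDim>(
//       "S0", pir::ShapedTypeInterface::kDynamic,
//       /*known_non_negative=*/true);
//   sym.GetSymName();    // "S0"
//   sym.IsDynamic();     // true: size unknown at compile time
//   sym.SetDimSize(64);  // now a static dimension of size 64
//
// Merge(other) unions the known_* facts and fails on contradictions, e.g.
// combining known_non_negative with known_negative_one returns false.
// ---------------------------------------------------------------------------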
+ +#pragma once diff --git a/paddle/pir/dialect/shape/utils/shape_utils.cc b/paddle/pir/dialect/shape/utils/shape_utils.cc index d9f8aee304325..ad2cc1d956918 100644 --- a/paddle/pir/dialect/shape/utils/shape_utils.cc +++ b/paddle/pir/dialect/shape/utils/shape_utils.cc @@ -17,57 +17,157 @@ #include "paddle/fluid/pir/dialect/operator/ir/op_type.h" namespace pir { +bool ShapeAnalysis::IsSameNumElements(Value lhs, Value rhs) { + if (lhs == rhs) return true; + auto lhs_type = lhs.type().dyn_cast(); + auto rhs_type = rhs.type().dyn_cast(); + + if (!lhs_type || !rhs_type || !lhs_type.HasRank() || !rhs_type.HasRank()) + return false; + + return IsProductEqual(lhs, 0, lhs_type.GetRank(), rhs, 0, rhs_type.GetRank()); +} + +bool ShapeAnalysis::IsProductEqual( + Value lhs, int lhs_from, int lhs_to, Value rhs, int rhs_from, int rhs_to) { + std::vector lhs_dim_idxs, rhs_dim_idxs; + + lhs_dim_idxs.reserve(lhs_to - lhs_from); + rhs_dim_idxs.reserve(rhs_to - rhs_from); + + for (int i = lhs_from; i < lhs_to; ++i) lhs_dim_idxs.push_back(i); + for (int i = rhs_from; i < rhs_to; ++i) rhs_dim_idxs.push_back(i); + + return IsProductEqual(lhs, lhs_dim_idxs, rhs, rhs_dim_idxs); +} + +ShapeConstraintIRAnalysis::ShapeConstraintIRAnalysis(ModuleOp m) + : m_(m), mgr_(m) { + mgr_.Load(); + for (auto op : *(m_.block())) { + auto tie_shape_op = op->dyn_cast(); + if (!tie_shape_op) continue; + Value result = tie_shape_op.value(); + auto& symbols = value_to_sym_dims_[result]; + auto attrs = + tie_shape_op + .attribute(SymbolicDim::GetSymbolicDimAttrName()) + .AsVector(); + for (const auto& attr : attrs) { + auto sym_op = mgr_.symbolTable().Lookup( + attr.dyn_cast().AsString()); + if (!sym_op) continue; + symbols.push_back(sym_op); + } + } +} + +ShapeConstraintIRAnalysis::~ShapeConstraintIRAnalysis() { mgr_.Save(); } + +bool ShapeConstraintIRAnalysis::IsShapeEqual(Value lhs, Value rhs) { + if (lhs == rhs) return true; + + auto lhs_type = lhs.type().dyn_cast(); + auto rhs_type = rhs.type().dyn_cast(); + + if (!lhs_type || !rhs_type || !lhs_type.HasRank() || !rhs_type.HasRank()) + return false; + + if (lhs_type.HasStaticShape() && rhs_type.HasStaticShape()) { + return vectorize(lhs_type.GetShape()) == vectorize(rhs_type.GetShape()); + } + + auto lhs_it = value_to_sym_dims_.find(lhs); + auto rhs_it = value_to_sym_dims_.find(rhs); + + if (lhs_it == value_to_sym_dims_.end() || + rhs_it == value_to_sym_dims_.end() || + lhs_it->second.size() != rhs_it->second.size()) + return false; + + std::vector lhs_syms; + std::vector rhs_syms; + for (auto sym : lhs_it->second) { + lhs_syms.push_back(mgr_.GetRootSymbolicDim(sym)); + } + for (auto sym : rhs_it->second) { + rhs_syms.push_back(mgr_.GetRootSymbolicDim(sym)); + } + return lhs_syms == rhs_syms; +} + +bool ShapeConstraintIRAnalysis::IsProductEqual(Value lhs, + std::vector lhs_dim_idxs, + Value rhs, + std::vector rhs_dim_idxs) { + SymbolicDimProduct lhs_prod; + SymbolicDimProduct rhs_prod; + + auto build_symbolic_dim_product = + [&](SymbolicDimProduct& prod, Value value, std::vector dim_idxs) { + auto type = value.type().dyn_cast(); + auto it = value_to_sym_dims_.find(value); + if (!type || !type.HasRank()) return false; + for (int idx : dim_idxs) { + if (type.GetShape()[idx] == ShapedTypeInterface::kDynamic) { + if (it == value_to_sym_dims_.end() || + static_cast(it->second.size()) <= idx) + return false; + prod.symbols.push_back(it->second[idx]); + } else { + prod.factor *= type.GetShape()[idx]; + } + } + return true; + }; + + if (!build_symbolic_dim_product(lhs_prod, lhs, 
lhs_dim_idxs) || + !build_symbolic_dim_product(rhs_prod, rhs, rhs_dim_idxs)) { + return false; + } + + return mgr_.IsSymbolicDimProductEqual(lhs_prod, rhs_prod); +} + +// Gives a consistent order for a list of SymbolicDim ops bool CompareSymbolicDimNames(const std::string& lhs, const std::string& rhs) { + // S -> unknown dimension size at compile time + // C -> constant dimension size at compile time if (lhs.size() < 1 || (lhs[0] != 'S' && lhs[0] != 'C')) return lhs < rhs; if (rhs.size() < 1 || (rhs[0] != 'S' && rhs[0] != 'C')) return lhs < rhs; - int64_t lhsIdx = 0, rhsIdx = 0; + int64_t lhs_idx = 0, rhs_idx = 0; try { - lhsIdx = stol(lhs.substr(1)); - rhsIdx = stol(rhs.substr(1)); + lhs_idx = stol(lhs.substr(1)); + rhs_idx = stol(rhs.substr(1)); } catch (const std::exception& e) { IR_THROW("Invalid symbolic name"); } - return (lhs[0] < rhs[0]) || (lhs[0] == rhs[0] && lhsIdx < rhsIdx); + return (lhs[0] < rhs[0]) || (lhs[0] == rhs[0] && lhs_idx < rhs_idx); } +// Gives a consistent order for a list of SymbolicDimProducts bool CompareSymbolicDimProduct(SymbolicDimProduct& lhs, // NOLINT SymbolicDimProduct& rhs) { // NOLINT if (lhs.symbols.size() < rhs.symbols.size()) return true; if (lhs.symbols.size() == rhs.symbols.size()) { for (size_t idx = 0; idx < lhs.symbols.size(); ++idx) { - const std::string lhsName = lhs.symbols[idx].getSymName(); - const std::string rhsName = rhs.symbols[idx].getSymName(); - if (CompareSymbolicDimNames(lhsName, rhsName)) return true; - if (lhsName != rhsName) return false; + const std::string lhs_name = lhs.symbols[idx].GetSymName(); + const std::string rhs_name = rhs.symbols[idx].GetSymName(); + if (CompareSymbolicDimNames(lhs_name, rhs_name)) return true; + if (lhs_name != rhs_name) return false; } } return false; } -const std::string SymbolTable::insert(Operation* symbol) { - std::string name; - if (symbol->isa()) { - name = symbol->dyn_cast().getSymName(); - symbolTableMap_.insert({name, symbol}); - } - - // TODO(liujinnan): add more constraint_func name branch. - if (symbol->isa()) { - name = "tie_product_equal"; - symbolFuncMap_[name].emplace_back(symbol); - } - - return name; -} - bool SymbolicDimMgr::Load() { - auto funcOp = symbolTable_.getOp()->dyn_cast(); - assert(funcOp); - for (auto op_ : *(funcOp.block())) { - symbolTable_.insert(op_); + auto func_op = symbol_table_.getOp()->dyn_cast(); + assert(func_op); + for (auto op_ : *(func_op.block())) { + symbol_table_.insert(op_); if (SymbolicDim op = op_->dyn_cast()) { symbolDimUnionSet_[op] = op; - symbolNameSet_.insert(op.getSymName()); + symbolNameSet_.insert(op.GetSymName()); } } return LoadShapeConstraintGraph(); @@ -77,7 +177,7 @@ bool SymbolicDimMgr::LoadShapeConstraintGraph() { // TODO(liujinnan): add more constraint functions. Currently, only // tie_product_equal is supported. auto constraint_vec = - symbolTable_.Lookup("tie_product_equal"); + symbol_table_.Lookup("tie_product_equal"); if (!constraint_vec.size()) return true; @@ -89,7 +189,7 @@ bool SymbolicDimMgr::LoadShapeConstraintGraph() { product.factor *= constOp.value().dyn_cast().data(); continue; } else if (auto dimOp = definingOp->dyn_cast()) { - auto sym = symbolTable_.Lookup(dimOp.getName()); + auto sym = symbol_table_.Lookup(dimOp.getName()); if (!sym) return false; product.symbols.push_back(sym); continue; @@ -109,35 +209,29 @@ bool SymbolicDimMgr::LoadShapeConstraintGraph() { return true; } -int64_t gcd(int64_t m, int64_t n) { - if (!m) return n; - if (!n) return m; - return (m < n) ?
gcd(m, n % m) : gcd(m % n, n); -} - bool SymbolicDimMgr::MapSymbolicDimProductEqual(const SymbolicDimProduct& lhs, const SymbolicDimProduct& rhs) { - SymbolicDimProduct newLhs, newRhs; - std::tie(newLhs, newRhs) = SimplifySymbolicDimProductPair(lhs, rhs); + SymbolicDimProduct new_lhs, new_rhs; + std::tie(new_lhs, new_rhs) = SimplifySymbolicDimProductPair(lhs, rhs); // early return for identity case. - if (newLhs == newRhs) return true; + if (new_lhs == new_rhs) return true; - if (newLhs.factor == newRhs.factor && newLhs.symbols.size() == 1 && - newRhs.symbols.size() == 1) { - return MapSymbolicDimEqual(newLhs.symbols[0], newRhs.symbols[0]); - } else if (newLhs.symbols.size() == 0 && newRhs.symbols.size() == 1 && - newRhs.factor == 1) { - return MapSymbolicDimEqual(NewConstantSymbolicDim(newLhs.factor), - newRhs.symbols[0]); - } else if (newRhs.symbols.size() == 0 && newLhs.symbols.size() == 1 && - newLhs.factor == 1) { - return MapSymbolicDimEqual(NewConstantSymbolicDim(newRhs.factor), - newLhs.symbols[0]); + if (new_lhs.factor == new_rhs.factor && new_lhs.symbols.size() == 1 && + new_rhs.symbols.size() == 1) { + return MapSymbolicDimEqual(new_lhs.symbols[0], new_rhs.symbols[0]); + } else if (new_lhs.symbols.size() == 0 && new_rhs.symbols.size() == 1 && + new_rhs.factor == 1) { + return MapSymbolicDimEqual(NewConstantSymbolicDim(new_lhs.factor), + new_rhs.symbols[0]); + } else if (new_rhs.symbols.size() == 0 && new_lhs.symbols.size() == 1 && + new_lhs.factor == 1) { + return MapSymbolicDimEqual(NewConstantSymbolicDim(new_rhs.factor), + new_lhs.symbols[0]); } - productEqualityMap_[newLhs][newRhs] = productEqualityMap_[newRhs][newLhs] = - true; + productEqualityMap_[new_lhs][new_rhs] = + productEqualityMap_[new_rhs][new_lhs] = true; productEqualityMapUpdated_ = false; return true; @@ -149,45 +243,46 @@ SymbolicDimMgr::SimplifySymbolicDimProductPair(const SymbolicDimProduct& x, auto lhs = SimplifySymbolicDimProduct(x); auto rhs = SimplifySymbolicDimProduct(y); - SymbolicDimProduct newLhs, newRhs; - int64_t gcdFactor = gcd(std::abs(lhs.factor), std::abs(rhs.factor)); - if (!gcdFactor) return std::make_pair(std::move(newLhs), std::move(newRhs)); + SymbolicDimProduct new_lhs, new_rhs; + int64_t gcd_factor = std::gcd(std::abs(lhs.factor), std::abs(rhs.factor)); + if (!gcd_factor) + return std::make_pair(std::move(new_lhs), std::move(new_rhs)); if (std::abs(lhs.factor) < std::abs(rhs.factor)) { - if (lhs.factor < 0) gcdFactor = -gcdFactor; + if (lhs.factor < 0) gcd_factor = -gcd_factor; } else { - if (rhs.factor < 0) gcdFactor = -gcdFactor; + if (rhs.factor < 0) gcd_factor = -gcd_factor; } - newLhs.factor = lhs.factor / gcdFactor; - newRhs.factor = rhs.factor / gcdFactor; + new_lhs.factor = lhs.factor / gcd_factor; + new_rhs.factor = rhs.factor / gcd_factor; - std::unordered_map lhsSymbolMap; - std::unordered_map rhsSymbolMap; - for (SymbolicDim op : lhs.symbols) ++lhsSymbolMap[op]; - for (SymbolicDim op : rhs.symbols) ++rhsSymbolMap[op]; + std::unordered_map lhs_symbol_map; + std::unordered_map rhs_symbol_map; + for (SymbolicDim op : lhs.symbols) ++lhs_symbol_map[op]; + for (SymbolicDim op : rhs.symbols) ++rhs_symbol_map[op]; for (SymbolicDim op : lhs.symbols) { - auto it = rhsSymbolMap.find(op); - if (it != rhsSymbolMap.end() && op.getKnownNonSizeZero()) { - if (--it->second == 0) rhsSymbolMap.erase(it); + auto it = rhs_symbol_map.find(op); + if (it != rhs_symbol_map.end() && op.GetKnownNonSizeZero()) { + if (--it->second == 0) rhs_symbol_map.erase(it); continue; } - 
newLhs.symbols.push_back(op); + new_lhs.symbols.push_back(op); } for (SymbolicDim op : rhs.symbols) { - auto it = lhsSymbolMap.find(op); - if (it != lhsSymbolMap.end() && op.getKnownNonSizeZero()) { - if (--it->second == 0) lhsSymbolMap.erase(it); + auto it = lhs_symbol_map.find(op); + if (it != lhs_symbol_map.end() && op.GetKnownNonSizeZero()) { + if (--it->second == 0) lhs_symbol_map.erase(it); continue; } - newRhs.symbols.push_back(op); + new_rhs.symbols.push_back(op); } - if (!newLhs.factor) newLhs.symbols.clear(); - if (!newRhs.factor) newRhs.symbols.clear(); + if (!new_lhs.factor) new_lhs.symbols.clear(); + if (!new_rhs.factor) new_rhs.symbols.clear(); - return std::make_pair(std::move(newLhs), std::move(newRhs)); + return std::make_pair(std::move(new_lhs), std::move(new_rhs)); } SymbolicDimProduct SymbolicDimMgr::SimplifySymbolicDimProduct( @@ -197,13 +292,13 @@ SymbolicDimProduct SymbolicDimMgr::SimplifySymbolicDimProduct( for (SymbolicDim op : x.symbols) copied.push_back(GetRootSymbolicDim(op)); sort(copied.begin(), copied.end(), [&](SymbolicDim lhs, SymbolicDim rhs) { - return CompareSymbolicDimNames(lhs.getSymName(), rhs.getSymName()); + return CompareSymbolicDimNames(lhs.GetSymName(), rhs.GetSymName()); }); SymbolicDimProduct newX; newX.factor = x.factor; for (SymbolicDim op : copied) { if (!op.IsDynamic()) { - newX.factor *= op.getValue(); + newX.factor *= op.GetDimSize(); } else { newX.symbols.push_back(op); } @@ -222,19 +317,19 @@ const std::string SymbolicDimMgr::GetNextName() { SymbolicDimMgr::SymbolicDimMgr(ModuleOp m) : m_(m) { for (auto op : *(m.block())) { if (op->isa()) { - symbolTable_ = SymbolTable(op); + symbol_table_ = SymbolTable(op); return; } } Builder builder = Builder(m_.ir_context(), m_.block(), m_.block()->begin()); dialect::FuncOp func = builder.Build(); - symbolTable_ = SymbolTable(func); + symbol_table_ = SymbolTable(func); } SymbolicDim SymbolicDimMgr::NewSymbolicDim(const std::string& name) { - auto funcOp = symbolTable_.getOp()->dyn_cast(); - assert(funcOp); - Builder builder = Builder(m_.ir_context(), funcOp.block()); + auto func_op = symbol_table_.getOp()->dyn_cast(); + assert(func_op); + Builder builder = Builder(m_.ir_context(), func_op.block()); // default setting dim != 0 dialect::SymbolicDim symbol = builder.Build(name.empty() ?
GetNextName() : name, @@ -244,7 +339,7 @@ SymbolicDim SymbolicDimMgr::NewSymbolicDim(const std::string& name) { false, true); symbolDimUnionSet_[symbol] = symbol; - symbolTable_.insert(symbol); + symbol_table_.insert(symbol); return symbol; } @@ -255,11 +350,11 @@ SymbolicDim SymbolicDimMgr::NewConstantSymbolicDim(int64_t val) { it = constantSymbolicDimMap_ .insert(std::make_pair(val, NewSymbolicDim(name))) .first; - it->second.updateValue(val); - if (val == -1) it->second.updateKnownNegativeOne(true); - if (val >= 0) it->second.updateKnownNonNegative(true); - if (val != 1) it->second.updateKnownNonSizeOne(true); - if (val != 0) it->second.updateKnownNonSizeZero(true); + it->second.SetDimSize(val); + if (val == -1) it->second.UpdateKnownNegativeOne(true); + if (val >= 0) it->second.UpdateKnownNonNegative(true); + if (val != 1) it->second.UpdateKnownNonSizeOne(true); + if (val != 0) it->second.UpdateKnownNonSizeZero(true); } return GetRootSymbolicDim(it->second); } @@ -298,7 +393,7 @@ bool SymbolicDimMgr::MapSymbolicDimEqual(SymbolicDim lhs, SymbolicDim rhs) { SymbolicDim rhsRoot = GetRootSymbolicDim(rhs); if (lhsRoot != rhsRoot) { - if (CompareSymbolicDimNames(lhsRoot.getSymName(), rhsRoot.getSymName())) { + if (CompareSymbolicDimNames(lhsRoot.GetSymName(), rhsRoot.GetSymName())) { if (!lhsRoot.Merge(rhsRoot)) return false; symbolDimUnionSet_[rhsRoot] = lhsRoot; } else { @@ -311,32 +406,32 @@ bool SymbolicDimMgr::MapSymbolicDimEqual(SymbolicDim lhs, SymbolicDim rhs) { SymbolicDimProduct* SymbolicDimMgr::SymbolicDimProductDivide( const SymbolicDimProduct& lhs, const SymbolicDimProduct& rhs) { - SymbolicDimProduct newLhs, newRhs; - std::tie(newLhs, newRhs) = SimplifySymbolicDimProductPair(lhs, rhs); + SymbolicDimProduct new_lhs, new_rhs; + std::tie(new_lhs, new_rhs) = SimplifySymbolicDimProductPair(lhs, rhs); - if (newLhs.factor == 0 || newRhs.factor == 0) return nullptr; - if (newLhs.factor % newRhs.factor != 0) return nullptr; - if (newLhs.symbols.size() < newRhs.symbols.size()) return nullptr; + if (new_lhs.factor == 0 || new_rhs.factor == 0) return nullptr; + if (new_lhs.factor % new_rhs.factor != 0) return nullptr; + if (new_lhs.symbols.size() < new_rhs.symbols.size()) return nullptr; SymbolicDimProduct* result = new SymbolicDimProduct(); - result->factor = newLhs.factor / newRhs.factor; + result->factor = new_lhs.factor / new_rhs.factor; - std::unordered_map symProcMap; - for (SymbolicDim sym : newRhs.symbols) ++symProcMap[sym]; + std::unordered_map sym_proc_map; + for (SymbolicDim sym : new_rhs.symbols) ++sym_proc_map[sym]; - for (SymbolicDim sym : newLhs.symbols) { - auto it = symProcMap.find(sym); - if (it == symProcMap.end()) { + for (SymbolicDim sym : new_lhs.symbols) { + auto it = sym_proc_map.find(sym); + if (it == sym_proc_map.end()) { result->symbols.push_back(sym); continue; } if (--it->second == 0) { - symProcMap.erase(it); + sym_proc_map.erase(it); continue; } } - if (!symProcMap.empty()) return nullptr; + if (!sym_proc_map.empty()) return nullptr; return result; } @@ -451,13 +546,13 @@ bool SymbolicDimMgr::UpdateProductEqualityMap() { bool SymbolicDimMgr::IsSymbolicDimProductEqual(const SymbolicDimProduct& lhs, const SymbolicDimProduct& rhs) { - SymbolicDimProduct newLhs, newRhs; - std::tie(newLhs, newRhs) = SimplifySymbolicDimProductPair(lhs, rhs); + SymbolicDimProduct new_lhs, new_rhs; + std::tie(new_lhs, new_rhs) = SimplifySymbolicDimProductPair(lhs, rhs); // early return for identity case. 
- if (newLhs == newRhs) return true; + if (new_lhs == new_rhs) return true; IR_ENFORCE(UpdateProductEqualityMap(), "Update product equality map failed."); - return IsMultipleOfKnownSymbolicDimProductEqualPair(newLhs, newRhs); + return IsMultipleOfKnownSymbolicDimProductEqualPair(new_lhs, new_rhs); } bool SymbolicDimMgr::Save() { @@ -469,7 +564,7 @@ bool SymbolicDimMgr::Save() { assert(sym); SymbolicDim root = GetRootSymbolicDim(sym); Attribute rootSymbol = - StrAttribute::get(m_->ir_context(), root.getSymName()); + StrAttribute::get(m_->ir_context(), root.GetSymName()); newAttrs.push_back(rootSymbol); } return ArrayAttribute::get(m_->ir_context(), newAttrs); @@ -477,13 +572,13 @@ bool SymbolicDimMgr::Save() { // TODO(liujinnan): update attributes attached in DenseTensorType for (auto op : *(m_.block())) { - if (!op->HasAttribute(SymbolicDim::getSymbolicDimAttrName())) continue; + if (!op->HasAttribute(SymbolicDim::GetSymbolicDimAttrName())) continue; auto attrs = - op->attribute(SymbolicDim::getSymbolicDimAttrName()); + op->attribute(SymbolicDim::GetSymbolicDimAttrName()); auto symbolicShapeAttr = updateAttrs(attrs, [&](const std::string& name) { - return symbolTable_.Lookup(name); + return symbol_table_.Lookup(name); }); - op->set_attribute(SymbolicDim::getSymbolicDimAttrName(), symbolicShapeAttr); + op->set_attribute(SymbolicDim::GetSymbolicDimAttrName(), symbolicShapeAttr); } if (!UpdateProductEqualityMap()) { return false; @@ -493,24 +588,24 @@ bool SymbolicDimMgr::Save() { // TODO(liujinnan): collect uses in value. auto collectUsedSymbols = [&](ArrayAttribute attrs) { for (Attribute attr : attrs.AsVector()) { - auto sym = symbolTable_.Lookup( + auto sym = symbol_table_.Lookup( attr.dyn_cast().AsString()); assert(sym); if (usedSymbolicOps.insert(sym).second) - usedSymbolNames.push_back(sym.getSymName()); + usedSymbolNames.push_back(sym.GetSymName()); } }; for (auto op : *(m_.block())) { - if (!op->HasAttribute(SymbolicDim::getSymbolicDimAttrName())) continue; + if (!op->HasAttribute(SymbolicDim::GetSymbolicDimAttrName())) continue; auto attrs = - op->attribute(SymbolicDim::getSymbolicDimAttrName()); + op->attribute(SymbolicDim::GetSymbolicDimAttrName()); collectUsedSymbols(attrs); } - auto funcOp = symbolTable_.getOp()->dyn_cast(); - assert(funcOp); + auto func_op = symbol_table_.getOp()->dyn_cast(); + assert(func_op); for (auto& p : symbolDimUnionSet_) { if (!usedSymbolicOps.count(p.first)) { - funcOp.block()->erase(*(p.first.operation())); + func_op.block()->erase(*(p.first.operation())); } } @@ -553,18 +648,18 @@ bool SymbolicDimMgr::Save() { std::unordered_map name2Symbol; for (SymbolicDim op : usedSymbolicOps) { - auto name = op.getSymName(); - op.updateSymName(nameMapping[name]); + auto name = op.GetSymName(); + op.SetSymName(nameMapping[name]); name2Symbol[name] = op; } for (auto op : *(m_.block())) { - if (!op->HasAttribute(SymbolicDim::getSymbolicDimAttrName())) continue; + if (!op->HasAttribute(SymbolicDim::GetSymbolicDimAttrName())) continue; auto attrs = - op->attribute(SymbolicDim::getSymbolicDimAttrName()); + op->attribute(SymbolicDim::GetSymbolicDimAttrName()); auto symbolicShapeAttr = updateAttrs( attrs, [&](const std::string& name) { return name2Symbol[name]; }); - op->set_attribute(SymbolicDim::getSymbolicDimAttrName(), symbolicShapeAttr); + op->set_attribute(SymbolicDim::GetSymbolicDimAttrName(), symbolicShapeAttr); } // TODO(liujinnan): update attributes attached to values. 
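// A self-contained toy model of the numeric cancellation that
// IsSymbolicDimProductEqual relies on above: SimplifySymbolicDimProductPair
// divides both factors by their gcd (and drops shared symbols known to be
// non-zero) before consulting the known-equality table. ToyProduct and
// CancelGcd are hypothetical names for illustration, not part of this patch:
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <numeric>
#include <string>
#include <vector>

struct ToyProduct {
  int64_t factor = 1;
  std::vector<std::string> symbols;  // e.g. {"S0", "S1"}
};

// Divide both factors by their gcd, mirroring the sign handling in
// SimplifySymbolicDimProductPair: the gcd takes the sign of the
// smaller-magnitude factor.
void CancelGcd(ToyProduct& lhs, ToyProduct& rhs) {
  int64_t g = std::gcd(std::abs(lhs.factor), std::abs(rhs.factor));
  if (!g) return;  // both factors are 0; nothing to cancel
  if (std::abs(lhs.factor) < std::abs(rhs.factor)) {
    if (lhs.factor < 0) g = -g;
  } else {
    if (rhs.factor < 0) g = -g;
  }
  lhs.factor /= g;
  rhs.factor /= g;
}

int main() {
  ToyProduct lhs{4, {"S0", "S1"}};  // represents 4 * S0 * S1
  ToyProduct rhs{2, {"S1"}};        // represents 2 * S1
  CancelGcd(lhs, rhs);
  // 4*S0*S1 vs 2*S1 becomes 2*S0*S1 vs 1*S1; if S1 were additionally known
  // non-zero, the shared S1 could be cancelled too, leaving 2*S0 vs 1.
  assert(lhs.factor == 2 && rhs.factor == 1);
  return 0;
}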
@@ -573,18 +668,18 @@ bool SymbolicDimMgr::Save() { } bool SymbolicDimMgr::SaveShapeConstraintGraph() { - auto funcOp = symbolTable_.getOp()->dyn_cast(); - assert(funcOp); - auto op_it = funcOp.block()->rbegin(); - while (op_it != funcOp.block()->rend()) { + auto func_op = symbol_table_.getOp()->dyn_cast(); + assert(func_op); + auto op_it = func_op.block()->rbegin(); + while (op_it != func_op.block()->rend()) { if (((*op_it)->isa()) || ((*op_it)->isa())) op_it++; else - op_it = decltype(op_it)(funcOp.block()->erase(*(*op_it))); + op_it = decltype(op_it)(func_op.block()->erase(*(*op_it))); } - Builder builder = Builder(m_->ir_context(), funcOp.block()); + Builder builder = Builder(m_->ir_context(), func_op.block()); auto build_operands = [&](const SymbolicDimProduct& prod) { std::vector values; @@ -597,7 +692,7 @@ bool SymbolicDimMgr::SaveShapeConstraintGraph() { ->result(0)); } for (SymbolicDim sym : prod.symbols) { - values.push_back(builder.Build(sym.getSymName()).out()); + values.push_back(builder.Build(sym.GetSymName()).out()); } return values; }; @@ -618,113 +713,6 @@ bool SymbolicDimMgr::SaveShapeConstraintGraph() { return true; } -bool ShapeAnalysis::IsSameNumElements(Value lhs, Value rhs) { - if (lhs == rhs) return true; - auto lhsTy = lhs.type().dyn_cast(); - auto rhsTy = rhs.type().dyn_cast(); - - if (!lhsTy || !rhsTy || !lhsTy.HasRank() || !rhsTy.HasRank()) return false; - - return IsProductEqual(lhs, 0, lhsTy.GetRank(), rhs, 0, rhsTy.GetRank()); -} - -bool ShapeAnalysis::IsProductEqual( - Value lhs, int lhsFrom, int lhsTo, Value rhs, int rhsFrom, int rhsTo) { - std::vector lhsDimIdxs, rhsDimIdxs; - lhsDimIdxs.reserve(lhsTo - lhsFrom); - rhsDimIdxs.reserve(rhsTo - rhsFrom); - for (int i = lhsFrom; i < lhsTo; ++i) lhsDimIdxs.push_back(i); - for (int i = rhsFrom; i < rhsTo; ++i) rhsDimIdxs.push_back(i); - - return IsProductEqual(lhs, lhsDimIdxs, rhs, rhsDimIdxs); -} - -SymbolicDimShapeAnalysis::SymbolicDimShapeAnalysis(ModuleOp m) - : m_(m), mgr_(m) { - mgr_.Load(); - for (auto op : *(m_.block())) { - auto tieShapeOp = op->dyn_cast(); - if (!tieShapeOp) continue; - Value result = tieShapeOp.value(); - auto& symbols = value2SymDims_[result]; - auto attrs = - tieShapeOp - .attribute(SymbolicDim::getSymbolicDimAttrName()) - .AsVector(); - for (const auto& attr : attrs) { - auto symOp = mgr_.symbolTable().Lookup( - attr.dyn_cast().AsString()); - if (!symOp) continue; - symbols.push_back(symOp); - } - } -} - -SymbolicDimShapeAnalysis::~SymbolicDimShapeAnalysis() { mgr_.Save(); } - -bool SymbolicDimShapeAnalysis::IsShapeEqual(Value lhs, Value rhs) { - if (lhs == rhs) return true; - - auto lhsTy = lhs.type().dyn_cast(); - auto rhsTy = rhs.type().dyn_cast(); - - if (!lhsTy || !rhsTy || !lhsTy.HasRank() || !rhsTy.HasRank()) return false; - - if (lhsTy.HasStaticShape() && rhsTy.HasStaticShape()) { - return vectorize(lhsTy.GetShape()) == vectorize(rhsTy.GetShape()); - } - - auto lhsIt = value2SymDims_.find(lhs); - auto rhsIt = value2SymDims_.find(rhs); - - if (lhsIt == value2SymDims_.end() || rhsIt == value2SymDims_.end() || - lhsIt->second.size() != rhsIt->second.size()) - return false; - - std::vector lhsSyms; - std::vector rhsSyms; - for (auto sym : lhsIt->second) { - lhsSyms.push_back(mgr_.GetRootSymbolicDim(sym)); - } - for (auto sym : rhsIt->second) { - rhsSyms.push_back(mgr_.GetRootSymbolicDim(sym)); - } - return lhsSyms == rhsSyms; -} - -bool SymbolicDimShapeAnalysis::IsProductEqual(Value lhs, - std::vector lhsDimIdxs, - Value rhs, - std::vector rhsDimIdxs) { - SymbolicDimProduct 
lhsProd; - SymbolicDimProduct rhsProd; - - auto buildSymbolicDimProduct = - [&](SymbolicDimProduct& prod, Value value, std::vector dimIdxs) { - auto ty = value.type().dyn_cast(); - auto it = value2SymDims_.find(value); - if (!ty || !ty.HasRank()) return false; - for (int idx : dimIdxs) { - if (ty.GetShape()[idx] == ShapedTypeInterface::kDynamic) { - if (it == value2SymDims_.end() || - static_cast(it->second.size()) <= idx) - return false; - prod.symbols.push_back(it->second[idx]); - } else { - prod.factor *= ty.GetShape()[idx]; - } - } - return true; - }; - - if (!buildSymbolicDimProduct(lhsProd, lhs, lhsDimIdxs) || - !buildSymbolicDimProduct(rhsProd, rhs, rhsDimIdxs)) { - return false; - } - - return mgr_.IsSymbolicDimProductEqual(lhsProd, rhsProd); -} - ShapeComputationIRAnalysis::ShapeComputationIRAnalysis(ModuleOp m, SymbolicDimMgr& mgr) : m_(m), mgr_(mgr) {} @@ -776,9 +764,9 @@ bool ShapeComputationIRAnalysis::BuildShapeOnOperation(Operation* op) { if (op->isa()) { Value value = op->operand_source(0); std::vector symbols; - if (op->HasAttribute(SymbolicDim::getSymbolicDimAttrName())) { + if (op->HasAttribute(SymbolicDim::GetSymbolicDimAttrName())) { auto attrs = - op->attribute(SymbolicDim::getSymbolicDimAttrName()) + op->attribute(SymbolicDim::GetSymbolicDimAttrName()) .AsVector(); for (Attribute attr : attrs) { auto sym = mgr_.symbolTable().Lookup( @@ -792,10 +780,10 @@ bool ShapeComputationIRAnalysis::BuildShapeOnOperation(Operation* op) { std::vector attrs; for (SymbolicDim sym : symbols) { Attribute rootSymbol = - StrAttribute::get(m_->ir_context(), sym.getSymName()); + StrAttribute::get(m_->ir_context(), sym.GetSymName()); attrs.push_back(rootSymbol); } - op->set_attribute(SymbolicDim::getSymbolicDimAttrName(), + op->set_attribute(SymbolicDim::GetSymbolicDimAttrName(), ArrayAttribute::get(m_->ir_context(), attrs)); } rankedTensor2SymDims_[value] = std::move(symbols); @@ -808,12 +796,12 @@ bool ShapeComputationIRAnalysis::BuildShapeOnOperation(Operation* op) { } bool ShapeComputationIRAnalysis::BuildShapeOnValue(Value value) { - Type ty = value.type(); - if (IsIntOrIndex(ty)) { + Type type = value.type(); + if (IsIntOrIndex(type)) { SymbolicDim sym = mgr_.NewSymbolicDim(); value2SymDim_[value] = sym; - } else if (IsCandidateShapeTensorType(ty)) { - auto shapedTy = ty.dyn_cast(); + } else if (IsCandidateShapeTensorType(type)) { + auto shapedTy = type.dyn_cast(); std::vector symbols; for (size_t i = 0, d = shapedTy.GetShape()[0]; i < d; ++i) symbols.push_back(mgr_.NewSymbolicDim()); @@ -835,8 +823,8 @@ bool ShapeComputationIRAnalysis::ApplyOpConstraint(Operation* op) { bool ShapeComputationIRAnalysis::ApplyIndexOpConstraint(Operation* op) { if (op->num_results() == 0) return true; - Type ty = op->result(0).type(); - if (!IsIntOrIndex(ty)) return true; + Type type = op->result(0).type(); + if (!IsIntOrIndex(type)) return true; if (auto dimOp = op->dyn_cast()) { int64_t dimIndex = dimOp.index() @@ -844,7 +832,7 @@ bool ShapeComputationIRAnalysis::ApplyIndexOpConstraint(Operation* op) { .owner() ->attribute("value") .data(); - value2SymDim_[dimOp.out()].updateKnownNonNegative(true); + value2SymDim_[dimOp.out()].UpdateKnownNonNegative(true); if (!mgr_.MapSymbolicDimEqual( value2SymDim_[dimOp.out()], rankedTensor2SymDims_[dimOp.source()][dimIndex])) { @@ -869,7 +857,7 @@ bool ShapeComputationIRAnalysis::ApplyTieShapeOpConstraint(Operation* op) { if (!mgr_.MapSymbolicDimEqual(value2SymDim_[tieShape.dims()[idx]], value[idx])) return false; - 
mgr_.GetRootSymbolicDim(value[idx]).updateKnownNonNegative(true); + mgr_.GetRootSymbolicDim(value[idx]).UpdateKnownNonNegative(true); } } return true; @@ -881,8 +869,8 @@ bool IsIntOrIndex(Type type) { type.isa() || type.isa(); } -bool IsCandidateShapeTensorType(Type ty) { - if (auto tensorTy = ty.dyn_cast()) { +bool IsCandidateShapeTensorType(Type type) { + if (auto tensorTy = type.dyn_cast()) { auto shapedTy = tensorTy.dyn_cast(); return (shapedTy.GetRank() == 1 && shapedTy.HasStaticShape() && IsIntOrIndex(shapedTy.GetElementType()) && diff --git a/paddle/pir/dialect/shape/utils/shape_utils.h b/paddle/pir/dialect/shape/utils/shape_utils.h index bb6dd58cebb26..3388971d32aac 100644 --- a/paddle/pir/dialect/shape/utils/shape_utils.h +++ b/paddle/pir/dialect/shape/utils/shape_utils.h @@ -14,20 +14,39 @@ #pragma once -#include -#include -#include -#include -#include -#include -#include "paddle/pir/core/builtin_attribute.h" -#include "paddle/pir/core/builtin_op.h" -#include "paddle/pir/core/builtin_type_interfaces.h" -#include "paddle/pir/core/utils.h" -#include "paddle/pir/dialect/shape/ir/shape_op.h" +#include "paddle/pir/dialect/shape/utils/symbol_table.h" namespace pir { +// Helper class to query and manipulate shape constraint IR on buffer level. +class ShapeAnalysis { + public: + virtual ~ShapeAnalysis() = default; + + // Returns true if the two values have the same symbolic shape. + virtual bool IsShapeEqual(Value lhs, Value rhs) = 0; + + // Suppose: + // lhs_dim_idxs = {ld0, ld1, ...} + // rhs_dim_idxs = {rd0, rd1, ...} + // Returns true if: + // lhs.shape[ld0] * lhs.shape[ld1] * ... == + // rhs.shape[rd0] * rhs.shape[rd1] * ... + virtual bool IsProductEqual(Value lhs, + std::vector lhs_dim_idxs, + Value rhs, + std::vector rhs_dim_idxs) = 0; + + // Returns true if: + // lhs.shape[lhs_from] * ... lhs.shape[lhs_to-1] == + // rhs.shape[rhs_from] * ... rhs.shape[rhs_to-1] + virtual bool IsProductEqual( + Value lhs, int lhs_from, int lhs_to, Value rhs, int rhs_from, int rhs_to); + + // Returns true if the two values have the same number of elements. + virtual bool IsSameNumElements(Value lhs, Value rhs); +}; + using dialect::SymbolicDim; struct SymbolicDimProduct { @@ -45,42 +64,6 @@ struct SymbolicDimProduct { } }; -class SymbolTable { - public: - explicit SymbolTable(Operation* symbolTableOp) - : symbolTableOp_(symbolTableOp) {} - SymbolTable() = default; - template - typename std::enable_if::value, - SymbolicDim>::type - Lookup(const std::string& name) const { - auto it = symbolTableMap_.find(name); - return it != symbolTableMap_.end() ?
it->second->dyn_cast() - : SymbolicDim(nullptr); - } - template - typename std::enable_if::value, - std::vector>::type - Lookup(const std::string& name) const { - std::vector res; - auto it = symbolFuncMap_.find(name); - if (it != symbolFuncMap_.end()) { - for (auto& p : it->second) { - res.push_back(p->dyn_cast()); - } - } - return res; - } - - const std::string insert(Operation* symbol); - Operation* getOp() const { return symbolTableOp_; } - - private: - Operation* symbolTableOp_; - std::unordered_map symbolTableMap_; - std::unordered_map> symbolFuncMap_; -}; - struct SymDimHasher { size_t operator()(const dialect::SymbolicDim& symbol) const noexcept { return std::hash{}(symbol.operation()); @@ -107,7 +90,7 @@ class SymbolicDimMgr { std::vector CreateSymbolicDimsForRankedValue(Value value); SymbolicDim GetRootSymbolicDim(SymbolicDim symbol); bool IsSymbolicDimEqual(SymbolicDim lhs, SymbolicDim rhs); - SymbolTable& symbolTable() { return symbolTable_; } + SymbolTable& symbolTable() { return symbol_table_; } bool MapSymbolicDimEqual(SymbolicDim lhs, SymbolicDim rhs); SymbolicDimProduct SimplifySymbolicDimProduct(const SymbolicDimProduct& x); std::pair @@ -134,7 +117,7 @@ class SymbolicDimMgr { private: ModuleOp m_; - SymbolTable symbolTable_; + SymbolTable symbol_table_; int64_t nextSymbolicIdx_ = 0; @@ -153,39 +136,45 @@ class SymbolicDimMgr { bool productEqualityMapUpdated_ = true; }; -class ShapeAnalysis { +// A subclass to implement `ShapeAnalysis` on buffer level. +// The implementation is based on shape constraint IR. +class ShapeConstraintIRAnalysis : public ShapeAnalysis { public: - virtual ~ShapeAnalysis() = default; + // Build shape-related analysis on the provided module op. + // This generally can be divided into two steps: + // 1. load existing shape constraint IR (e.g. symbolic dim ops); + // 2. build the mapping between memref values and symbolic dim ops. + explicit ShapeConstraintIRAnalysis(ModuleOp m); - virtual bool IsShapeEqual(Value lhs, Value rhs) = 0; - - virtual bool IsProductEqual(Value lhs, - std::vector lhsDimIdxs, - Value rhs, - std::vector rhsDimIdxs) = 0; - virtual bool IsProductEqual( - Value lhs, int lhsFrom, int lhsTo, Value rhs, int rhsFrom, int rhsTo); - virtual bool IsSameNumElements(Value lhs, Value rhs); -}; - -class SymbolicDimShapeAnalysis : public ShapeAnalysis { - public: - explicit SymbolicDimShapeAnalysis(ModuleOp m); - ~SymbolicDimShapeAnalysis(); + // Auto-save updated shape constraint IR when destroying. + ~ShapeConstraintIRAnalysis(); + // Returns the `SymbolicDimMgr` this object holds. SymbolicDimMgr& symbolicDimMgr() { return mgr_; } const SymbolicDimMgr& symbolicDimMgr() const { return mgr_; } + + // Returns true if the two values have the same symbolic shape. bool IsShapeEqual(Value lhs, Value rhs) override; + // Suppose: + // lhs_dim_idxs = {ld0, ld1, ...} + // rhs_dim_idxs = {rd0, rd1, ...} + // Returns true if: + // lhs.shape[ld0] * lhs.shape[ld1] * ... == + // rhs.shape[rd0] * rhs.shape[rd1] * ... bool IsProductEqual(Value lhs, - std::vector lhsDimIdxs, + std::vector lhs_dim_idxs, Value rhs, - std::vector rhsDimIdxs) override; + std::vector rhs_dim_idxs) override; private: + // The operation this analysis runs on. ModuleOp m_; + // The `SymbolicDimMgr` this analysis holds. SymbolicDimMgr mgr_; - std::unordered_map> value2SymDims_; + // Map a ranked memref value to an array of SymbolicDim ops, each + // representing one dimension size of the memref value.
+ std::unordered_map> value_to_sym_dims_; }; class ShapeComputationIRAnalysis { diff --git a/paddle/pir/dialect/shape/utils/symbol_table.cc b/paddle/pir/dialect/shape/utils/symbol_table.cc new file mode 100644 index 0000000000000..c4ed0807b0b43 --- /dev/null +++ b/paddle/pir/dialect/shape/utils/symbol_table.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pir/dialect/shape/utils/symbol_table.h" + +namespace pir { + +const std::string SymbolTable::insert(Operation* symbol) { + std::string name; + if (symbol->isa()) { + name = symbol->dyn_cast().GetSymName(); + symbol_table_map_.insert({name, symbol}); + } + + // TODO(liujinnan): add more constraint_func name branch. + if (symbol->isa()) { + name = "tie_product_equal"; + symbol_func_map_[name].emplace_back(symbol); + } + + return name; +} +} // namespace pir diff --git a/paddle/pir/dialect/shape/utils/symbol_table.h b/paddle/pir/dialect/shape/utils/symbol_table.h new file mode 100644 index 0000000000000..f85ba2cfb8099 --- /dev/null +++ b/paddle/pir/dialect/shape/utils/symbol_table.h @@ -0,0 +1,68 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/builtin_op.h" +#include "paddle/pir/core/builtin_type_interfaces.h" +#include "paddle/pir/core/utils.h" +#include "paddle/pir/dialect/shape/ir/shape_op.h" + +namespace pir { + +using dialect::SymbolicDim; +class SymbolTable { + public: + explicit SymbolTable(Operation* symbol_table_op) + : symbol_table_op_(symbol_table_op) {} + SymbolTable() = default; + template + typename std::enable_if::value, + SymbolicDim>::type + Lookup(const std::string& name) const { + auto it = symbol_table_map_.find(name); + return it != symbol_table_map_.end() ? 
it->second->dyn_cast() + : SymbolicDim(nullptr); + } + template + typename std::enable_if::value, + std::vector>::type + Lookup(const std::string& name) const { + std::vector res; + auto it = symbol_func_map_.find(name); + if (it != symbol_func_map_.end()) { + for (auto& p : it->second) { + res.push_back(p->dyn_cast()); + } + } + return res; + } + + const std::string insert(Operation* symbol); + Operation* getOp() const { return symbol_table_op_; } + + private: + Operation* symbol_table_op_; + std::unordered_map symbol_table_map_; + std::unordered_map> symbol_func_map_; +}; + +} // namespace pir diff --git a/test/cpp/pir/shape_dialect/symbolic_op_test.cc b/test/cpp/pir/shape_dialect/symbolic_op_test.cc index b2b62c7b46aa9..0f8ae6e204047 100644 --- a/test/cpp/pir/shape_dialect/symbolic_op_test.cc +++ b/test/cpp/pir/shape_dialect/symbolic_op_test.cc @@ -26,6 +26,7 @@ #include "paddle/pir/dialect/shape/ir/shape_dialect.h" #include "paddle/pir/dialect/shape/ir/shape_op.h" #include "paddle/pir/dialect/shape/utils/shape_utils.h" +#include "paddle/pir/dialect/shape/utils/symbol_table.h" pir::AttributeMap CreateAttributeMap( const std::vector &attribute_names, @@ -66,35 +67,37 @@ TEST(assist_struct_test, symbolic_dim) { pir::Program program(ctx); ctx->GetOrRegisterDialect(); pir::Builder builder = pir::Builder(ctx, program.block()); - pir::dialect::SymbolicDim symDim = builder.Build( + + pir::dialect::SymbolicDim sym_dim1 = builder.Build( "S0", 10, false, false, false, false); - pir::dialect::SymbolicDim symDim_ = builder.Build( + pir::dialect::SymbolicDim sym_dim2 = builder.Build( "S1", 10, false, false, false, false); - EXPECT_EQ(symDim.getValue(), 10); - EXPECT_EQ(symDim.getSymName(), "S0"); - EXPECT_FALSE(symDim.getKnownNegativeOne()); - EXPECT_FALSE(symDim.getKnownNonSizeOne()); - EXPECT_FALSE(symDim.getKnownNonSizeZero()); - EXPECT_FALSE(symDim.getKnownNonNegative()); - - EXPECT_FALSE(symDim.IsDynamic()); - EXPECT_TRUE(symDim.Merge(symDim_)); - - symDim.updateValue(20); - symDim.updateSymName("S2"); - symDim.updateKnownNegativeOne(true); - symDim.updateKnownNonSizeOne(true); - symDim.updateKnownNonSizeZero(true); - symDim.updateKnownNonNegative(true); - - EXPECT_FALSE(symDim.Merge(symDim_)); - - EXPECT_EQ(symDim.getValue(), 20); - EXPECT_EQ(symDim.getSymName(), "S2"); - EXPECT_TRUE(symDim.getKnownNegativeOne()); - EXPECT_TRUE(symDim.getKnownNonSizeOne()); - EXPECT_TRUE(symDim.getKnownNonSizeZero()); - EXPECT_TRUE(symDim.getKnownNonNegative()); + + EXPECT_EQ(sym_dim1.GetDimSize(), 10); + EXPECT_EQ(sym_dim1.GetSymName(), "S0"); + EXPECT_FALSE(sym_dim1.GetKnownNegativeOne()); + EXPECT_FALSE(sym_dim1.GetKnownNonSizeOne()); + EXPECT_FALSE(sym_dim1.GetKnownNonSizeZero()); + EXPECT_FALSE(sym_dim1.GetKnownNonNegative()); + + EXPECT_FALSE(sym_dim1.IsDynamic()); + EXPECT_TRUE(sym_dim1.Merge(sym_dim2)); + + sym_dim1.SetDimSize(20); + sym_dim1.SetSymName("S2"); + sym_dim1.UpdateKnownNegativeOne(true); + sym_dim1.UpdateKnownNonSizeOne(true); + sym_dim1.UpdateKnownNonSizeZero(true); + sym_dim1.UpdateKnownNonNegative(true); + + EXPECT_FALSE(sym_dim1.Merge(sym_dim2)); + + EXPECT_EQ(sym_dim1.GetDimSize(), 20); + EXPECT_EQ(sym_dim1.GetSymName(), "S2"); + EXPECT_TRUE(sym_dim1.GetKnownNegativeOne()); + EXPECT_TRUE(sym_dim1.GetKnownNonSizeOne()); + EXPECT_TRUE(sym_dim1.GetKnownNonSizeZero()); + EXPECT_TRUE(sym_dim1.GetKnownNonNegative()); } TEST(assist_struct_test, symbolic_dim_product) { @@ -150,13 +153,13 @@ TEST(assist_struct_test, symbolic_dim_mgr_simple) { std::vector symDimVec = 
symDimMgr.CreateSymbolicDimsForRankedValue(res); - EXPECT_EQ(symDimS0.getSymName(), "S0"); - EXPECT_EQ(symDimS1.getSymName(), "S1"); - EXPECT_EQ(symDimS1.getValue(), pir::ShapedTypeInterface::kDynamic); - EXPECT_EQ(symDimC10.getSymName(), "C10"); - EXPECT_EQ(symDimC10.getValue(), 10); - EXPECT_EQ(symDimVec[0].getSymName(), "S2"); - EXPECT_EQ(symDimVec[1].getSymName(), "C2"); + EXPECT_EQ(symDimS0.GetSymName(), "S0"); + EXPECT_EQ(symDimS1.GetSymName(), "S1"); + EXPECT_EQ(symDimS1.GetDimSize(), pir::ShapedTypeInterface::kDynamic); + EXPECT_EQ(symDimC10.GetSymName(), "C10"); + EXPECT_EQ(symDimC10.GetDimSize(), 10); + EXPECT_EQ(symDimVec[0].GetSymName(), "S2"); + EXPECT_EQ(symDimVec[1].GetSymName(), "C2"); EXPECT_EQ(symDimMgr.symbolTable().Lookup("S0"), symDimS0); EXPECT_EQ(symDimMgr.symbolTable().Lookup("C10"), @@ -296,10 +299,10 @@ TEST(assist_struct_test, symbolic_dim_mgr_complex) { auto arrayAttr = pir::ArrayAttribute::get(ctx, newAttrs); auto arrayAttrRef = pir::ArrayAttribute::get(ctx, newAttrsRef); auto arrayAttr_ = pir::ArrayAttribute::get(ctx, newAttrs_); - tieShapeOp->set_attribute(pir::dialect::SymbolicDim::getSymbolicDimAttrName(), + tieShapeOp->set_attribute(pir::dialect::SymbolicDim::GetSymbolicDimAttrName(), arrayAttr); tieShapeOp_->set_attribute( - pir::dialect::SymbolicDim::getSymbolicDimAttrName(), arrayAttr_); + pir::dialect::SymbolicDim::GetSymbolicDimAttrName(), arrayAttr_); EXPECT_TRUE(symDimMgr.Load()); @@ -362,10 +365,10 @@ TEST(assist_struct_test, symbolic_dim_mgr_complex) { EXPECT_TRUE(symDimMgr.IsSymbolicDimEqual(symDimS1, symDimS2)); EXPECT_TRUE(symDimMgr.IsSymbolicDimEqual(symDimS0, symDimS3)); EXPECT_TRUE(symDimMgr.IsSymbolicDimEqual(symDimS4, symDimS5)); - EXPECT_EQ(symDimS6.getValue(), 200); + EXPECT_EQ(symDimS6.GetDimSize(), 200); EXPECT_EQ(symDimMgr.symbolTable().Lookup("C20"), symDimC20); - EXPECT_EQ(symDimS7.getValue(), symDimC10.getValue()); + EXPECT_EQ(symDimS7.GetDimSize(), symDimC10.GetDimSize()); EXPECT_EQ(simplifiedProductS7.factor, 10); EXPECT_EQ(simplifiedProductS7.symbols.size(), static_cast(0)); EXPECT_EQ(newLhs.symbols.size(), static_cast(1)); @@ -384,7 +387,7 @@ TEST(assist_struct_test, symbolic_dim_mgr_complex) { pir::SymbolicDimMgr symDimMgr_(program.module_op()); EXPECT_TRUE(symDimMgr_.Load()); auto attrs = tieShapeOp.attribute( - pir::dialect::SymbolicDim::getSymbolicDimAttrName()); + pir::dialect::SymbolicDim::GetSymbolicDimAttrName()); EXPECT_FALSE( symDimMgr_.symbolTable().Lookup("S7")); EXPECT_EQ(symDimMgr_.symbolTable() @@ -469,13 +472,13 @@ TEST(shape_op, tie_shape) { std::vector newAttrs = {attrS0, attrS1}; auto arrayAttr = pir::ArrayAttribute::get(ctx, newAttrs); - tieShapeOp->set_attribute(pir::dialect::SymbolicDim::getSymbolicDimAttrName(), + tieShapeOp->set_attribute(pir::dialect::SymbolicDim::GetSymbolicDimAttrName(), arrayAttr); std::vector arrAttrVec = tieShapeOp ->attribute( - pir::dialect::SymbolicDim::getSymbolicDimAttrName()) + pir::dialect::SymbolicDim::GetSymbolicDimAttrName()) .AsVector(); EXPECT_EQ(tieShapeOpValue, res); @@ -483,7 +486,7 @@ TEST(shape_op, tie_shape) { EXPECT_EQ(arrAttrVec[0].dyn_cast(), attrS0); EXPECT_EQ(arrAttrVec[1].dyn_cast(), attrS1); EXPECT_TRUE(tieShapeOp->HasAttribute( - pir::dialect::SymbolicDim::getSymbolicDimAttrName())); + pir::dialect::SymbolicDim::GetSymbolicDimAttrName())); } TEST(shape_op, func_op) { @@ -561,17 +564,17 @@ TEST(assist_struct_test, shape_analysis) { auto attrOp5 = pir::ArrayAttribute::get(ctx, {attrS2}); tieShapeOp1->set_attribute( - 
pir::dialect::SymbolicDim::getSymbolicDimAttrName(), attrOp1); + pir::dialect::SymbolicDim::GetSymbolicDimAttrName(), attrOp1); tieShapeOp2->set_attribute( - pir::dialect::SymbolicDim::getSymbolicDimAttrName(), attrOp2); + pir::dialect::SymbolicDim::GetSymbolicDimAttrName(), attrOp2); tieShapeOp3->set_attribute( - pir::dialect::SymbolicDim::getSymbolicDimAttrName(), attrOp3); + pir::dialect::SymbolicDim::GetSymbolicDimAttrName(), attrOp3); tieShapeOp4->set_attribute( - pir::dialect::SymbolicDim::getSymbolicDimAttrName(), attrOp4); + pir::dialect::SymbolicDim::GetSymbolicDimAttrName(), attrOp4); tieShapeOp5->set_attribute( - pir::dialect::SymbolicDim::getSymbolicDimAttrName(), attrOp5); + pir::dialect::SymbolicDim::GetSymbolicDimAttrName(), attrOp5); - pir::SymbolicDimShapeAnalysis shapeAnalysis(program.module_op()); + pir::ShapeConstraintIRAnalysis shapeAnalysis(program.module_op()); EXPECT_TRUE(shapeAnalysis.IsShapeEqual(value3, value4)); EXPECT_FALSE(shapeAnalysis.IsShapeEqual(value1, value2)); EXPECT_FALSE(shapeAnalysis.IsShapeEqual(value1, value3)); From b2f8746d585216a2ef96dacda3db618559d88c40 Mon Sep 17 00:00:00 2001 From: gouzil <66515297+gouzil@users.noreply.github.com> Date: Mon, 9 Oct 2023 10:20:29 +0800 Subject: [PATCH 09/62] [CodeStyle] clean some `#noqa: F401` step: 1 (#57860) --- python/paddle/amp/__init__.py | 36 +++--- python/paddle/audio/backends/__init__.py | 8 +- python/paddle/audio/features/__init__.py | 12 +- python/paddle/audio/functional/__init__.py | 21 ++-- python/paddle/autograd/__init__.py | 27 +++-- python/paddle/base/reader.py | 6 +- python/paddle/callbacks.py | 20 ++-- python/paddle/cost_model/__init__.py | 2 +- python/paddle/dataset/__init__.py | 26 +++-- python/paddle/decomposition/primitives.py | 108 +++++++++--------- python/paddle/device/__init__.py | 18 +-- python/paddle/device/cuda/__init__.py | 3 +- python/paddle/distributed/__init__.py | 76 ++++++------ .../auto_parallel/static/dist_attribute.py | 8 +- .../auto_parallel/static/reshard.py | 2 +- python/paddle/distributed/collective.py | 26 +++-- python/paddle/distributed/fleet/__init__.py | 34 +++--- python/paddle/distributed/parallel.py | 5 +- python/paddle/hub.py | 6 +- python/paddle/linalg.py | 54 ++++----- 20 files changed, 271 insertions(+), 227 deletions(-) diff --git a/python/paddle/amp/__init__.py b/python/paddle/amp/__init__.py index 615e6c4f36d55..9984ba450afe7 100644 --- a/python/paddle/amp/__init__.py +++ b/python/paddle/amp/__init__.py @@ -12,20 +12,28 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .auto_cast import auto_cast # noqa: F401 -from .auto_cast import decorate # noqa: F401 -from .auto_cast import amp_guard # noqa: F401 -from .auto_cast import amp_decorate # noqa: F401 -from .amp_lists import white_list # noqa: F401 -from .amp_lists import black_list # noqa: F401 - -from . import grad_scaler # noqa: F401 -from .grad_scaler import GradScaler # noqa: F401 -from .grad_scaler import AmpScaler # noqa: F401 -from .grad_scaler import OptimizerState # noqa: F401 - -from . import debugging # noqa: F401 -from . import accuracy_compare # noqa: F401 +from .auto_cast import ( # noqa: F401 + auto_cast, + decorate, + amp_guard, + amp_decorate, +) +from .amp_lists import ( # noqa: F401 + white_list, + black_list, +) + +from . 
import ( # noqa: F401 + debugging, + grad_scaler, + accuracy_compare, +) + +from .grad_scaler import ( # noqa: F401 + GradScaler, + AmpScaler, + OptimizerState, +) from paddle.base import core from paddle.base.framework import ( diff --git a/python/paddle/audio/backends/__init__.py b/python/paddle/audio/backends/__init__.py index ac19a14c69a01..f089e5bfe9cd3 100644 --- a/python/paddle/audio/backends/__init__.py +++ b/python/paddle/audio/backends/__init__.py @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. from . import init_backend -from .init_backend import get_current_backend # noqa: F401 -from .init_backend import list_available_backends # noqa: F401 -from .init_backend import set_backend +from .init_backend import ( + get_current_backend, + list_available_backends, + set_backend, +) init_backend._init_set_audio_backend() diff --git a/python/paddle/audio/features/__init__.py b/python/paddle/audio/features/__init__.py index 3c0bf499f1eff..ac48d594ded93 100644 --- a/python/paddle/audio/features/__init__.py +++ b/python/paddle/audio/features/__init__.py @@ -11,12 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .layers import LogMelSpectrogram # noqa: F401 -from .layers import MelSpectrogram # noqa: F401 -from .layers import MFCC # noqa: F401 -from .layers import Spectrogram # noqa: F401 +from .layers import ( + LogMelSpectrogram, + MelSpectrogram, + MFCC, + Spectrogram, +) -__all__ = [ # noqa +__all__ = [ 'LogMelSpectrogram', 'MelSpectrogram', 'MFCC', diff --git a/python/paddle/audio/functional/__init__.py b/python/paddle/audio/functional/__init__.py index b7db53d6c22a6..caf1cf18c1a35 100644 --- a/python/paddle/audio/functional/__init__.py +++ b/python/paddle/audio/functional/__init__.py @@ -11,16 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .functional import compute_fbank_matrix # noqa: F401 -from .functional import create_dct # noqa: F401 -from .functional import fft_frequencies # noqa: F401 -from .functional import hz_to_mel # noqa: F401 -from .functional import mel_frequencies # noqa: F401 -from .functional import mel_to_hz # noqa: F401 -from .functional import power_to_db # noqa: F401 -from .window import get_window # noqa: F401 +from .functional import ( + compute_fbank_matrix, + create_dct, + fft_frequencies, + hz_to_mel, + mel_frequencies, + mel_to_hz, + power_to_db, +) -__all__ = [ # noqa +from .window import get_window + +__all__ = [ 'compute_fbank_matrix', 'create_dct', 'fft_frequencies', diff --git a/python/paddle/autograd/__init__.py b/python/paddle/autograd/__init__.py index acfb6447283c5..1cfd2734386f3 100644 --- a/python/paddle/autograd/__init__.py +++ b/python/paddle/autograd/__init__.py @@ -12,20 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ..base.dygraph.base import grad # noqa: F401 -from ..base.dygraph.base import enable_grad # noqa: F401 -from ..base.dygraph.base import no_grad_ as no_grad # noqa: F401 -from ..base.dygraph.base import is_grad_enabled # noqa: F401 -from ..base.dygraph.base import set_grad_enabled # noqa: F401 -from . 
import backward_mode # noqa: F401 -from .autograd import jacobian, hessian # noqa: F401 -from .backward_mode import backward # noqa: F401 -from .py_layer import PyLayer # noqa: F401 -from .py_layer import PyLayerContext # noqa: F401 +from ..base.dygraph.base import ( # noqa: F401 + grad, + enable_grad, + no_grad_ as no_grad, + is_grad_enabled, + set_grad_enabled, +) +from . import ( # noqa: F401 + backward_mode, + ir_backward, +) +from .autograd import jacobian, hessian +from .backward_mode import backward +from .py_layer import PyLayer, PyLayerContext from .saved_tensors_hooks import saved_tensors_hooks -from . import ir_backward -__all__ = [ # noqa +__all__ = [ 'jacobian', 'hessian', 'backward', diff --git a/python/paddle/base/reader.py b/python/paddle/base/reader.py index 8c2ddd16961da..7fcccf8910fc4 100644 --- a/python/paddle/base/reader.py +++ b/python/paddle/base/reader.py @@ -42,12 +42,12 @@ _copy_reader_var_, monkey_patch_reader_methods, ) -from .multiprocess_utils import _cleanup # noqa: F401 -from .multiprocess_utils import multiprocess_queue_set # noqa: F401 -from .multiprocess_utils import ( +from .multiprocess_utils import ( # noqa: F401 CleanupFuncRegistrar, + _cleanup, _cleanup_mmap, _set_SIGCHLD_handler, + multiprocess_queue_set, ) from .unique_name import UniqueNameGenerator diff --git a/python/paddle/callbacks.py b/python/paddle/callbacks.py index 960399c6b9796..6e94a9d7b67df 100644 --- a/python/paddle/callbacks.py +++ b/python/paddle/callbacks.py @@ -12,16 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .hapi.callbacks import Callback # noqa: F401 -from .hapi.callbacks import EarlyStopping # noqa: F401 -from .hapi.callbacks import LRScheduler # noqa: F401 -from .hapi.callbacks import ModelCheckpoint # noqa: F401 -from .hapi.callbacks import ProgBarLogger # noqa: F401 -from .hapi.callbacks import ReduceLROnPlateau # noqa: F401 -from .hapi.callbacks import VisualDL # noqa: F401 -from .hapi.callbacks import WandbCallback # noqa: F401 +from .hapi.callbacks import ( + Callback, + EarlyStopping, + LRScheduler, + ModelCheckpoint, + ProgBarLogger, + ReduceLROnPlateau, + VisualDL, + WandbCallback, +) -__all__ = [ # noqa +__all__ = [ 'Callback', 'ProgBarLogger', 'ModelCheckpoint', diff --git a/python/paddle/cost_model/__init__.py b/python/paddle/cost_model/__init__.py index e6907128642c6..6fd0ef63f3c5d 100644 --- a/python/paddle/cost_model/__init__.py +++ b/python/paddle/cost_model/__init__.py @@ -12,6 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .cost_model import CostModel # noqa: F401 +from .cost_model import CostModel __all__ = ['CostModel'] diff --git a/python/paddle/dataset/__init__.py b/python/paddle/dataset/__init__.py index 4b71ff6ac66f1..eaa8ab0ddfed1 100644 --- a/python/paddle/dataset/__init__.py +++ b/python/paddle/dataset/__init__.py @@ -15,18 +15,20 @@ Dataset package. """ -import paddle.dataset.mnist # noqa: F401 -import paddle.dataset.imikolov # noqa: F401 -import paddle.dataset.imdb # noqa: F401 -import paddle.dataset.cifar # noqa: F401 -import paddle.dataset.movielens # noqa: F401 -import paddle.dataset.conll05 # noqa: F401 -import paddle.dataset.uci_housing # noqa: F401 -import paddle.dataset.wmt14 # noqa: F401 -import paddle.dataset.wmt16 # noqa: F401 -import paddle.dataset.flowers # noqa: F401 -import paddle.dataset.voc2012 # noqa: F401 -import paddle.dataset.image # noqa: F401 +from . 
import ( # noqa: F401 + mnist, + imikolov, + imdb, + cifar, + movielens, + conll05, + uci_housing, + wmt14, + wmt16, + flowers, + voc2012, + image, +) # set __all__ as empty for not showing APIs under paddle.dataset __all__ = [] diff --git a/python/paddle/decomposition/primitives.py b/python/paddle/decomposition/primitives.py index c2c6fcb08dadc..0ac15489ae557 100644 --- a/python/paddle/decomposition/primitives.py +++ b/python/paddle/decomposition/primitives.py @@ -12,58 +12,58 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddle.tensor import abs # noqa: F401 -from paddle.tensor import acos # noqa: F401 -from paddle.tensor import acosh # noqa: F401 -from paddle.tensor import add # noqa: F401 -from paddle.tensor import asin # noqa: F401 -from paddle.tensor import asinh # noqa: F401 -from paddle.tensor import atan # noqa: F401 -from paddle.tensor import atanh # noqa: F401 -from paddle.tensor import broadcast_shape # noqa: F401 -from paddle.tensor import broadcast_to # noqa: F401 -from paddle.tensor import concat # noqa: F401 -from paddle.tensor import cos # noqa: F401 -from paddle.tensor import cosh # noqa: F401 -from paddle.tensor import cumprod # noqa: F401 -from paddle.tensor import cumsum # noqa: F401 -from paddle.tensor import digamma # noqa: F401 -from paddle.tensor import divide # noqa: F401 -from paddle.tensor import erf # noqa: F401 -from paddle.tensor import erfinv # noqa: F401 -from paddle.tensor import exp # noqa: F401 -from paddle.tensor import expm1 # noqa: F401 -from paddle.tensor import fill_constant # noqa: F401 -from paddle.tensor import full # noqa: F401 -from paddle.tensor import gather # noqa: F401 -from paddle.tensor import greater_equal # noqa: F401 -from paddle.tensor import lgamma # noqa: F401 -from paddle.tensor import log # noqa: F401 -from paddle.tensor import log1p # noqa: F401 -from paddle.tensor import logcumsumexp # noqa: F401 -from paddle.tensor import logit # noqa: F401 -from paddle.tensor import logsumexp # noqa: F401 -from paddle.tensor import max # noqa: F401 -from paddle.tensor import min # noqa: F401 -from paddle.tensor import multiply # noqa: F401 -from paddle.tensor import ones # noqa: F401 -from paddle.tensor import pow # noqa: F401 -from paddle.tensor import prod # noqa: F401 -from paddle.tensor import reshape # noqa: F401 -from paddle.tensor import rsqrt # noqa: F401 -from paddle.tensor import sign # noqa: F401 -from paddle.tensor import sin # noqa: F401 -from paddle.tensor import sinh # noqa: F401 -from paddle.tensor import sqrt # noqa: F401 -from paddle.tensor import subtract # noqa: F401 -from paddle.tensor import sum # noqa: F401 -from paddle.tensor import tan # noqa: F401 -from paddle.tensor import tanh # noqa: F401 -from paddle.tensor import tile # noqa: F401 -from paddle.tensor import uniform # noqa: F401 -from paddle.tensor import zeros # noqa: F401 -from paddle.tensor.creation import assign # noqa: F401 -from paddle.tensor.creation import zeros_like # noqa: F401 +from paddle.tensor import ( # noqa: F401 + abs, + acos, + acosh, + add, + asin, + asinh, + atan, + atanh, + broadcast_shape, + broadcast_to, + concat, + cos, + cosh, + cumprod, + cumsum, + digamma, + divide, + erf, + erfinv, + exp, + expm1, + fill_constant, + full, + gather, + greater_equal, + lgamma, + log, + log1p, + logcumsumexp, + logit, + logsumexp, + max, + min, + multiply, + ones, + pow, + prod, + reshape, + rsqrt, + sign, + sin, + sinh, + sqrt, + subtract, + sum, + tan, + tanh, + tile, + uniform, + zeros, 
+) +from paddle.tensor.creation import assign, zeros_like # noqa: F401 from paddle.tensor.manipulation import cast # noqa: F401 -from paddle.tensor.math import maximum # noqa: F401 -from paddle.tensor.math import minimum # noqa: F401 +from paddle.tensor.math import maximum, minimum # noqa: F401 diff --git a/python/paddle/device/__init__.py b/python/paddle/device/__init__.py index f8bf7b3b91968..7ee16ffcf5464 100644 --- a/python/paddle/device/__init__.py +++ b/python/paddle/device/__init__.py @@ -19,13 +19,17 @@ import paddle from paddle.base import core from paddle.base import framework -from paddle.base.framework import is_compiled_with_cinn # noqa: F401 -from paddle.base.framework import is_compiled_with_cuda # noqa: F401 -from paddle.base.framework import is_compiled_with_rocm # noqa: F401 -from . import cuda -from . import xpu - -__all__ = [ # noqa +from paddle.base.framework import ( + is_compiled_with_cinn, + is_compiled_with_cuda, + is_compiled_with_rocm, +) +from . import ( # noqa: F401 + cuda, + xpu, +) + +__all__ = [ 'get_cudnn_version', 'set_device', 'get_device', diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index cb57e674e2017..0a094319f893f 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -17,8 +17,7 @@ from paddle.base.wrapped_decorator import signature_safe_contextmanager from paddle.utils import deprecated -from .streams import Stream # noqa: F401 -from .streams import Event # noqa: F401 +from .streams import Stream, Event __all__ = [ 'Stream', diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index 7f641a5e6fa54..ce777fa73fd87 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -12,27 +12,32 @@ # See the License for the specific language governing permissions and # limitations under the License. -import atexit +import atexit # noqa: F401 from . 
import io -from .spawn import spawn # noqa: F401 -from .launch.main import launch # noqa: F401 -from .parallel import init_parallel_env # noqa: F401 -from .parallel import get_rank # noqa: F401 -from .parallel import get_world_size # noqa: F401 -from .parallel import ParallelEnv # noqa: F401 -from .parallel import DataParallel -from .parallel_with_gloo import gloo_init_parallel_env -from .parallel_with_gloo import gloo_barrier -from .parallel_with_gloo import gloo_release +from .spawn import spawn +from .launch.main import launch +from .parallel import ( # noqa: F401 + init_parallel_env, + get_rank, + get_world_size, + ParallelEnv, + DataParallel, +) +from .parallel_with_gloo import ( + gloo_init_parallel_env, + gloo_barrier, + gloo_release, +) -from paddle.distributed.fleet.dataset import InMemoryDataset # noqa: F401 -from paddle.distributed.fleet.dataset import QueueDataset # noqa: F401 -from paddle.distributed.fleet.base.topology import ParallelMode # noqa: F401 +from paddle.distributed.fleet.dataset import InMemoryDataset, QueueDataset +from paddle.distributed.fleet.base.topology import ParallelMode -from .collective import split # noqa: F401 -from .collective import new_group # noqa: F401 -from .collective import is_available -from .communication import ( +from .collective import ( + split, + new_group, + is_available, +) +from .communication import ( # noqa: F401 stream, ReduceOp, all_gather, @@ -59,31 +64,38 @@ wait, barrier, get_backend, -) # noqa: F401 +) -from .auto_parallel.process_mesh import ProcessMesh # noqa: F401 -from .auto_parallel.api import DistAttr # noqa: F401 +from .auto_parallel.process_mesh import ProcessMesh from .auto_parallel import shard_op # noqa: F401 -from .auto_parallel.api import shard_tensor # noqa: F401 -from .auto_parallel.api import dtensor_from_fn # noqa: F401 -from .auto_parallel.api import reshard # noqa: F401 -from .auto_parallel.api import shard_layer # noqa: F401 + +from .auto_parallel.api import ( + DistAttr, + shard_tensor, + dtensor_from_fn, + reshard, + shard_layer, +) from .fleet import BoxPSDataset # noqa: F401 -from .entry_attr import ProbabilityEntry # noqa: F401 -from .entry_attr import CountFilterEntry # noqa: F401 -from .entry_attr import ShowClickEntry # noqa: F401 +from .entry_attr import ( # noqa: F401 + ProbabilityEntry, + CountFilterEntry, + ShowClickEntry, +) from . import cloud_utils # noqa: F401 -from .sharding import group_sharded_parallel # noqa: F401 -from .sharding import save_group_sharded_model # noqa: F401 +from .sharding import ( # noqa: F401 + group_sharded_parallel, + save_group_sharded_model, +) -from . import rpc +from . 
import rpc # noqa: F401 -__all__ = [ # noqa +__all__ = [ "io", "spawn", "launch", diff --git a/python/paddle/distributed/auto_parallel/static/dist_attribute.py b/python/paddle/distributed/auto_parallel/static/dist_attribute.py index a8ee0e313669a..46e36e51138f5 100644 --- a/python/paddle/distributed/auto_parallel/static/dist_attribute.py +++ b/python/paddle/distributed/auto_parallel/static/dist_attribute.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License -from paddle.base.core import DistTensorSpec # noqa: F401 -from paddle.base.core import OperatorDistAttr # noqa: F401 -from paddle.base.core import TensorDistAttr # noqa: F401 +from paddle.base.core import ( # noqa: F401 + DistTensorSpec, + OperatorDistAttr, + TensorDistAttr, +) diff --git a/python/paddle/distributed/auto_parallel/static/reshard.py b/python/paddle/distributed/auto_parallel/static/reshard.py index facfe183c5d9a..cf1ed597536e3 100644 --- a/python/paddle/distributed/auto_parallel/static/reshard.py +++ b/python/paddle/distributed/auto_parallel/static/reshard.py @@ -1787,7 +1787,7 @@ def parse_op_desc( var_name, block, self.auto_parallel_main_prog ) for op_desc in op_desc_list: - if isinstance(op_desc, AllGatherOpDesc): # noqa: F401 + if isinstance(op_desc, AllGatherOpDesc): if var_name not in self.has_allgather.keys(): self.has_allgather[var_name] = [] if not self.has_allgather[var_name] or op_desc.group not in [ diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 04f92558bdcda..a2bac699bb542 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -22,18 +22,20 @@ from paddle.framework import in_dynamic_mode from .communication.group import Group, _add_new_group, is_initialized -from .fleet.layers.mpu.mp_ops import _c_concat # noqa: F401 -from .fleet.layers.mpu.mp_ops import _c_identity # noqa: F401 -from .fleet.layers.mpu.mp_ops import _c_lookup_table # noqa: F401 -from .fleet.layers.mpu.mp_ops import _c_softmax_with_cross_entropy # noqa: F401 -from .fleet.layers.mpu.mp_ops import _c_split # noqa: F401 -from .fleet.layers.mpu.mp_ops import _Linear # noqa: F401 -from .fleet.layers.mpu.mp_ops import _linear # noqa: F401 -from .fleet.layers.mpu.mp_ops import _mp_allreduce # noqa: F401 -from .fleet.layers.mpu.mp_ops import _parallel_embedding # noqa: F401 -from .fleet.layers.mpu.mp_ops import _parallel_linear # noqa: F401 -from .fleet.layers.mpu.mp_ops import _set_var_distributed # noqa: F401 -from .fleet.layers.mpu.mp_ops import split # noqa: F401 +from .fleet.layers.mpu.mp_ops import ( # noqa: F401 + _c_concat, + _c_identity, + _c_lookup_table, + _c_softmax_with_cross_entropy, + _c_split, + _Linear, + _linear, + _mp_allreduce, + _parallel_embedding, + _parallel_linear, + _set_var_distributed, + split, +) __all__ = [] diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py index 0cda5198ab3c9..a77e61bc2b401 100755 --- a/python/paddle/distributed/fleet/__init__.py +++ b/python/paddle/distributed/fleet/__init__.py @@ -13,30 +13,34 @@ # limitations under the License. 
# TODO: define distributed api under this directory, -from .base.role_maker import Role # noqa: F401 -from .base.role_maker import UserDefinedRoleMaker # noqa: F401 -from .base.role_maker import PaddleCloudRoleMaker # noqa: F401 -from .base.distributed_strategy import DistributedStrategy # noqa: F401 -from .base.util_factory import UtilBase # noqa: F401 -from .dataset import DatasetBase # noqa: F401 -from .dataset import InMemoryDataset # noqa: F401 -from .dataset import QueueDataset # noqa: F401 -from .dataset import FileInstantDataset # noqa: F401 -from .dataset import BoxPSDataset # noqa: F401 -from .data_generator.data_generator import MultiSlotDataGenerator # noqa: F401 +from .base.role_maker import ( + Role, + UserDefinedRoleMaker, + PaddleCloudRoleMaker, +) +from .base.distributed_strategy import DistributedStrategy +from .base.util_factory import UtilBase +from .dataset import ( # noqa: F401 + DatasetBase, + InMemoryDataset, + QueueDataset, + FileInstantDataset, + BoxPSDataset, +) +from .data_generator.data_generator import MultiSlotDataGenerator from .data_generator.data_generator import ( MultiSlotStringDataGenerator, -) # noqa: F401 +) from . import metrics # noqa: F401 from .base.topology import CommunicateTopology -from .base.topology import HybridCommunicateGroup # noqa: F401 +from .base.topology import HybridCommunicateGroup from .fleet import Fleet from .model import distributed_model from .optimizer import distributed_optimizer from .scaler import distributed_scaler from .utils import log_util -__all__ = [ # noqa +__all__ = [ "CommunicateTopology", "UtilBase", "HybridCommunicateGroup", @@ -103,4 +107,4 @@ get_log_level_code = log_util.get_log_level_code get_log_level_name = log_util.get_log_level_name save_cache_table = fleet.save_cache_table -from .. import auto_parallel as auto +from .. import auto_parallel as auto # noqa: F401 diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 3815d0f475fbe..bec8b72fac52c 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -19,8 +19,7 @@ import warnings from collections import OrderedDict, namedtuple from contextlib import contextmanager -from multiprocessing import Manager # noqa: F401 -from multiprocessing import Process # noqa: F401 +from multiprocessing import Manager, Process import numpy as np @@ -43,7 +42,7 @@ _get_global_group, is_initialized, ) -from paddle.distributed.fleet.base.private_helper_function import ( # noqa: F401 +from paddle.distributed.fleet.base.private_helper_function import ( wait_server_ready, ) from paddle.distributed.fleet.launch_utils import check_backend diff --git a/python/paddle/hub.py b/python/paddle/hub.py index 1960d98e95b67..3e5da36cd93bb 100644 --- a/python/paddle/hub.py +++ b/python/paddle/hub.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .hapi.hub import help # noqa: F401 -from .hapi.hub import list # noqa: F401 -from .hapi.hub import load # noqa: F401 +from .hapi.hub import help, list, load -__all__ = ['list', 'help', 'load'] # noqa +__all__ = ['list', 'help', 'load'] diff --git a/python/paddle/linalg.py b/python/paddle/linalg.py index 9239f68a73174..4c2d6c00b9f0d 100644 --- a/python/paddle/linalg.py +++ b/python/paddle/linalg.py @@ -12,34 +12,36 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .tensor import inverse as inv # noqa: F401 -from .tensor.linalg import cholesky # noqa: F401 -from .tensor.linalg import cholesky_solve # noqa: F401 -from .tensor.linalg import cond # noqa: F401 -from .tensor.linalg import corrcoef # noqa: F401 -from .tensor.linalg import cov # noqa: F401 -from .tensor.linalg import det # noqa: F401 -from .tensor.linalg import eig # noqa: F401 -from .tensor.linalg import eigh # noqa: F401 -from .tensor.linalg import eigvals # noqa: F401 -from .tensor.linalg import eigvalsh # noqa: F401 -from .tensor.linalg import lu # noqa: F401 -from .tensor.linalg import lu_unpack # noqa: F401 -from .tensor.linalg import matrix_power # noqa: F401 -from .tensor.linalg import matrix_rank # noqa: F401 -from .tensor.linalg import multi_dot # noqa: F401 -from .tensor.linalg import norm # noqa: F401 -from .tensor.linalg import pca_lowrank # noqa: F401 -from .tensor.linalg import pinv # noqa: F401 -from .tensor.linalg import qr # noqa: F401 -from .tensor.linalg import slogdet # noqa: F401 -from .tensor.linalg import solve # noqa: F401 -from .tensor.linalg import svd # noqa: F401 -from .tensor.linalg import triangular_solve # noqa: F401 -from .tensor.linalg import lstsq +from .tensor import inverse as inv +from .tensor.linalg import ( + cholesky, + cholesky_solve, + cond, + corrcoef, + cov, + det, + eig, + eigh, + eigvals, + eigvalsh, + lstsq, + lu, + lu_unpack, + matrix_power, + matrix_rank, + multi_dot, + norm, + pca_lowrank, + pinv, + qr, + slogdet, + solve, + svd, + triangular_solve, +) __all__ = [ - 'cholesky', # noqa + 'cholesky', 'norm', 'cond', 'cov', From 2968a847099d6992cd83501ebd4f68a51dbde516 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Mon, 9 Oct 2023 10:34:06 +0800 Subject: [PATCH 10/62] del_lstm_unit_op (#57927) --- paddle/fluid/operators/lstm_unit_op.cc | 149 ------------------- paddle/fluid/operators/lstm_unit_op.cu | 193 ------------------------- paddle/fluid/operators/lstm_unit_op.h | 153 -------------------- test/legacy_test/test_lstm_unit_op.py | 54 ------- 4 files changed, 549 deletions(-) delete mode 100644 paddle/fluid/operators/lstm_unit_op.cc delete mode 100644 paddle/fluid/operators/lstm_unit_op.cu delete mode 100644 paddle/fluid/operators/lstm_unit_op.h delete mode 100644 test/legacy_test/test_lstm_unit_op.py diff --git a/paddle/fluid/operators/lstm_unit_op.cc b/paddle/fluid/operators/lstm_unit_op.cc deleted file mode 100644 index 25503ee32e9bf..0000000000000 --- a/paddle/fluid/operators/lstm_unit_op.cc +++ /dev/null @@ -1,149 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/lstm_unit_op.h" - -#include - -namespace paddle { -namespace operators { - -class LstmUnitOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "lstm_unit"); - OP_INOUT_CHECK(ctx->HasInput("C_prev"), "Input", "C_prev", "lstm_unit"); - OP_INOUT_CHECK(ctx->HasOutput("C"), "Output", "C", "lstm_unit"); - OP_INOUT_CHECK(ctx->HasOutput("H"), "Output", "H", "lstm_unit"); - - auto x_dims = ctx->GetInputDim("X"); - auto c_prev_dims = ctx->GetInputDim("C_prev"); - - PADDLE_ENFORCE_EQ( - x_dims.size(), - 2, - platform::errors::InvalidArgument( - "Input(X)'s rank must be 2. Received %d instead.", x_dims.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(x_dims[0], - c_prev_dims[0], - platform::errors::InvalidArgument( - "Batch size of inputs and states must be equal, " - "but received %d (inputs)" - "vs %d (states).", - x_dims[0], - c_prev_dims[0])); - PADDLE_ENFORCE_EQ(x_dims[1], - c_prev_dims[1] * 4, - platform::errors::InvalidArgument( - "Dimension of FC should equal to prev state * 4, " - "but received %d (dimension of FC)" - "vs %d (prev state * 4).", - x_dims[1], - c_prev_dims[1] * 4)); - } - - int b_size = static_cast(c_prev_dims[0]); // batch size - int s_dim = static_cast(c_prev_dims[1]); // state dim - ctx->SetOutputDim("C", {b_size, s_dim}); - ctx->SetOutputDim("H", {b_size, s_dim}); - } -}; - -class LstmUnitOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "Lstm unit only applies non-linear activations, please make sure" - "that linear tranformation has already been applied to `X`. 
" - "Linear tranformation can be applied by adding a `fc` layer"); - AddInput( - "C_prev", - "The cell state tensor of last time-step in the Lstm Unit operator."); - AddOutput("C", "The cell tensor of Lstm Unit operator."); - AddOutput("H", "The hidden state tensor of Lstm Unit operator."); - AddAttr("forget_bias", - "(float, default 0.0) " - "The forget bias of Lstm Unit.") - .SetDefault(0.0); - AddComment(R"DOC( -Lstm Unit Operator - -Equation: - -$$ -i, f, o, j = split(X) \\ -C = C_{prev} * sigm(f + forget\_bias) + sigm(i) * tanh(j) \\ -H = C * sigm(o) -$$ - -)DOC"); - } -}; - -class LstmUnitGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("C")), - "Input", - framework::GradVarName("C"), - "lstm_unit"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("H")), - "Input", - framework::GradVarName("H"), - "lstm_unit"); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - ctx->SetOutputDim(framework::GradVarName("C_prev"), - ctx->GetInputDim("C_prev")); - } -}; - -template -class LstmUnitGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("lstm_unit_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("C_prev", this->Input("C_prev")); - op->SetInput("C", this->Output("C")); - op->SetInput(framework::GradVarName("H"), this->OutputGrad("H")); - op->SetInput(framework::GradVarName("C"), this->OutputGrad("C")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetOutput(framework::GradVarName("C_prev"), this->InputGrad("C_prev")); - op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(lstm_unit, - ops::LstmUnitOp, - ops::LstmUnitOpMaker, - ops::LstmUnitGradOpMaker, - ops::LstmUnitGradOpMaker); -REGISTER_OPERATOR(lstm_unit_grad, ops::LstmUnitGradOp); - -PD_REGISTER_STRUCT_KERNEL( - lstm_unit, CPU, ALL_LAYOUT, ops::LstmUnitKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL( - lstm_unit_grad, CPU, ALL_LAYOUT, ops::LstmUnitGradKernel, float, double) {} diff --git a/paddle/fluid/operators/lstm_unit_op.cu b/paddle/fluid/operators/lstm_unit_op.cu deleted file mode 100644 index b1c9d035a8cb5..0000000000000 --- a/paddle/fluid/operators/lstm_unit_op.cu +++ /dev/null @@ -1,193 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -/* Acknowledgement: the following code is strongly inspired by -https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op_gpu.cu -*/ - -#include "paddle/fluid/operators/lstm_unit_op.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/cross_entropy_op.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { - -template -__device__ Dtype cuda_sigmoid(const Dtype x) { - return Dtype(1) / (Dtype(1) + exp(-x)); -} - -template -__device__ Dtype cuda_tanh(const Dtype x) { - return Dtype(1 - exp(-2. * x)) / (Dtype(1) + exp(-2. * x)); -} - -template -__global__ void LSTMUnitKernel(const int nthreads, - const int dim, - const T* C_prev, - const T* X, - T* C, - T* H, - const T forget_bias) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int n = index / dim; - const int d = index % dim; - - const T* X_offset = X + 4 * dim * n; - const T i = cuda_sigmoid(X_offset[d]); - const T f = cuda_sigmoid(X_offset[1 * dim + d] + forget_bias); - const T o = cuda_sigmoid(X_offset[2 * dim + d]); - const T g = cuda_tanh(X_offset[3 * dim + d]); - const T c_prev = C_prev[index]; - const T c = f * c_prev + i * g; - C[index] = c; - const T tanh_c = cuda_tanh(c); - H[index] = o * tanh_c; - } -} - -template -__global__ void LSTMUnitGradientKernel(const int nthreads, - const int dim, - const T* C_prev, - const T* X, - const T* C, - const T* C_diff, - const T* H_diff, - T* C_prev_diff, - T* X_diff, - const T forget_bias) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int n = index / dim; - const int d = index % dim; - const T* X_offset = X + 4 * dim * n; - T* c_prev_diff = C_prev_diff + index; - T* X_diff_offset = X_diff + 4 * dim * n; - T* i_diff = X_diff_offset + d; - T* f_diff = X_diff_offset + 1 * dim + d; - T* o_diff = X_diff_offset + 2 * dim + d; - T* g_diff = X_diff_offset + 3 * dim + d; - - const T i = cuda_sigmoid(X_offset[d]); - const T f = cuda_sigmoid(X_offset[1 * dim + d] + forget_bias); - const T o = cuda_sigmoid(X_offset[2 * dim + d]); - const T g = cuda_tanh(X_offset[3 * dim + d]); - const T c_prev = C_prev[index]; - const T c = C[index]; - const T tanh_c = cuda_tanh(c); - const T c_term_diff = - C_diff[index] + H_diff[index] * o * (1 - tanh_c * tanh_c); - *c_prev_diff = c_term_diff * f; - *i_diff = c_term_diff * g * i * (1 - i); - *f_diff = c_term_diff * c_prev * f * (1 - f); - *o_diff = H_diff[index] * tanh_c * o * (1 - o); - *g_diff = c_term_diff * i * (1 - g * g); - } -} - -template -class LstmUnitOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), - true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - - auto* x_tensor = ctx.Input("X"); - auto* c_prev_tensor = ctx.Input("C_prev"); - auto* c_tensor = ctx.Output("C"); - auto* h_tensor = ctx.Output("H"); - - auto forget_bias = static_cast(ctx.Attr("forget_bias")); - - int b_size = c_tensor->dims()[0]; - int D = c_tensor->dims()[1]; - - const T* X = x_tensor->data(); - const T* C_prev = c_prev_tensor->data(); - - T* C = c_tensor->mutable_data(ctx.GetPlace()); - T* H = h_tensor->mutable_data(ctx.GetPlace()); - - int block = 512; - int n = b_size * D; - int grid = (n + block - 1) / block; - - LSTMUnitKernel<<>>(n, D, C_prev, X, C, H, forget_bias); - } -}; - -template -class LstmUnitGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { 
- PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), - true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - - auto x_tensor = ctx.Input("X"); - auto c_prev_tensor = ctx.Input("C_prev"); - auto c_tensor = ctx.Input("C"); - auto h_tensor = ctx.Input("H"); - - auto hdiff_tensor = - ctx.Input(framework::GradVarName("H")); - auto cdiff_tensor = - ctx.Input(framework::GradVarName("C")); - - auto xdiff_tensor = - ctx.Output(framework::GradVarName("X")); - auto c_prev_diff_tensor = - ctx.Output(framework::GradVarName("C_prev")); - - auto* X = x_tensor->data(); - auto* C_prev = c_prev_tensor->data(); - auto* C = c_tensor->data(); - - auto* H_diff = hdiff_tensor->data(); - auto* C_diff = cdiff_tensor->data(); - - auto* C_prev_diff = c_prev_diff_tensor->mutable_data(ctx.GetPlace()); - auto* X_diff = xdiff_tensor->mutable_data(ctx.GetPlace()); - - int N = c_tensor->dims()[0]; - int D = c_tensor->dims()[1]; - - auto forget_bias = static_cast(ctx.Attr("forget_bias")); - - int block = 512; - int n = N * D; - int grid = (n + block - 1) / block; - - LSTMUnitGradientKernel<<>>( - n, D, C_prev, X, C, C_diff, H_diff, C_prev_diff, X_diff, forget_bias); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL( - lstm_unit, GPU, ALL_LAYOUT, ops::LstmUnitOpCUDAKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL(lstm_unit_grad, - GPU, - ALL_LAYOUT, - ops::LstmUnitGradOpCUDAKernel, - float, - double) {} diff --git a/paddle/fluid/operators/lstm_unit_op.h b/paddle/fluid/operators/lstm_unit_op.h deleted file mode 100644 index 0621741b885fb..0000000000000 --- a/paddle/fluid/operators/lstm_unit_op.h +++ /dev/null @@ -1,153 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/* Acknowledgement: the following code is strongly inspired by -https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op.h -*/ - -#pragma once -#include "glog/logging.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -inline T sigmoid(T x) { - return 1. / (1. + exp(-x)); -} - -template -inline T tanh(T x) { - return 2. * sigmoid(2. 
* x) - 1.; -} - -template -class LstmUnitKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), - true, - paddle::platform::errors::PreconditionNotMet("It must use CPUPlace.")); - - auto* x_tensor = ctx.Input("X"); - auto* c_prev_tensor = ctx.Input("C_prev"); - auto* c_tensor = ctx.Output("C"); - auto* h_tensor = ctx.Output("H"); - - auto forget_bias = static_cast(ctx.Attr("forget_bias")); - - int b_size = c_tensor->dims()[0]; - int D = c_tensor->dims()[1]; - - T* C = c_tensor->mutable_data(ctx.GetPlace()); - T* H = h_tensor->mutable_data(ctx.GetPlace()); - - const T* X = x_tensor->data(); - const T* C_prev = c_prev_tensor->data(); - - for (int n = 0; n < b_size; ++n) { - for (int d = 0; d < D; ++d) { - const T i = sigmoid(X[d]); - const T f = sigmoid(X[1 * D + d] + forget_bias); - const T o = sigmoid(X[2 * D + d]); - const T g = tanh(X[3 * D + d]); - const T c_prev = C_prev[d]; - const T c = f * c_prev + i * g; - C[d] = c; - const T tanh_c = tanh(c); - H[d] = o * tanh_c; - } - C_prev += D; - X += 4 * D; - C += D; - H += D; - } - } -}; - -template -class LstmUnitGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), - true, - paddle::platform::errors::PreconditionNotMet("It must use CPUPlace.")); - - auto x_tensor = ctx.Input("X"); - auto c_prev_tensor = ctx.Input("C_prev"); - auto c_tensor = ctx.Input("C"); - - auto hdiff_tensor = - ctx.Input(framework::GradVarName("H")); - auto cdiff_tensor = - ctx.Input(framework::GradVarName("C")); - - auto xdiff_tensor = - ctx.Output(framework::GradVarName("X")); - auto c_prev_diff_tensor = - ctx.Output(framework::GradVarName("C_prev")); - - auto* X = x_tensor->data(); - auto* C_prev = c_prev_tensor->data(); - auto* C = c_tensor->data(); - - auto* H_diff = hdiff_tensor->data(); - auto* C_diff = cdiff_tensor->data(); - - auto* C_prev_diff = c_prev_diff_tensor->mutable_data(ctx.GetPlace()); - auto* X_diff = xdiff_tensor->mutable_data(ctx.GetPlace()); - - int N = c_tensor->dims()[0]; - int D = c_tensor->dims()[1]; - - auto forget_bias = static_cast(ctx.Attr("forget_bias")); - - for (int n = 0; n < N; ++n) { - for (int d = 0; d < D; ++d) { - T* c_prev_diff = C_prev_diff + d; - T* i_diff = X_diff + d; - T* f_diff = X_diff + 1 * D + d; - T* o_diff = X_diff + 2 * D + d; - T* g_diff = X_diff + 3 * D + d; - - const T i = sigmoid(X[d]); - const T f = sigmoid(X[1 * D + d] + forget_bias); - const T o = sigmoid(X[2 * D + d]); - const T g = tanh(X[3 * D + d]); - const T c_prev = C_prev[d]; - const T c = C[d]; - const T tanh_c = tanh(c); - const T c_term_diff = C_diff[d] + H_diff[d] * o * (1 - tanh_c * tanh_c); - *c_prev_diff = c_term_diff * f; - *i_diff = c_term_diff * g * i * (1 - i); - *f_diff = c_term_diff * c_prev * f * (1 - f); - *o_diff = H_diff[d] * tanh_c * o * (1 - o); - *g_diff = c_term_diff * i * (1 - g * g); - } - C_prev += D; - X += 4 * D; - C += D; - C_diff += D; - H_diff += D; - X_diff += 4 * D; - C_prev_diff += D; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/test/legacy_test/test_lstm_unit_op.py b/test/legacy_test/test_lstm_unit_op.py deleted file mode 100644 index 8a1b2fc238b22..0000000000000 --- a/test/legacy_test/test_lstm_unit_op.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest - - -def sigmoid_np(x): - return 1.0 / (1.0 + np.exp(-x)) - - -def tanh_np(x): - return 2 * sigmoid_np(2.0 * x) - 1.0 - - -class LstmUnitTest(OpTest): - def setUp(self): - self.op_type = "lstm_unit" - x_np = np.random.normal(size=(15, 160)).astype("float64") - c_np = np.random.normal(size=(15, 40)).astype("float64") - i_np, f_np, o_np, j_np = np.split(x_np, 4, axis=1) - forget_bias_np = 0.0 - self.attrs = {'forget_bias': 0.0} - - new_c = c_np * sigmoid_np(f_np + forget_bias_np) + sigmoid_np( - i_np - ) * tanh_np(j_np) - new_h = tanh_np(new_c) * sigmoid_np(o_np) - - self.inputs = {'X': x_np, 'C_prev': c_np} - self.outputs = {'C': new_c, 'H': new_h} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X', 'C_prev'], ['C', 'H']) - - -if __name__ == "__main__": - unittest.main() From 01f8a7aeac8bffd17e5f3d356be17b33eb488733 Mon Sep 17 00:00:00 2001 From: WangZhen <23097963+0x45f@users.noreply.github.com> Date: Mon, 9 Oct 2023 10:34:57 +0800 Subject: [PATCH 11/62] [PIR]Migrate flatten into pir (#57762) --- python/paddle/tensor/manipulation.py | 4 ++-- .../test_flatten_contiguous_range_op.py | 17 +++++++++++++---- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index e6f1484db154c..15d9eb5300a5a 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -1549,7 +1549,7 @@ def flatten(x, start_axis=0, stop_axis=-1, name=None): img[0, 0, 0, 0] = -1 print(out[0, 0, 0]) # [-1] """ - if not (isinstance(x, Variable)): + if not (isinstance(x, (Variable, paddle.pir.OpResult))): raise ValueError("The input x should be a Tensor") x_dim = len(x.shape) @@ -1586,7 +1586,7 @@ def flatten(x, start_axis=0, stop_axis=-1, name=None): if start_axis > stop_axis: raise ValueError("The stop_axis should be larger than stat_axis") - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.flatten(x, start_axis, stop_axis) else: check_variable_and_dtype( diff --git a/test/legacy_test/test_flatten_contiguous_range_op.py b/test/legacy_test/test_flatten_contiguous_range_op.py index d497da1cd2758..71e39e92b8c5a 100644 --- a/test/legacy_test/test_flatten_contiguous_range_op.py +++ b/test/legacy_test/test_flatten_contiguous_range_op.py @@ -46,18 +46,27 @@ def if_enable_cinn(self): def test_check_output(self): if str(self.dtype) in {"float16", "uint16"}: self.check_output_with_place( - core.CUDAPlace(0), no_check_set=["XShape"], check_prim=True + core.CUDAPlace(0), + no_check_set=["XShape"], + check_prim=True, + check_new_ir=True, ) else: - self.check_output(no_check_set=["XShape"], check_prim=True) + self.check_output( + no_check_set=["XShape"], check_prim=True, check_new_ir=True + ) def test_check_grad(self): if str(self.dtype) in {"float16", "uint16"}: self.check_grad_with_place( - core.CUDAPlace(0), ["X"], "Out", 
check_prim=True + core.CUDAPlace(0), + ["X"], + "Out", + check_prim=True, + check_new_ir=True, ) else: - self.check_grad(["X"], "Out", check_prim=True) + self.check_grad(["X"], "Out", check_prim=True, check_new_ir=True) def init_test_case(self): self.in_shape = (3, 2, 5, 4) From bf591c1e00c9df66d481d82a450d07d875ce5dc0 Mon Sep 17 00:00:00 2001 From: WangZhen <23097963+0x45f@users.noreply.github.com> Date: Mon, 9 Oct 2023 10:44:14 +0800 Subject: [PATCH 12/62] [PIR]Migrate sqrt into pir (#57759) --- python/paddle/tensor/ops.py | 2 +- test/legacy_test/test_activation_op.py | 52 ++++++++++++++++++-------- 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/python/paddle/tensor/ops.py b/python/paddle/tensor/ops.py index 9fbc9d16baa66..b8185c6c2bca0 100644 --- a/python/paddle/tensor/ops.py +++ b/python/paddle/tensor/ops.py @@ -1057,7 +1057,7 @@ def sqrt(x, name=None): Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, [0.31622776, 0.44721359, 0.54772258, 0.63245553]) """ - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.sqrt(x) else: check_variable_and_dtype( diff --git a/test/legacy_test/test_activation_op.py b/test/legacy_test/test_activation_op.py index 96af19e4b77d7..5689b0cb970fc 100644 --- a/test/legacy_test/test_activation_op.py +++ b/test/legacy_test/test_activation_op.py @@ -1461,10 +1461,10 @@ def if_enable_cinn(self): def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out', check_prim=True) + self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=True) class TestSqrtPrimFp32(TestActivation): @@ -1486,10 +1486,10 @@ def setUp(self): def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out', check_prim=True) + self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=True) def init_dtype(self): self.dtype = np.float32 @@ -1537,11 +1537,13 @@ def if_enable_cinn(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place) + self.check_output_with_place(place, check_new_ir=True) def test_check_grad(self): place = core.CUDAPlace(0) - self.check_grad_with_place(place, ['X'], 'Out', check_prim=True) + self.check_grad_with_place( + place, ['X'], 'Out', check_prim=True, check_new_ir=True + ) class TestSqrtComp(TestActivation, TestParameter): @@ -1568,10 +1570,14 @@ def if_enable_cinn(self): def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out', check_dygraph=True, check_prim=True) + self.check_grad( + ['X'], 'Out', check_dygraph=True, check_prim=True, check_new_ir=True + ) def test_check_output(self): - self.check_output(check_dygraph=True, check_prim=True) + self.check_output( + check_dygraph=True, check_prim=True, check_new_ir=True + ) class TestSqrtCompFp32(TestActivation): @@ -1596,10 +1602,14 @@ def if_enable_cinn(self): def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out', check_dygraph=True, check_prim=True) + self.check_grad( + ['X'], 'Out', check_dygraph=True, check_prim=True, check_new_ir=True + ) def test_check_output(self): - self.check_output(check_dygraph=True, check_prim=True) + self.check_output( + check_dygraph=True, check_prim=True, check_new_ir=True + ) def init_dtype(self): self.dtype = np.float32 @@ -4486,6 +4496,7 @@ def create_test_act_fp16_class( 
     check_prim=False,
     check_prim_pir=False,
     enable_cinn=False,
+    check_new_ir=False,
     grad_atol=1e-2,
     **kwargs
 ):
@@ -4514,6 +4525,7 @@ def test_check_output(self):
             check_dygraph=check_dygraph,
             check_prim=check_prim,
             check_prim_pir=check_prim_pir,
+            check_new_ir=check_new_ir,
         )

     def test_check_grad(self):
@@ -4528,6 +4540,7 @@ def test_check_grad(self):
             check_prim=check_prim,
             check_prim_pir=check_prim_pir,
             max_relative_error=grad_atol,
+            check_new_ir=check_new_ir,
         )

     cls_name = "{}_{}".format(parent.__name__, "FP16OP")
@@ -4547,8 +4560,12 @@ def test_check_grad(self):
 create_test_act_fp16_class(TestTanhshrink)
 create_test_act_fp16_class(TestHardShrink)
 create_test_act_fp16_class(TestSoftshrink)
-create_test_act_fp16_class(TestSqrt, check_prim=True, enable_cinn=True)
-create_test_act_fp16_class(TestSqrtComp, check_prim=True, enable_cinn=True)
+create_test_act_fp16_class(
+    TestSqrt, check_prim=True, enable_cinn=True, check_new_ir=True
+)
+create_test_act_fp16_class(
+    TestSqrtComp, check_prim=True, enable_cinn=True, check_new_ir=True
+)
 create_test_act_fp16_class(TestAbs, check_prim=True, enable_cinn=True)
 create_test_act_fp16_class(TestCeil, grad_check=False)
 create_test_act_fp16_class(
@@ -4629,6 +4646,7 @@ def create_test_act_bf16_class(
     check_dygraph=True,
     check_prim=False,
     enable_cinn=False,
+    check_new_ir=False,
     grad_atol=1e-2,
     **kwargs
 ):
@@ -4657,7 +4675,10 @@ def convert_input_output(self):
         def test_check_output(self):
             place = core.CUDAPlace(0)
             self.check_output_with_place(
-                place, atol=atol, check_prim=check_prim
+                place,
+                atol=atol,
+                check_prim=check_prim,
+                check_new_ir=check_new_ir,
             )

         def test_check_grad(self):
@@ -4669,6 +4690,7 @@ def test_check_grad(self):
                 'Out',
                 max_relative_error=grad_atol,
                 check_prim=check_prim,
+                check_new_ir=check_new_ir,
             )

     cls_name = "{}_{}".format(parent.__name__, "BF16OP")
@@ -4686,8 +4708,8 @@ def test_check_grad(self):
 create_test_act_bf16_class(TestTanhshrink)
 create_test_act_bf16_class(TestHardShrink)
 create_test_act_bf16_class(TestSoftshrink)
-create_test_act_bf16_class(TestSqrt, check_prim=True)
-create_test_act_bf16_class(TestSqrtComp, check_prim=True)
+create_test_act_bf16_class(TestSqrt, check_prim=True, check_new_ir=True)
+create_test_act_bf16_class(TestSqrtComp, check_prim=True, check_new_ir=True)
 create_test_act_bf16_class(TestAbs, check_prim=True)
 create_test_act_bf16_class(TestCeil, grad_check=False)
 create_test_act_bf16_class(TestFloor, grad_check=False, check_prim=True)

From b23a0f4c5988c60604e27e341c3a8ecb04c64707 Mon Sep 17 00:00:00 2001
From: Bo Zhang <105368690+zhangbopd@users.noreply.github.com>
Date: Mon, 9 Oct 2023 10:48:06 +0800
Subject: [PATCH 13/62] [PIR] Add op_trait and type_util (#57580)

* op_trait and type_util
* add unit test
* add expect_throw for ci coverage
* fix for win ci
---
 paddle/pir/core/op_trait.cc        | 196 +++++++++++++++
 paddle/pir/core/op_trait.h         | 121 +++++++++
 paddle/pir/core/type_util.cc       | 129 ++++++++++
 paddle/pir/core/type_util.h        |  65 +++++
 test/cpp/pir/core/CMakeLists.txt   |  11 +-
 test/cpp/pir/core/ir_op_test.cc    | 387 +++++++++++++++++++++++++++++
 test/cpp/pir/core/type_test.cc     |  31 +++
 test/cpp/pir/tools/test_dialect.cc |  19 +-
 test/cpp/pir/tools/test_op.cc      | 119 ++++++++-
 test/cpp/pir/tools/test_op.h       | 262 ++++++++++++++++++-
 10 files changed, 1328 insertions(+), 12 deletions(-)
 create mode 100644 paddle/pir/core/op_trait.cc
 create mode 100644 paddle/pir/core/op_trait.h
 create mode 100644 paddle/pir/core/type_util.cc
 create mode 100644 paddle/pir/core/type_util.h
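The traits this patch introduces are opted into by listing them as extra
template arguments of pir::Op; each trait's static Verify(Operation *) then
runs when an operation is built, and a violation raises
pir::IrNotMetException through IR_ENFORCE, which is what the EXPECT_THROW
tests below exercise. A minimal sketch, assuming a hypothetical "test.my_add"
op whose name, Build signature, and output type are illustrative only and not
part of this patch:

// Hypothetical example, not part of this patch: an op opting into two of
// the new traits. The trait list is appended after the concrete op class
// in pir::Op.
#include "paddle/pir/core/op_base.h"
#include "paddle/pir/core/op_trait.h"

class MyAddOp : public pir::Op<MyAddOp,
                               pir::SameOperandsAndResultShapeTrait,
                               pir::SameOperandsAndResultElementTypeTrait> {
 public:
  using Op::Op;
  static const char *name() { return "test.my_add"; }
  static constexpr uint32_t attributes_num = 0;
  static constexpr const char **attributes_name = nullptr;
  static void Build(pir::Builder &builder,             // NOLINT
                    pir::OperationArgument &argument,  // NOLINT
                    pir::Value lhs,
                    pir::Value rhs,
                    pir::Type out_type) {
    // Two operands and one result; both traits verify against these when
    // builder.Build<MyAddOp>(...) creates the operation.
    argument.AddInput(lhs);
    argument.AddInput(rhs);
    argument.output_types.push_back(out_type);
  }
  void Verify() {}  // op-specific checks; trait checks run automatically
};

Such an op would still have to be registered with a dialect (compare the
test_dialect.cc changes below) before builder.Build can create it.

diff --git a/paddle/pir/core/op_trait.cc 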
b/paddle/pir/core/op_trait.cc new file mode 100644 index 0000000000000..ccea4e3f06d9b --- /dev/null +++ b/paddle/pir/core/op_trait.cc @@ -0,0 +1,196 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pir/core/op_trait.h" +#include "paddle/pir/core/enforce.h" +#include "paddle/pir/core/type_util.h" + +namespace pir::detail { + +void VerifySameOperandsShapeTrait(Operation *op) { + VLOG(4) << "Verify SameOperandsShapeTrait for : " << op->name(); + + IR_ENFORCE(op->num_operands() > 0, + "Op %s with SameOperandsShapeTrait requires at least 1 operands, " + "but got %u operands.", + op->name(), + op->num_operands()); + + std::vector operands = op->operands(); + std::vector types; + std::for_each(operands.begin(), operands.end(), [&types](pir::OpOperand op) { + types.push_back(op.type()); + }); + + IR_ENFORCE(VerifyCompatibleShapes(types), + "Op %s with SameOperandsShapeTrait requires the same shape for " + "all operands.", + op->name()); +} + +void VerifySameOperandsAndResultShapeTrait(Operation *op) { + VLOG(4) << "Verify SameOperandsAndResultShapeTrait for : " << op->name(); + + IR_ENFORCE(op->num_operands() > 0, + "Op %s with SameOperandsAndResultShapeTrait requires at least 1 " + "operands, but got %u operands.", + op->name(), + op->num_operands()); + + IR_ENFORCE(op->num_results() > 0, + "Op %s with SameOperandsAndResultShapeTrait requires at least 1 " + "results, but got %u results.", + op->name(), + op->num_results()); + + std::vector operands = op->operands(); + std::vector results = op->results(); + + std::vector types; + + std::for_each(operands.begin(), operands.end(), [&types](pir::OpOperand op) { + types.push_back(op.type()); + }); + + std::for_each(results.begin(), results.end(), [&types](pir::OpResult op) { + types.push_back(op.type()); + }); + + IR_ENFORCE(VerifyCompatibleShapes(types), + "Op %s with SameOperandsAndResultShapeTrait requires compatible " + "shapes for operands and results.", + op->name()); +} + +void VerifySameOperandsElementTypeTrait(Operation *op) { + VLOG(4) << "Verify SameOperandsElementTypeTrait for : " << op->name(); + + IR_ENFORCE(op->num_operands() > 0, + "Op %s with SameOperandsElementTypeTrait requires at least 1 " + "operands, but got %u operands.", + op->name(), + op->num_operands()); + + auto elementType = GetElementTypeOrSelf(op->result(0).type()); + for (auto operand : op->operands()) { + IR_ENFORCE(GetElementTypeOrSelf(operand.type()) == elementType, + "Op %s with SameOperandsElementTypeTrait requires the same " + "element type for all operands.", + op->name()); + } +} + +void VerifySameOperandsAndResultElementTypeTrait(Operation *op) { + VLOG(4) << "Verify SameOperandsAndResultElementTypeTrait for : " + << op->name(); + + IR_ENFORCE(op->num_operands() > 0, + "Op %s with SameOperandsAndResultElementTypeTrait requires at " + "least 1 operands, but got %u operands.", + op->name(), + op->num_operands()); + + IR_ENFORCE(op->num_results() > 0, + "Op %s with 
SameOperandsAndResultElementTypeTrait requires at " + "least 1 results, but got %u results.", + op->name(), + op->num_results()); + + auto elementType = GetElementTypeOrSelf(op->result(0).type()); + + // Verify result element type matches first result's element type. + for (auto result : op->results()) { + IR_ENFORCE(GetElementTypeOrSelf(result.type()) == elementType, + "Op %s with SameOperandsAndResultElementTypeTrait requires the " + "same element type for all operands and results.", + op->name()); + } + + // Verify operand's element type matches first result's element type. + for (auto operand : op->operands()) { + IR_ENFORCE(GetElementTypeOrSelf(operand.type()) == elementType, + "Op %s with SameOperandsAndResultElementTypeTrait requires the " + "same element type for all operands and results.", + op->name()); + } +} + +void VerifySameOperandsAndResultTypeTrait(Operation *op) { + VLOG(4) << "Verify SameOperandsAndResultTypeTrait for : " << op->name(); + + IR_ENFORCE(op->num_operands() > 0, + "Op %s with SameOperandsAndResultTypeTrait requires at least 1 " + "operands, but got %u operands.", + op->name(), + op->num_operands()); + + IR_ENFORCE(op->num_results() > 0, + "Op %s with SameOperandsAndResultTypeTrait requires at least 1 " + "results, but got %u results.", + op->name(), + op->num_results()); + + auto type = op->result(0).type(); + auto elementType = GetElementTypeOrSelf(type); + + for (auto result : op->results()) { + IR_ENFORCE(GetElementTypeOrSelf(result.type()) == elementType, + "Op %s with SameOperandsAndResultTypeTrait requires the same " + "type for all operands and results.", + op->name()); + + IR_ENFORCE(VerifyCompatibleShape(result.type(), type), + "Op %s with SameOperandsAndResultTypeTrait requires the same " + "type for all operands and results.", + op->name()); + } + + for (auto operand : op->operands()) { + IR_ENFORCE(GetElementTypeOrSelf(operand.type()) == elementType, + "Op %s with SameOperandsAndResultTypeTrait requires the same " + "type for all operands and results.", + op->name()); + + IR_ENFORCE(VerifyCompatibleShape(operand.type(), type), + "Op %s with SameOperandsAndResultTypeTrait requires the same " + "type for all operands and results.", + op->name()); + } +} + +void VerifySameTypeOperandsTrait(Operation *op) { + VLOG(4) << "Verify SameTypeOperandsTrait for : " << op->name(); + + // For zero or only one operand. + unsigned operand_nums = op->num_operands(); + if (operand_nums < 2) return; + + auto type = op->operand(0).type(); + + for (auto operand : op->operands()) { + IR_ENFORCE(operand.type() == type, + "Op %s with SameTypeOperandsTrait requires all operands to have " + "the same type.", + op->name()); + } +} + +} // namespace pir::detail + +IR_DEFINE_EXPLICIT_TYPE_ID(pir::SameOperandsShapeTrait) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::SameOperandsAndResultShapeTrait) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::SameOperandsElementTypeTrait) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::SameOperandsAndResultElementTypeTrait) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::SameOperandsAndResultTypeTrait) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::SameTypeOperandsTrait) diff --git a/paddle/pir/core/op_trait.h b/paddle/pir/core/op_trait.h new file mode 100644 index 0000000000000..760799fd16165 --- /dev/null +++ b/paddle/pir/core/op_trait.h @@ -0,0 +1,121 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/pir/core/op_base.h"
+
+namespace pir {
+
+namespace detail {
+void VerifySameOperandsShapeTrait(Operation *op);
+void VerifySameOperandsAndResultShapeTrait(Operation *op);
+void VerifySameOperandsElementTypeTrait(Operation *op);
+void VerifySameOperandsAndResultElementTypeTrait(Operation *op);
+void VerifySameOperandsAndResultTypeTrait(Operation *op);
+void VerifySameTypeOperandsTrait(Operation *op);
+}  // namespace detail
+
+///
+/// \brief Provides verification for ops that are known to have the
+/// same operand shape.
+///
+class SameOperandsShapeTrait
+    : public pir::OpTraitBase<SameOperandsShapeTrait> {
+ public:
+  explicit SameOperandsShapeTrait(pir::Operation *op)
+      : pir::OpTraitBase<SameOperandsShapeTrait>(op) {}
+  static void Verify(Operation *op) {
+    return detail::VerifySameOperandsShapeTrait(op);
+  }
+};
+
+///
+/// \brief Provides verification for ops that are known to have the
+/// same operand and result shape.
+///
+class SameOperandsAndResultShapeTrait
+    : public pir::OpTraitBase<SameOperandsAndResultShapeTrait> {
+ public:
+  explicit SameOperandsAndResultShapeTrait(pir::Operation *op)
+      : pir::OpTraitBase<SameOperandsAndResultShapeTrait>(op) {}
+  static void Verify(Operation *op) {
+    return detail::VerifySameOperandsAndResultShapeTrait(op);
+  }
+};
+
+///
+/// \brief Provides verification for ops that are known to have the
+/// same operand element type (or the type itself if it is scalar).
+///
+class SameOperandsElementTypeTrait
+    : public pir::OpTraitBase<SameOperandsElementTypeTrait> {
+ public:
+  explicit SameOperandsElementTypeTrait(pir::Operation *op)
+      : pir::OpTraitBase<SameOperandsElementTypeTrait>(op) {}
+  static void Verify(Operation *op) {
+    return detail::VerifySameOperandsElementTypeTrait(op);
+  }
+};
+
+///
+/// \brief Provides verification for ops that are known to have the
+/// same operand and result element type (or the type itself if it is scalar).
+///
+class SameOperandsAndResultElementTypeTrait
+    : public pir::OpTraitBase<SameOperandsAndResultElementTypeTrait> {
+ public:
+  explicit SameOperandsAndResultElementTypeTrait(pir::Operation *op)
+      : pir::OpTraitBase<SameOperandsAndResultElementTypeTrait>(op) {}
+  static void Verify(Operation *op) {
+    return detail::VerifySameOperandsAndResultElementTypeTrait(op);
+  }
+};
+
+///
+/// \brief Provides verification for ops that are known to have the
+/// same operand and result type. It subsumes both
+/// SameOperandsAndResultShapeTrait and SameOperandsAndResultElementTypeTrait
+///
+class SameOperandsAndResultTypeTrait
+    : public pir::OpTraitBase<SameOperandsAndResultTypeTrait> {
+ public:
+  explicit SameOperandsAndResultTypeTrait(pir::Operation *op)
+      : pir::OpTraitBase<SameOperandsAndResultTypeTrait>(op) {}
+
+  static void Verify(Operation *op) {
+    return detail::VerifySameOperandsAndResultTypeTrait(op);
+  }
+};
+
+///
+/// \brief Provides verification that all operands of the specified op have the
+/// same type.
+///
+class SameTypeOperandsTrait
+    : public pir::OpTraitBase<SameTypeOperandsTrait> {
+ public:
+  explicit SameTypeOperandsTrait(pir::Operation *op)
+      : pir::OpTraitBase<SameTypeOperandsTrait>(op) {}
+  static void Verify(Operation *op) {
+    return detail::VerifySameTypeOperandsTrait(op);
+  }
+};
+
+} // namespace pir
+
+IR_DECLARE_EXPLICIT_TYPE_ID(pir::SameOperandsShapeTrait)
+IR_DECLARE_EXPLICIT_TYPE_ID(pir::SameOperandsAndResultShapeTrait)
+IR_DECLARE_EXPLICIT_TYPE_ID(pir::SameOperandsElementTypeTrait)
+IR_DECLARE_EXPLICIT_TYPE_ID(pir::SameOperandsAndResultElementTypeTrait)
+IR_DECLARE_EXPLICIT_TYPE_ID(pir::SameOperandsAndResultTypeTrait)
+IR_DECLARE_EXPLICIT_TYPE_ID(pir::SameTypeOperandsTrait)
diff --git a/paddle/pir/core/type_util.cc b/paddle/pir/core/type_util.cc
new file mode 100644
index 0000000000000..0d6d137a897f0
--- /dev/null
+++ b/paddle/pir/core/type_util.cc
@@ -0,0 +1,129 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pir/core/type_util.h"
+#include <algorithm>
+#include <numeric>
+
+namespace pir {
+
+Type GetElementTypeOrSelf(Type type) {
+  if (auto sType = type.dyn_cast<ShapedTypeInterface>())
+    return sType.GetElementType();
+  return type;
+}
+
+bool VerifyCompatibleShape(const phi::DDim &lhs_shape,
+                           const phi::DDim &rhs_shape) {
+  if (lhs_shape.size() != rhs_shape.size()) return false;
+
+  // Dimensions are compared pairwise; a dynamic dimension is compatible
+  // with anything.
+  for (int i = 0; i < lhs_shape.size(); ++i) {
+    int64_t dim1 = lhs_shape[i];
+    int64_t dim2 = rhs_shape[i];
+    if (!ShapedTypeInterface::IsDynamic(dim1) &&
+        !ShapedTypeInterface::IsDynamic(dim2) && dim1 != dim2)
+      return false;
+  }
+  return true;
+}
+
+bool VerifyCompatibleShape(Type lhs_type, Type rhs_type) {
+  auto lhs_shaped_type = lhs_type.dyn_cast<ShapedTypeInterface>();
+  auto rhs_shaped_type = rhs_type.dyn_cast<ShapedTypeInterface>();
+
+  // Either both or neither type should be shaped.
+  if (!lhs_shaped_type) return !rhs_shaped_type;
+  if (!rhs_shaped_type) return false;
+
+  if (!lhs_shaped_type.HasRank() || !rhs_shaped_type.HasRank()) return true;
+
+  return VerifyCompatibleShape(lhs_shaped_type.GetShape(),
+                               rhs_shaped_type.GetShape());
+}
+
+bool VerifyCompatibleDims(const std::vector<int64_t> &dims) {
+  if (dims.empty()) return true;
+  auto static_dim = std::accumulate(
+      dims.begin(), dims.end(), dims.front(), [](auto &fold, auto &dim) {
+        return ShapedTypeInterface::IsDynamic(dim) ? fold : dim;
+      });
+  // Every static dimension must equal the folded static value.
+  return std::all_of(dims.begin(), dims.end(), [&](auto dim) {
+    return ShapedTypeInterface::IsDynamic(dim) || dim == static_dim;
+  });
+}
+
+bool VerifyCompatibleShapes(const std::vector<Type> &lhs_types,
+                            const std::vector<Type> &rhs_types) {
+  if (lhs_types.size() != rhs_types.size()) return false;
+
+  // Types are matched up position by position.
+  for (size_t i = 0; i < lhs_types.size(); ++i) {
+    if (!VerifyCompatibleShape(lhs_types[i], rhs_types[i])) return false;
+  }
+  return true;
+}
+
+bool VerifyCompatibleShapes(const std::vector<Type> &types) {
+  std::vector<ShapedTypeInterface> shaped_type_interfaces;
+
+  std::for_each(
+      types.begin(), types.end(), [&shaped_type_interfaces](Type type) {
+        shaped_type_interfaces.push_back(type.dyn_cast<ShapedTypeInterface>());
+      });
+
+  // Return false if some, but not all are not shaped.
Return early if none + // are shaped also. + if (std::none_of(shaped_type_interfaces.begin(), + shaped_type_interfaces.end(), + [](auto t) { return t; })) + return true; + + if (!std::all_of(shaped_type_interfaces.begin(), + shaped_type_interfaces.end(), + [](auto t) { return t; })) + return false; + + // Remove all unranked shapes + std::vector shapes; + + std::for_each(shaped_type_interfaces.begin(), + shaped_type_interfaces.end(), + [&shapes](ShapedTypeInterface type) { + if (type.HasRank()) + shapes.push_back(type.dyn_cast()); + }); + if (shapes.empty()) return true; + + // All ranks should be equal + int64_t firstRank = shapes.front().GetRank(); + + if (std::any_of(shapes.begin(), shapes.end(), [&](auto shape) { + return firstRank != shape.GetRank(); + })) + return false; + + for (unsigned i = 0; i < firstRank; ++i) { + // For all ranked dimensions + std::vector dims; + std::for_each(shapes.begin(), shapes.end(), [&](ShapedTypeInterface shape) { + dims.push_back(shape.GetDimSize(i)); + }); + + if (!VerifyCompatibleDims(dims)) return false; + } + + return true; +} + +} // namespace pir diff --git a/paddle/pir/core/type_util.h b/paddle/pir/core/type_util.h new file mode 100644 index 0000000000000..5704ba2abea78 --- /dev/null +++ b/paddle/pir/core/type_util.h @@ -0,0 +1,65 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +/// +/// \brief Utility Functions +/// + +#include "paddle/pir/core/builtin_type.h" +#include "paddle/pir/core/builtin_type_interfaces.h" + +namespace pir { +/// +/// \brief Return the element type or return the type itself. +/// +Type GetElementTypeOrSelf(Type type); + +/// +/// \brief Returns true if the given two shapes are compatible. That is, they +/// have the same size and each pair of the elements are equal or one of them is +/// dynamic. +/// +bool VerifyCompatibleShape(const phi::DDim& lhs_shape, + const phi::DDim& rhs_shape); + +/// +/// \brief Returns true if the given two types have compatible shape. That +/// is, they are both scalars (not shaped), or they are both shaped types and at +/// least one is unranked or they have compatible dimensions. Dimensions are +/// compatible if at least one is dynamic or both are equal. The element type +/// does not matter. +/// +bool VerifyCompatibleShape(Type lhs_type, Type rhs_type); + +/// +/// \brief Dimensions are compatible if all non-dynamic dims are equal. +/// +bool VerifyCompatibleDims(const std::vector& dims); + +/// +/// \brief Returns true if the given two arrays have the same number of elements +/// and each pair wise entries have compatible shape. +/// +bool VerifyCompatibleShapes(const std::vector& lhs_types, + const std::vector& rhs_types); + +/// +/// \brief Returns true if all given types have compatible shapes. That is, +/// they are all scalars (not shaped), or they are all shaped types and any +/// ranked shapes have compatible dimensions. Dimensions are compatible if all +/// non-dynamic dims are equal. 
The element type does not matter. +/// +bool VerifyCompatibleShapes(const std::vector& types); +} // namespace pir diff --git a/test/cpp/pir/core/CMakeLists.txt b/test/cpp/pir/core/CMakeLists.txt index 0d65bc5b454c3..0f0ec568bb50a 100644 --- a/test/cpp/pir/core/CMakeLists.txt +++ b/test/cpp/pir/core/CMakeLists.txt @@ -8,14 +8,15 @@ cc_test_old( pd_op_dialect) cc_test_old(ir_attribute_test SRCS ir_attribute_test.cc DEPS pir gtest) cc_test_old(ir_value_test SRCS ir_value_test.cc DEPS pir gtest) -cc_test_old( +paddle_test( ir_op_test SRCS ir_op_test.cc DEPS pir gtest - test_dialect) + test_dialect + pd_op_dialect) cc_test_old(ir_region_test SRCS ir_region_test.cc DEPS pir gtest) cc_test_old(ir_builder_test SRCS ir_builder_test.cc DEPS pir gtest) cc_test_old( @@ -139,3 +140,9 @@ cc_test_old( test_dialect gtest pir) + +if(WITH_ONNXRUNTIME AND WIN32) + # Copy onnxruntime for some c++ test in Windows, since the test will + # be build only in CI, so suppose the generator in Windows is Ninja. + copy_onnx(ir_op_test) +endif() diff --git a/test/cpp/pir/core/ir_op_test.cc b/test/cpp/pir/core/ir_op_test.cc index c512ea753e3c0..596519ba57d4c 100644 --- a/test/cpp/pir/core/ir_op_test.cc +++ b/test/cpp/pir/core/ir_op_test.cc @@ -15,6 +15,8 @@ #include #include +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/phi/core/tensor_meta.h" #include "paddle/pir/core/block.h" #include "paddle/pir/core/builder.h" #include "paddle/pir/core/builtin_attribute.h" @@ -43,6 +45,27 @@ pir::AttributeMap CreateAttributeMap( return attr_map; } +pir::Operation *CreateDenseTensorOp( + pir::IrContext *ctx, + const phi::DDim &dims, + const std::vector &attribute_names, + const std::vector &attributes, + const pir::Type &dtype = + pir::Float32Type::get(pir::IrContext::Instance())) { + std::vector op_inputs = {}; + phi::DataLayout data_layout = phi::DataLayout::NCHW; + phi::LoD lod = {{0, 1, 2}}; + size_t offset = 0; + std::vector op_output_types = { + pir::DenseTensorType::get(ctx, dtype, dims, data_layout, lod, offset)}; + pir::Operation *op = + pir::Operation::Create(op_inputs, + CreateAttributeMap(attribute_names, attributes), + op_output_types, + pir::OpInfo()); + return op; +} + TEST(op_test, region_test) { // (1) Register Dialect, Operation1, Operation2 into IrContext. 
pir::IrContext *ctx = pir::IrContext::Instance(); @@ -126,3 +149,367 @@ TEST(op_test, trait_and_interface) { pir::OperationArgument argument(&ctx, "test.region"); EXPECT_THROW(builder.Build(std::move(argument)), pir::IrNotMetException); } + +TEST(op_test, op_traits_test) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + ctx->GetOrRegisterDialect(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + pir::Type dtype = pir::Float32Type::get(ctx); + phi::DDim dims = {2, 2}; + phi::DataLayout data_layout = phi::DataLayout::NCHW; + phi::LoD lod = {{0, 1, 2}}; + size_t offset = 0; + + pir::DenseTensorType dense_tensor_dtype = + pir::DenseTensorType::get(ctx, dtype, dims, data_layout, lod, offset); + + pir::Operation *op1 = + CreateDenseTensorOp(ctx, dims, {"op1_temp"}, {"op1_attr"}, dtype); + pir::Operation *op2 = + CreateDenseTensorOp(ctx, dims, {"op2_temp"}, {"op2_attr"}, dtype); + + auto op3 = builder.Build( + op1->result(0), op2->result(0), dense_tensor_dtype); + + EXPECT_EQ(op3->HasTrait(), true); + EXPECT_EQ(op3->HasTrait(), true); + EXPECT_EQ(op3->HasTrait(), true); + EXPECT_EQ(op3->HasTrait(), true); + EXPECT_EQ(op3->HasTrait(), true); + EXPECT_EQ(op3->HasTrait(), true); +} + +TEST(op_test, same_operands_shape_trait_test1) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + ctx->GetOrRegisterDialect(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + EXPECT_THROW(builder.Build(), + pir::IrNotMetException); +} + +TEST(op_test, same_operands_shape_trait_test2) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + ctx->GetOrRegisterDialect(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + pir::Type dtype1 = pir::Float32Type::get(ctx); + phi::DDim dims1 = {2, 2}; + + pir::Type dtype2 = pir::Float64Type::get(ctx); + phi::DDim dims2 = {2, 2, 2}; + + phi::DataLayout data_layout = phi::DataLayout::NCHW; + phi::LoD lod = {{0, 1, 2}}; + size_t offset = 0; + + pir::DenseTensorType dense_tensor_dtype = + pir::DenseTensorType::get(ctx, dtype1, dims1, data_layout, lod, offset); + + pir::Operation *op1 = + CreateDenseTensorOp(ctx, dims1, {"op1_temp"}, {"op1_attr"}, dtype1); + pir::Operation *op2 = + CreateDenseTensorOp(ctx, dims2, {"op2_temp"}, {"op2_attr"}, dtype2); + + EXPECT_THROW(builder.Build( + op1->result(0), op2->result(0), dense_tensor_dtype), + pir::IrNotMetException); +} + +TEST(op_test, same_operands_and_result_shape_trait_test1) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + ctx->GetOrRegisterDialect(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + EXPECT_THROW(builder.Build(), + pir::IrNotMetException); +} + +TEST(op_test, same_operands_and_result_shape_trait_test2) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + ctx->GetOrRegisterDialect(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + pir::Type dtype = pir::Float64Type::get(ctx); + phi::DDim dims = {2, 2, 2}; + + pir::Operation *op1 = + CreateDenseTensorOp(ctx, dims, {"op1_temp"}, {"op1_attr"}, dtype); + pir::Operation *op2 = + CreateDenseTensorOp(ctx, dims, {"op2_temp"}, {"op2_attr"}, dtype); + + EXPECT_THROW(builder.Build( + op1->result(0), op2->result(0)), + pir::IrNotMetException); 
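+  // Both operands above share shape {2, 2, 2}, so the expected throw
+  // exercises the trait's requirement that the op also produce at least
+  // one result (see the corresponding test op in test_op.h).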
+} + +TEST(op_test, same_operands_and_result_shape_trait_test3) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + ctx->GetOrRegisterDialect(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + pir::Type dtype1 = pir::Float32Type::get(ctx); + phi::DDim dims1 = {2, 2}; + + pir::Type dtype2 = pir::Float64Type::get(ctx); + phi::DDim dims2 = {2, 2, 2}; + + phi::DataLayout data_layout = phi::DataLayout::NCHW; + phi::LoD lod = {{0, 1, 2}}; + size_t offset = 0; + + pir::DenseTensorType dense_tensor_dtype = + pir::DenseTensorType::get(ctx, dtype1, dims1, data_layout, lod, offset); + + pir::Operation *op1 = + CreateDenseTensorOp(ctx, dims1, {"op1_temp"}, {"op1_attr"}, dtype1); + pir::Operation *op2 = + CreateDenseTensorOp(ctx, dims2, {"op2_temp"}, {"op2_attr"}, dtype2); + + EXPECT_THROW(builder.Build( + op1->result(0), op2->result(0), dense_tensor_dtype), + pir::IrNotMetException); +} + +TEST(op_test, same_operands_element_type_trait_test1) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + ctx->GetOrRegisterDialect(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + EXPECT_THROW(builder.Build(), + pir::IrNotMetException); +} + +TEST(op_test, same_operands_element_type_trait_test2) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + ctx->GetOrRegisterDialect(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + pir::Type dtype1 = pir::Float32Type::get(ctx); + pir::Type dtype2 = pir::Float64Type::get(ctx); + + phi::DDim dims = {2, 2}; + phi::DataLayout data_layout = phi::DataLayout::NCHW; + phi::LoD lod = {{0, 1, 2}}; + size_t offset = 0; + + pir::DenseTensorType dense_tensor_dtype = + pir::DenseTensorType::get(ctx, dtype1, dims, data_layout, lod, offset); + + pir::Operation *op1 = + CreateDenseTensorOp(ctx, dims, {"op1_temp"}, {"op1_attr"}, dtype1); + pir::Operation *op2 = + CreateDenseTensorOp(ctx, dims, {"op2_temp"}, {"op2_attr"}, dtype2); + + EXPECT_THROW(builder.Build( + op1->result(0), op2->result(0), dense_tensor_dtype), + pir::IrNotMetException); +} + +TEST(op_test, same_operands_and_result_element_type_trait_test1) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + ctx->GetOrRegisterDialect(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + EXPECT_THROW(builder.Build(), + pir::IrNotMetException); +} + +TEST(op_test, same_operands_and_result_element_type_trait_test2) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + ctx->GetOrRegisterDialect(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + pir::Type dtype = pir::Float32Type::get(ctx); + phi::DDim dims = {2, 2}; + + pir::Operation *op1 = + CreateDenseTensorOp(ctx, dims, {"op1_temp"}, {"op1_attr"}, dtype); + pir::Operation *op2 = + CreateDenseTensorOp(ctx, dims, {"op2_temp"}, {"op2_attr"}, dtype); + + EXPECT_THROW(builder.Build( + op1->result(0), op2->result(0)), + pir::IrNotMetException); +} + +TEST(op_test, same_operands_and_result_element_type_trait_test3) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + ctx->GetOrRegisterDialect(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + pir::Type dtype1 = 
pir::Float32Type::get(ctx); + phi::DDim dims1 = {2, 2}; + + pir::Type dtype2 = pir::Float64Type::get(ctx); + phi::DDim dims2 = {2, 2, 2}; + + phi::DataLayout data_layout = phi::DataLayout::NCHW; + phi::LoD lod = {{0, 1, 2}}; + size_t offset = 0; + + pir::DenseTensorType dense_tensor_dtype1 = + pir::DenseTensorType::get(ctx, dtype1, dims1, data_layout, lod, offset); + pir::DenseTensorType dense_tensor_dtype2 = + pir::DenseTensorType::get(ctx, dtype2, dims2, data_layout, lod, offset); + + pir::Operation *op1 = + CreateDenseTensorOp(ctx, dims1, {"op1_temp"}, {"op1_attr"}, dtype1); + pir::Operation *op2 = + CreateDenseTensorOp(ctx, dims2, {"op2_temp"}, {"op2_attr"}, dtype2); + + EXPECT_THROW(builder.Build( + op1->result(0), + op2->result(0), + dense_tensor_dtype1, + dense_tensor_dtype1), + pir::IrNotMetException); + EXPECT_THROW(builder.Build( + op1->result(0), + op1->result(0), + dense_tensor_dtype1, + dense_tensor_dtype2), + pir::IrNotMetException); +} + +TEST(op_test, same_operands_and_result_type_trait_test1) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + ctx->GetOrRegisterDialect(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + EXPECT_THROW(builder.Build(), + pir::IrNotMetException); +} + +TEST(op_test, same_operands_and_result_type_trait_test2) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + ctx->GetOrRegisterDialect(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + pir::Type dtype = pir::Float32Type::get(ctx); + phi::DDim dims = {2, 2}; + + pir::Operation *op1 = + CreateDenseTensorOp(ctx, dims, {"op1_temp"}, {"op1_attr"}, dtype); + pir::Operation *op2 = + CreateDenseTensorOp(ctx, dims, {"op2_temp"}, {"op2_attr"}, dtype); + + EXPECT_THROW(builder.Build( + op1->result(0), op2->result(0)), + pir::IrNotMetException); +} + +TEST(op_test, same_operands_and_result_type_trait_test3) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + ctx->GetOrRegisterDialect(); + + pir::Program program(ctx); + auto block = program.block(); + pir::Builder builder(ctx, block); + + pir::Type dtype1 = pir::Float32Type::get(ctx); + phi::DDim dims1 = {2, 2}; + + pir::Type dtype2 = pir::Float64Type::get(ctx); + phi::DDim dims2 = {2, 2, 2}; + + phi::DataLayout data_layout = phi::DataLayout::NCHW; + phi::LoD lod = {{0, 1, 2}}; + size_t offset = 0; + + pir::DenseTensorType dense_tensor_dtype1 = + pir::DenseTensorType::get(ctx, dtype1, dims1, data_layout, lod, offset); + + pir::DenseTensorType dense_tensor_dtype2 = + pir::DenseTensorType::get(ctx, dtype2, dims2, data_layout, lod, offset); + + pir::DenseTensorType dense_tensor_dtype3 = + pir::DenseTensorType::get(ctx, dtype1, dims2, data_layout, lod, offset); + + pir::Operation *op1 = + CreateDenseTensorOp(ctx, dims1, {"op1_temp"}, {"op1_attr"}, dtype2); + pir::Operation *op2 = + CreateDenseTensorOp(ctx, dims2, {"op2_temp"}, {"op2_attr"}, dtype1); + + EXPECT_THROW(builder.Build( + op1->result(0), + op2->result(0), + dense_tensor_dtype1, + dense_tensor_dtype2), + pir::IrNotMetException); + + EXPECT_THROW(builder.Build( + op1->result(0), + op2->result(0), + dense_tensor_dtype1, + dense_tensor_dtype3), + pir::IrNotMetException); + + EXPECT_THROW(builder.Build( + op1->result(0), + op2->result(0), + dense_tensor_dtype1, + dense_tensor_dtype1), + pir::IrNotMetException); + + EXPECT_THROW(builder.Build( + op2->result(0), + op1->result(0), + 
dense_tensor_dtype1, + dense_tensor_dtype1), + pir::IrNotMetException); +} diff --git a/test/cpp/pir/core/type_test.cc b/test/cpp/pir/core/type_test.cc index ada08b5f9bf1a..0f3581732784f 100644 --- a/test/cpp/pir/core/type_test.cc +++ b/test/cpp/pir/core/type_test.cc @@ -24,6 +24,7 @@ #include "paddle/pir/core/type.h" #include "paddle/pir/core/type_base.h" #include "paddle/pir/core/type_name.h" +#include "paddle/pir/core/type_util.h" #include "paddle/pir/core/utils.h" class TypeA {}; @@ -260,6 +261,36 @@ TEST(type_test, pd_op_dialect) { EXPECT_EQ(select_rows_dtype.offset(), offset); } +TEST(type_test, type_util) { + pir::IrContext *ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + + pir::Type fp32_dtype = pir::Float32Type::get(ctx); + phi::DDim dims1 = {2, 2}; + phi::DDim dims2 = {2, 2, 3}; + phi::DataLayout data_layout = phi::DataLayout::NCHW; + phi::LoD lod = {{0, 1, 2}}; + size_t offset = 0; + + paddle::dialect::SelectedRowsType select_rows_dtype1 = + paddle::dialect::SelectedRowsType::get( + ctx, fp32_dtype, dims1, data_layout, lod, offset); + + paddle::dialect::SelectedRowsType select_rows_dtype2 = + paddle::dialect::SelectedRowsType::get( + ctx, fp32_dtype, dims2, data_layout, lod, offset); + + std::vector types1 = { + select_rows_dtype1, select_rows_dtype1, select_rows_dtype1}; + std::vector types2 = { + select_rows_dtype1, select_rows_dtype1, select_rows_dtype1}; + std::vector types3 = { + select_rows_dtype2, select_rows_dtype2, select_rows_dtype2}; + + EXPECT_TRUE(pir::VerifyCompatibleShapes(types1, types2)); + EXPECT_FALSE(pir::VerifyCompatibleShapes(types1, types3)); +} + namespace TestNamespace { class TestClass {}; } // namespace TestNamespace diff --git a/test/cpp/pir/tools/test_dialect.cc b/test/cpp/pir/tools/test_dialect.cc index 49fb4a6951dd7..e3000a418119b 100644 --- a/test/cpp/pir/tools/test_dialect.cc +++ b/test/cpp/pir/tools/test_dialect.cc @@ -21,7 +21,24 @@ TestDialect::TestDialect(pir::IrContext *context) initialize(); } void TestDialect::initialize() { - RegisterOps(); + RegisterOps(); } void TestDialect::PrintOperation(pir::Operation *op, diff --git a/test/cpp/pir/tools/test_op.cc b/test/cpp/pir/tools/test_op.cc index b67dd24c5dc04..6041efec0e652 100644 --- a/test/cpp/pir/tools/test_op.cc +++ b/test/cpp/pir/tools/test_op.cc @@ -21,8 +21,8 @@ void RegionOp::Build(pir::Builder &builder, pir::OperationArgument &argument) { argument.AddRegion(nullptr); } -void BranchOp::Build(pir::Builder &builder, // NOLINT - pir::OperationArgument &argument, +void BranchOp::Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT const std::vector &target_operands, pir::Block *target) { argument.AddInputs(target_operands.begin(), target_operands.end()); @@ -35,9 +35,7 @@ void BranchOp::Verify() const { IR_ENFORCE((*this)->successor(0), "successor[0] can't be nullptr"); } -const char *Operation1::attributes_name[2] = { // NOLINT - "op1_attr1", - "op1_attr2"}; +const char *Operation1::attributes_name[2] = {"op1_attr1", "op1_attr2"}; void Operation1::Build(pir::Builder &builder, // NOLINT pir::OperationArgument &argument) { // NOLINT @@ -58,9 +56,120 @@ void Operation1::Verify() const { throw("Type of attribute: parameter_name is not right."); } } + +void TraitExampleOp::Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand, + pir::Type out_type) { + argument.AddInput(l_operand); + argument.AddInput(r_operand); + argument.AddOutput(out_type); +} + +void 
SameOperandsShapeTraitOp2::Build( + pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand, + pir::Type out_type) { + argument.AddInput(l_operand); + argument.AddInput(r_operand); + argument.AddOutput(out_type); +} + +void SameOperandsAndResultShapeTraitOp2::Build( + pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand) { + argument.AddInput(l_operand); + argument.AddInput(r_operand); +} + +void SameOperandsAndResultShapeTraitOp3::Build( + pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand, + pir::Type out_type) { + argument.AddInput(l_operand); + argument.AddInput(r_operand); + argument.AddOutput(out_type); +} + +void SameOperandsElementTypeTraitOp2::Build( + pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand, + pir::Type out_type) { + argument.AddInput(l_operand); + argument.AddInput(r_operand); + argument.AddOutput(out_type); +} + +void SameOperandsAndResultElementTypeTraitOp2::Build( + pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand) { + argument.AddInput(l_operand); + argument.AddInput(r_operand); +} + +void SameOperandsAndResultElementTypeTraitOp3::Build( + pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand, + pir::Type out_type1, + pir::Type out_type2) { + argument.AddInput(l_operand); + argument.AddInput(r_operand); + argument.AddOutput(out_type1); + argument.AddOutput(out_type2); +} + +void SameOperandsAndResultTypeTraitOp2::Build( + pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand) { + argument.AddInput(l_operand); + argument.AddInput(r_operand); +} + +void SameOperandsAndResultTypeTraitOp3::Build( + pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand, + pir::Type out_type1, + pir::Type out_type2) { + argument.AddInput(l_operand); + argument.AddInput(r_operand); + argument.AddOutput(out_type1); + argument.AddOutput(out_type2); +} + } // namespace test IR_DEFINE_EXPLICIT_TYPE_ID(test::RegionOp) IR_DEFINE_EXPLICIT_TYPE_ID(test::BranchOp) IR_DEFINE_EXPLICIT_TYPE_ID(test::Operation1) IR_DEFINE_EXPLICIT_TYPE_ID(test::Operation2) +IR_DEFINE_EXPLICIT_TYPE_ID(test::TraitExampleOp) +IR_DEFINE_EXPLICIT_TYPE_ID(test::SameOperandsShapeTraitOp1) +IR_DEFINE_EXPLICIT_TYPE_ID(test::SameOperandsShapeTraitOp2) +IR_DEFINE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultShapeTraitOp1) +IR_DEFINE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultShapeTraitOp2) +IR_DEFINE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultShapeTraitOp3) +IR_DEFINE_EXPLICIT_TYPE_ID(test::SameOperandsElementTypeTraitOp1) +IR_DEFINE_EXPLICIT_TYPE_ID(test::SameOperandsElementTypeTraitOp2) +IR_DEFINE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultElementTypeTraitOp1) +IR_DEFINE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultElementTypeTraitOp2) +IR_DEFINE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultElementTypeTraitOp3) +IR_DEFINE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultTypeTraitOp1) +IR_DEFINE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultTypeTraitOp2) +IR_DEFINE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultTypeTraitOp3) diff --git 
a/test/cpp/pir/tools/test_op.h b/test/cpp/pir/tools/test_op.h index 8d4ccd49a38ed..98f01db37614d 100644 --- a/test/cpp/pir/tools/test_op.h +++ b/test/cpp/pir/tools/test_op.h @@ -17,6 +17,7 @@ #include "paddle/pir/core/builder.h" #include "paddle/pir/core/builtin_type.h" #include "paddle/pir/core/op_base.h" +#include "paddle/pir/core/op_trait.h" #include "paddle/pir/core/operation_utils.h" #include "test/cpp/pir/tools/test_interface.h" #include "test/cpp/pir/tools/test_trait.h" @@ -58,7 +59,7 @@ class Operation1 : public pir::Op { using Op::Op; static const char *name() { return "test.operation1"; } static constexpr uint32_t attributes_num = 2; - static const char *attributes_name[attributes_num]; // NOLINT + static const char *attributes_name[attributes_num]; static void Build(pir::Builder &builder, // NOLINT pir::OperationArgument &argument); // NOLINT void Verify() const; @@ -71,16 +72,269 @@ class Operation2 using Op::Op; static const char *name() { return "test.operation2"; } static constexpr uint32_t attributes_num = 0; - static constexpr const char **attributes_name = nullptr; // NOLINT - static void Build(pir::Builder &builder, // NOLINT - pir::OperationArgument &argument) {} // NOLINT + static constexpr const char **attributes_name = nullptr; + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument) {} // NOLINT void Verify() const {} static void InferShape() { VLOG(2) << "This is op2's InferShape interface."; } }; +// Define TraitExampleOp. +class TraitExampleOp + : public pir::Op { + public: + using Op::Op; + static const char *name() { return "test.trait_example_op"; } + static constexpr uint32_t attributes_num = 0; + static constexpr const char **attributes_name = nullptr; + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand, + pir::Type out_type); + void Verify() const {} +}; + +// Define SameOperandsShapeTraitOp1. +class SameOperandsShapeTraitOp1 + : public pir::Op { + public: + using Op::Op; + static const char *name() { return "test.same_operands_shape_op1"; } + static constexpr uint32_t attributes_num = 0; + static constexpr const char **attributes_name = nullptr; + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument) {} // NOLINT + void Verify() const {} +}; + +// Define SameOperandsShapeTraitOp2. +class SameOperandsShapeTraitOp2 + : public pir::Op { + public: + using Op::Op; + static const char *name() { return "test.same_operands_shape_op2"; } + static constexpr uint32_t attributes_num = 0; + static constexpr const char **attributes_name = nullptr; + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand, + pir::Type out_type); + void Verify() const {} +}; + +// Define SameOperandsAndResultShapeTraitOp1. +class SameOperandsAndResultShapeTraitOp1 + : public pir::Op { + public: + using Op::Op; + static const char *name() { + return "test.same_operands_and_result_shape_op1"; + } + static constexpr uint32_t attributes_num = 0; + static constexpr const char **attributes_name = nullptr; + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument) {} // NOLINT + void Verify() const {} +}; + +// Define SameOperandsAndResultShapeTraitOp2. 
+class SameOperandsAndResultShapeTraitOp2
+    : public pir::Op<SameOperandsAndResultShapeTraitOp2,
+                     pir::SameOperandsAndResultShapeTrait> {
+ public:
+  using Op::Op;
+  static const char *name() {
+    return "test.same_operands_and_result_shape_op2";
+  }
+  static constexpr uint32_t attributes_num = 0;
+  static constexpr const char **attributes_name = nullptr;
+  static void Build(pir::Builder &builder,             // NOLINT
+                    pir::OperationArgument &argument,  // NOLINT
+                    pir::Value l_operand,
+                    pir::Value r_operand);
+  void Verify() const {}
+};
+
+// Define SameOperandsAndResultShapeTraitOp3.
+class SameOperandsAndResultShapeTraitOp3
+    : public pir::Op<SameOperandsAndResultShapeTraitOp3,
+                     pir::SameOperandsAndResultShapeTrait> {
+ public:
+  using Op::Op;
+  static const char *name() {
+    return "test.same_operands_and_result_shape_op3";
+  }
+  static constexpr uint32_t attributes_num = 0;
+  static constexpr const char **attributes_name = nullptr;
+  static void Build(pir::Builder &builder,             // NOLINT
+                    pir::OperationArgument &argument,  // NOLINT
+                    pir::Value l_operand,
+                    pir::Value r_operand,
+                    pir::Type out_type);
+  void Verify() const {}
+};
+
+// Define SameOperandsElementTypeTraitOp1.
+class SameOperandsElementTypeTraitOp1
+    : public pir::Op<SameOperandsElementTypeTraitOp1,
+                     pir::SameOperandsElementTypeTrait> {
+ public:
+  using Op::Op;
+  static const char *name() { return "test.same_operands_element_type_op1"; }
+  static constexpr uint32_t attributes_num = 0;
+  static constexpr const char **attributes_name = nullptr;
+  static void Build(pir::Builder &builder,              // NOLINT
+                    pir::OperationArgument &argument) {}  // NOLINT
+  void Verify() const {}
+};
+
+// Define SameOperandsElementTypeTraitOp2.
+class SameOperandsElementTypeTraitOp2
+    : public pir::Op<SameOperandsElementTypeTraitOp2,
+                     pir::SameOperandsElementTypeTrait> {
+ public:
+  using Op::Op;
+  static const char *name() { return "test.same_operands_element_type_op2"; }
+  static constexpr uint32_t attributes_num = 0;
+  static constexpr const char **attributes_name = nullptr;
+  static void Build(pir::Builder &builder,             // NOLINT
+                    pir::OperationArgument &argument,  // NOLINT
+                    pir::Value l_operand,
+                    pir::Value r_operand,
+                    pir::Type out_type);
+  void Verify() const {}
+};
+
+// Define SameOperandsAndResultElementTypeTraitOp1.
+class SameOperandsAndResultElementTypeTraitOp1
+    : public pir::Op<SameOperandsAndResultElementTypeTraitOp1,
+                     pir::SameOperandsAndResultElementTypeTrait> {
+ public:
+  using Op::Op;
+  static const char *name() {
+    return "test.same_operands_and_result_element_type_op1";
+  }
+  static constexpr uint32_t attributes_num = 0;
+  static constexpr const char **attributes_name = nullptr;
+  static void Build(pir::Builder &builder,              // NOLINT
+                    pir::OperationArgument &argument) {}  // NOLINT
+  void Verify() const {}
+};
+
+// Define SameOperandsAndResultElementTypeTraitOp2.
+class SameOperandsAndResultElementTypeTraitOp2
+    : public pir::Op<SameOperandsAndResultElementTypeTraitOp2,
+                     pir::SameOperandsAndResultElementTypeTrait> {
+ public:
+  using Op::Op;
+  static const char *name() {
+    return "test.same_operands_and_result_element_type_op2";
+  }
+  static constexpr uint32_t attributes_num = 0;
+  static constexpr const char **attributes_name = nullptr;
+  static void Build(pir::Builder &builder,             // NOLINT
+                    pir::OperationArgument &argument,  // NOLINT
+                    pir::Value l_operand,
+                    pir::Value r_operand);
+  void Verify() const {}
+};
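Each op above attaches exactly one trait, so every EXPECT_THROW in the tests isolates a single verification rule. As a rough sketch of the mechanism: the trait base follows the pir::OpTraitBase pattern, and a single-range overload of VerifyCompatibleShapes is assumed here (the two-range variant is the one exercised in type_test.cc above); the exact hook in op_trait.h may differ.

// Sketch only: how a shape trait could verify an operation when it is built.
class SameOperandsShapeTraitSketch
    : public pir::OpTraitBase<SameOperandsShapeTraitSketch> {
 public:
  explicit SameOperandsShapeTraitSketch(pir::Operation *op)
      : pir::OpTraitBase<SameOperandsShapeTraitSketch>(op) {}
  static void Verify(pir::Operation *op) {
    // Collect all operand types and require pairwise-compatible shapes.
    std::vector<pir::Type> types;
    for (uint32_t i = 0; i < op->num_operands(); ++i) {
      types.push_back(op->operand_source(i).type());
    }
    // Builder::Build runs trait verification after construction; a failed
    // check surfaces as pir::IrNotMetException, which the tests catch.
    IR_ENFORCE(pir::VerifyCompatibleShapes(types),
               "all operands must have compatible shapes");
  }
};
+
+// Define SameOperandsAndResultElementTypeTraitOp3.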
+class SameOperandsAndResultElementTypeTraitOp3 + : public pir::Op { + public: + using Op::Op; + static const char *name() { + return "test.same_operands_and_result_element_type_op3"; + } + static constexpr uint32_t attributes_num = 0; + static constexpr const char **attributes_name = nullptr; + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand, + pir::Type out_type1, + pir::Type out_type2); + void Verify() const {} +}; + +// Define SameOperandsAndResultTypeTraitOp1. +class SameOperandsAndResultTypeTraitOp1 + : public pir::Op { + public: + using Op::Op; + static const char *name() { return "test.same_operands_and_result_type_op1"; } + static constexpr uint32_t attributes_num = 0; + static constexpr const char **attributes_name = nullptr; + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument) {} // NOLINT + void Verify() const {} +}; + +// Define SameOperandsAndResultTypeTraitOp2. +class SameOperandsAndResultTypeTraitOp2 + : public pir::Op { + public: + using Op::Op; + static const char *name() { return "test.same_operands_and_result_type_op2"; } + static constexpr uint32_t attributes_num = 0; + static constexpr const char **attributes_name = nullptr; + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand); + void Verify() const {} +}; + +// Define SameOperandsAndResultTypeTraitOp3. +class SameOperandsAndResultTypeTraitOp3 + : public pir::Op { + public: + using Op::Op; + static const char *name() { return "test.same_operands_and_result_type_op3"; } + static constexpr uint32_t attributes_num = 0; + static constexpr const char **attributes_name = nullptr; + + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::Value l_operand, + pir::Value r_operand, + pir::Type out_type1, + pir::Type out_type2); + + void Verify() const {} +}; + } // namespace test IR_DECLARE_EXPLICIT_TYPE_ID(test::RegionOp) IR_DECLARE_EXPLICIT_TYPE_ID(test::BranchOp) IR_DECLARE_EXPLICIT_TYPE_ID(test::Operation1) IR_DECLARE_EXPLICIT_TYPE_ID(test::Operation2) +IR_DECLARE_EXPLICIT_TYPE_ID(test::TraitExampleOp) +IR_DECLARE_EXPLICIT_TYPE_ID(test::SameOperandsShapeTraitOp1) +IR_DECLARE_EXPLICIT_TYPE_ID(test::SameOperandsShapeTraitOp2) +IR_DECLARE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultShapeTraitOp1) +IR_DECLARE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultShapeTraitOp2) +IR_DECLARE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultShapeTraitOp3) +IR_DECLARE_EXPLICIT_TYPE_ID(test::SameOperandsElementTypeTraitOp1) +IR_DECLARE_EXPLICIT_TYPE_ID(test::SameOperandsElementTypeTraitOp2) +IR_DECLARE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultElementTypeTraitOp1) +IR_DECLARE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultElementTypeTraitOp2) +IR_DECLARE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultElementTypeTraitOp3) +IR_DECLARE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultTypeTraitOp1) +IR_DECLARE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultTypeTraitOp2) +IR_DECLARE_EXPLICIT_TYPE_ID(test::SameOperandsAndResultTypeTraitOp3) From 9ce9da65a914d468f3244016cde9c57f24d98fb1 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Mon, 9 Oct 2023 10:49:47 +0800 Subject: [PATCH 14/62] [CleanOps]del unuseful op (#57730) * del unuseful op --- paddle/fluid/framework/op_compatible_info.cc | 2 - paddle/fluid/operators/lstmp_op.cc | 411 ------------ paddle/fluid/operators/lstmp_op.cu | 21 - 
paddle/fluid/operators/lstmp_op.h | 610 ------------------ .../sequence_topk_avg_pooling_op.cc | 149 ----- .../sequence_topk_avg_pooling_op.h | 247 ------- paddle/fluid/operators/unity_build_rule.cmake | 4 +- test/legacy_test/CMakeLists.txt | 1 - test/legacy_test/test_lstmp_op.py | 379 ----------- .../test_sequence_topk_avg_pooling.py | 166 ----- ...ck_op_sequence_batch_1_input_white_list.py | 1 - ...op_sequence_instance_0_input_white_list.py | 1 - test/white_list/check_shape_white_list.py | 1 - test/white_list/no_grad_set_white_list.py | 1 - test/white_list/op_accuracy_white_list.py | 1 - test/white_list/op_threshold_white_list.py | 1 - 16 files changed, 1 insertion(+), 1995 deletions(-) delete mode 100644 paddle/fluid/operators/lstmp_op.cc delete mode 100644 paddle/fluid/operators/lstmp_op.cu delete mode 100644 paddle/fluid/operators/lstmp_op.h delete mode 100644 paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc delete mode 100644 paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h delete mode 100644 test/legacy_test/test_lstmp_op.py delete mode 100644 test/sequence/test_sequence_topk_avg_pooling.py diff --git a/paddle/fluid/framework/op_compatible_info.cc b/paddle/fluid/framework/op_compatible_info.cc index fe7180dd373bb..1b7bcb14295dd 100644 --- a/paddle/fluid/framework/op_compatible_info.cc +++ b/paddle/fluid/framework/op_compatible_info.cc @@ -94,8 +94,6 @@ void OpCompatibleMap::InitOpCompatibleMap() { op_compatible_map_["pull_box_sparse"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; op_compatible_map_["scatter_nd_add"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; - op_compatible_map_["sequence_topk_avg_pooling"] = { - "1.6.0", OpCompatibleType::DEFIN_NOT}; op_compatible_map_["shard_index"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; op_compatible_map_["size"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; op_compatible_map_["strided_slice"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc deleted file mode 100644 index 7af04a237de4c..0000000000000 --- a/paddle/fluid/operators/lstmp_op.cc +++ /dev/null @@ -1,411 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/lstmp_op.h" - -#include -#include - -namespace paddle { -namespace operators { - -class LSTMPOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "LSTMP"); - OP_INOUT_CHECK(ctx->HasInput("Weight"), "Input", "Weight", "LSTMP"); - OP_INOUT_CHECK(ctx->HasInput("ProjWeight"), "Input", "ProjWeight", "LSTMP"); - OP_INOUT_CHECK(ctx->HasInput("Bias"), "Input", "Bias", "LSTMP"); - - OP_INOUT_CHECK( - ctx->HasOutput("Projection"), "Output", "Projection", "LSTMP"); - OP_INOUT_CHECK(ctx->HasOutput("Cell"), "Output", "Cell", "LSTMP"); - OP_INOUT_CHECK(ctx->HasOutput("BatchGate"), "Output", "BatchGate", "LSTMP"); - OP_INOUT_CHECK(ctx->HasOutput("BatchCellPreAct"), - "Output", - "BatchCellPreAct", - "LSTMP"); - OP_INOUT_CHECK( - ctx->HasOutput("BatchHidden"), "Output", "BatchHidden", "LSTMP"); - - auto in_dims = ctx->GetInputDim("Input"); - - PADDLE_ENFORCE_EQ( - in_dims.size(), - 2, - platform::errors::InvalidArgument( - "Input(X)'s rank of LSTMP operator must be 2, but received %d.", - in_dims.size())); - - int frame_size = static_cast(in_dims[1] / 4); - auto w_dims = ctx->GetInputDim("Weight"); - auto proj_dims = ctx->GetInputDim("ProjWeight"); - PADDLE_ENFORCE_EQ( - w_dims.size(), - 2, - platform::errors::InvalidArgument( - "The rank of Input(Weight) should be 2, but received %d.", - w_dims.size())); - PADDLE_ENFORCE_EQ( - w_dims[0], - proj_dims[1], - platform::errors::InvalidArgument( - "The first dimension of Input(Weight) and the second dimension of " - "Input(ProjWeight) should be the same, but received %d vs %d.", - w_dims[0], - proj_dims[1])); - PADDLE_ENFORCE_EQ(w_dims[1], - 4 * frame_size, - platform::errors::InvalidArgument( - "The second dimension of Input(Weight) should be 4 * " - "%d, but received %d.", - frame_size, - w_dims[1])); - - PADDLE_ENFORCE_EQ( - proj_dims.size(), - 2, - platform::errors::InvalidArgument( - "The rank of Input(ProjWeight) should be 2, but received %d.", - proj_dims.size())); - PADDLE_ENFORCE_EQ(proj_dims[0], - frame_size, - platform::errors::InvalidArgument( - "The first dimension of Input(ProjWeight) should be " - "%d, but received %d.", - frame_size, - proj_dims[0])); - - if (ctx->HasInput("H0")) { - PADDLE_ENFORCE_EQ( - ctx->HasInput("C0"), - true, - platform::errors::NotFound("Input(C0) of LSTMP operator should not " - "be null after Input(H0) provided.")); - } - - auto b_dims = ctx->GetInputDim("Bias"); - PADDLE_ENFORCE_EQ( - b_dims.size(), - 2, - platform::errors::InvalidArgument( - "The rank of Input(Bias) should be 2, but received %d.", - b_dims.size())); - PADDLE_ENFORCE_EQ( - b_dims[0], - 1, - platform::errors::InvalidArgument( - "The first dimension of Input(Bias) should be 1, but received %d.", - b_dims[0])); - - if (ctx->Attrs().Get("use_peepholes")) { - PADDLE_ENFORCE_EQ( - b_dims[1], - 7 * frame_size, - platform::errors::InvalidArgument( - "The second dimension of Input(Bias) should be 7 * %d if enable " - "peepholes connection, but received %d.", - frame_size, - b_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - b_dims[1], - 4 * frame_size, - platform::errors::InvalidArgument( - "The second dimension of Input(Bias) should be 4 * %d if disable " - "peepholes connection, but received %d.", - frame_size, - b_dims[1])); - } - - framework::DDim out_dims({in_dims[0], frame_size}); - framework::DDim proj_out_dims({in_dims[0], 
proj_dims[1]}); - ctx->SetOutputDim("Projection", proj_out_dims); - ctx->SetOutputDim("Cell", out_dims); - ctx->SetOutputDim("BatchGate", in_dims); - ctx->SetOutputDim("BatchCellPreAct", out_dims); - ctx->SetOutputDim("BatchHidden", out_dims); - ctx->ShareLoD("Input", "Projection"); - ctx->ShareLoD("Input", "Cell"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "Input"), - ctx.device_context().GetPlace()); - } -}; - -class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput( - "Input", - "(phi::DenseTensor) the input for sequence data, which supports " - "variable-time length input sequence. The underlying tensor in " - "this phi::DenseTensor is a matrix with shape (T X 4D), where T is the " - "total time steps in this mini-batch, D is the hidden size."); - AddInput("H0", - "(Tensor, optional) the initial hidden state is an optional " - "input. This is a tensor with shape (N x D), where N is the " - "batch size and D is the hidden size.") - .AsDispensable(); - AddInput("C0", - "(Tensor, optional) the initial cell state is an optional " - "input. This is a tensor with shape (N x D), where N is the " - "batch size. `C0` should not be null if `H0` provided.") - .AsDispensable(); - AddInput("Weight", - "(Tensor) the learnable hidden-hidden weights." - " - The shape is (P x 4D), where P is the projection layer size " - "and D is the hidden size." - " - Weight = {W_cr, W_ir, W_fr, W_or}"); - AddInput("ProjWeight", - "(Tensor) the learnable weight of the projection layer." - " - The shape is (D x P), where P is the recurrent projection " - "layer size and D is the hidden size." - " - ProjWeight = {W_rh}"); - AddInput("Bias", - "(Tensor) the learnable biases, which contains two parts: " - "input-hidden biases and peephole connections weights if " - "setting `use_peepholes` to `True`. " - "1. `use_peepholes = False` " - " - The shape is (1 x 4D). " - " - Bias = {b_c, b_i, b_f, b_o}." - "2. `use_peepholes = True` " - " - The shape is (1 x 7D). " - " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}."); - AddOutput("Projection", - "(phi::DenseTensor) the projection of the hidden state of LSTMP " - "operator. The shape is (T x P), and LoD is the same with the " - "`Input`."); - AddOutput("Cell", - "(phi::DenseTensor) the cell state of LSTMP operator. " - "The shape is (T x D), and lod is the same with the `Input`."); - AddOutput( - "BatchGate", - "(phi::DenseTensor) This phi::DenseTensor contains input gate, forget " - "gate " - "and output gate after the activations. This phi::DenseTensor has the " - "same shape as the reorganized input, which is also be called " - "batch input. The LoD size is 2. The first-level LoD is the " - "batch offsets and the second contains the indices, which " - "denotes the position of reorganized sequence in the raw input.") - .AsIntermediate(); - AddOutput( - "BatchCellPreAct", - "(phi::DenseTensor) the pre-activation cell state reorganized in " - "batch. " - "This phi::DenseTensor is obtained in the forward and used in the " - "backward.") - .AsIntermediate(); - AddOutput( - "BatchHidden", - "(phi::DenseTensor) the hidden state reorganized in batch. 
" - "This phi::DenseTensor is obtained in the forward and used in the " - "backward.") - .AsIntermediate(); - AddAttr("use_peepholes", - "(bool, default: True) " - "whether to enable diagonal/peephole connections.") - .SetDefault(true); - AddAttr("is_reverse", - "(bool, default: False) " - "whether to compute reversed LSTMP.") - .SetDefault(false); - AddAttr("cell_clip", - "(float, default: 0.0) " - "Clip for Tensor for cell state tensor when clip value is " - "greater than 0.0") - .SetDefault(0.0); - AddAttr("proj_clip", - "(float, default: 0.0) " - "Clip for Tensor for projection tensor when clip value is " - "greater than 0.0") - .SetDefault(0.0); - AddAttr( - "gate_activation", - "(string, default: sigmoid)" - "The activation for input gate, forget gate and output " - "gate, `sigmoid` by default.") - .SetDefault("sigmoid") - .InEnum({"sigmoid", "tanh", "relu", "identity"}); - AddAttr("cell_activation", - "(string, default: tanh)" - "The activation for cell output, `tanh` by default.") - .SetDefault("tanh") - .InEnum({"sigmoid", "tanh", "relu", "identity"}); - AddAttr("candidate_activation", - "(string, default: tanh)" - "The activation for candidate hidden state, " - "`tanh` by default.") - .SetDefault("tanh") - .InEnum({"sigmoid", "tanh", "relu", "identity"}); - AddAttr("proj_activation", - "(string, default: tanh)" - "The activation for projection output, " - "`tanh` by default.") - .SetDefault("tanh") - .InEnum({"sigmoid", "tanh", "relu", "identity"}); - AddComment(R"DOC( -Long-Short Term Memory with recurrent Projection layer (LSTMP) Operator. - -LSTMP has a separate projection layer after the LSTM layer, projecting the -original hidden state to a lower-dimensional one, which is proposed to reduce -the number of total parameters and furthermore computational complexity for -the LSTM, espeacially for the case that the size of output units is relative -large (https://research.google.com/pubs/archive/43905.pdf). - -The formula is as follows: - -$$ -i_t = \sigma(W_{ix}x_{t} + W_{ir}r_{t-1} + W_{ic}c_{t-1} + b_i) \\ - -f_t = \sigma(W_{fx}x_{t} + W_{fr}r_{t-1} + W_{fc}c_{t-1} + b_f) \\ - -\tilde{c_t} = act_g(W_{cx}x_t + W_{cr}r_{t-1} + b_c) \\ - -o_t = \sigma(W_{ox}x_{t} + W_{or}r_{t-1} + W_{oc}c_t + b_o) \\ - -c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\ - -h_t = o_t \odot act_h(c_t) \\ - -r_t = \overline{act_h}(W_{rh}h_t) -$$ - -where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix -of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$ -are diagonal weight matrices for peephole connections. In our implementation, -we use vectors to represent these diagonal weight matrices. The b terms -denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$ -is the activation, such as logistic sigmoid function, and -$i, f, o$ and $c$ are the input gate, forget gate, output gate, -and cell activation vectors, respectively, all of which have the same size as -the cell output activation vector $h$. Here $h$ is usually called the hidden -state and $r$ denotes its recurrent projection. And $\tilde{c_t}$ is also -called the candidate hidden state, whose computation is based on the current -input and previous hidden state. - -The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$ -are the cell input and cell output activation functions and `tanh` is usually -used for them. $\overline{act_h}$ is the activation function for the -projection output, usually using `identity` or same as $act_h$. 
- -Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$ -operations on the input $x_{t}$ are NOT included in this operator. -Users can choose to use fully-connected operator before LSTMP operator. - -)DOC"); - } -}; - -template -class LSTMPGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("lstmp_grad"); - grad_op->SetInput("Weight", this->Input("Weight")); - grad_op->SetInput("ProjWeight", this->Input("ProjWeight")); - grad_op->SetInput("Bias", this->Input("Bias")); - - grad_op->SetInput("Projection", this->Output("Projection")); - grad_op->SetInput("Cell", this->Output("Cell")); - grad_op->SetInput("BatchGate", this->Output("BatchGate")); - grad_op->SetInput("BatchCellPreAct", this->Output("BatchCellPreAct")); - grad_op->SetInput("BatchHidden", this->Output("BatchHidden")); - grad_op->SetInput("H0", this->Input("H0")); - grad_op->SetInput("C0", this->Input("C0")); - - grad_op->SetInput(framework::GradVarName("Projection"), - this->OutputGrad("Projection")); - - grad_op->SetOutput(framework::GradVarName("Input"), - this->InputGrad("Input")); - grad_op->SetOutput(framework::GradVarName("Weight"), - this->InputGrad("Weight")); - grad_op->SetOutput(framework::GradVarName("ProjWeight"), - this->InputGrad("ProjWeight")); - grad_op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias")); - grad_op->SetOutput(framework::GradVarName("H0"), this->InputGrad("H0")); - grad_op->SetOutput(framework::GradVarName("C0"), this->InputGrad("C0")); - - grad_op->SetAttrMap(this->Attrs()); - } -}; - -class LSTMPGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("Projection"), "Input", "Projection", "LSTMP@Grad"); - OP_INOUT_CHECK(ctx->HasInput("Cell"), "Input", "Cell", "LSTMP@Grad"); - OP_INOUT_CHECK(ctx->HasInput("Weight"), "Input", "Weight", "LSTMP@Grad"); - OP_INOUT_CHECK( - ctx->HasInput("ProjWeight"), "Input", "ProjWeight", "LSTMP@Grad"); - OP_INOUT_CHECK(ctx->HasInput("Bias"), "Input", "Bias", "LSTMP@Grad"); - - OP_INOUT_CHECK( - ctx->HasInput("BatchGate"), "Input", "BatchGate", "LSTMP@Grad"); - OP_INOUT_CHECK(ctx->HasInput("BatchCellPreAct"), - "Input", - "BatchCellPreAct", - "LSTMP@Grad"); - - auto SetOutGradDim = [&ctx](const std::string& name) { - auto g_name = framework::GradVarName(name); - if (ctx->HasOutput(g_name)) - ctx->SetOutputDim(g_name, ctx->GetInputDim(name)); - }; - - ctx->SetOutputDim(framework::GradVarName("Input"), - ctx->GetInputDim("BatchGate")); - SetOutGradDim("Weight"); - SetOutGradDim("ProjWeight"); - SetOutGradDim("Bias"); - SetOutGradDim("H0"); - SetOutGradDim("C0"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey( - OperatorWithKernel::IndicateVarDataType(ctx, "BatchGate"), - ctx.device_context().GetPlace()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(lstmp, - ops::LSTMPOp, - ops::LSTMPOpMaker, - ops::LSTMPGradMaker, - ops::LSTMPGradMaker); -REGISTER_OPERATOR(lstmp_grad, ops::LSTMPGradOp); -PD_REGISTER_STRUCT_KERNEL( - lstmp, CPU, ALL_LAYOUT, ops::LSTMPKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL( - lstmp_grad, CPU, ALL_LAYOUT, ops::LSTMPGradKernel, float, 
double) {} diff --git a/paddle/fluid/operators/lstmp_op.cu b/paddle/fluid/operators/lstmp_op.cu deleted file mode 100644 index 5559d09f1b9ba..0000000000000 --- a/paddle/fluid/operators/lstmp_op.cu +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/lstmp_op.h" - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL( - lstmp, GPU, ALL_LAYOUT, ops::LSTMPKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL( - lstmp_grad, GPU, ALL_LAYOUT, ops::LSTMPGradKernel, float, double) {} diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h deleted file mode 100644 index fd9032c730af8..0000000000000 --- a/paddle/fluid/operators/lstmp_op.h +++ /dev/null @@ -1,610 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/phi/common/transform.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/detail/activation_functions.h" -#include "paddle/phi/kernels/funcs/lstm_compute.h" -#include "paddle/phi/kernels/funcs/sequence2batch.h" - -namespace paddle { -namespace operators { - -using phi::Transform; - -template -using EigenMatrix = framework::EigenMatrix; - -template -class _ClipFunctor { - public: - explicit _ClipFunctor(const T min, const T max) : min_(min), max_(max) {} - HOSTDEVICE T operator()(const T& x) const { - if (x < min_) - return min_; - else if (x > max_) - return max_; - else - return x; - } - - private: - T min_; - T max_; -}; - -template -class _ClipGradFunctor { - public: - explicit _ClipGradFunctor(const T min, const T max) : min_(min), max_(max) {} - HOSTDEVICE T operator()(const T& x, const T& y) const { - return (y > min_ && y < max_) ? 
x : 0; - } - - private: - T min_; - T max_; -}; - -template -inline void ReorderInitState(const DeviceContext& ctx, - const phi::DenseTensor& src, - phi::Vector index, - phi::DenseTensor* dst, - bool indexed_src) { - phi::funcs::CopyMatrixRowsFunctor row_shuffle; - dst->mutable_data(src.dims(), ctx.GetPlace()); - row_shuffle(ctx, src, index, dst, indexed_src); -} - -template -class LSTMPKernel : public framework::OpKernel { - public: - template - void ActCompute(const phi::funcs::detail::ActivationType act_type, - const Device& d, - X x, - Y y, - platform::Place place) const { - if (act_type == phi::funcs::detail::ActivationType::kIdentity) { - y.device(d) = x; - } else if (act_type == phi::funcs::detail::ActivationType::kSigmoid) { - SigmoidFunctor()(d, x, y); - } else if (act_type == phi::funcs::detail::ActivationType::kTanh) { - TanhFunctor()(d, x, y); - } else if (act_type == phi::funcs::detail::ActivationType::kReLU) { - if (place == platform::CPUPlace()) - ReluCPUFunctor()(d, x, y); - else - ReluCUDAFunctor()(d, x, y); - } else { - PADDLE_THROW( - platform::errors::InvalidArgument("unsupported activation type")); - } - } - - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* weight = ctx.Input("Weight"); - auto* proj_weight = ctx.Input("ProjWeight"); - auto* bias = ctx.Input("Bias"); - - auto* hidden_t0 = ctx.Input("H0"); - auto* cell_t0 = ctx.Input("C0"); - - auto proj_clip = static_cast(ctx.Attr("proj_clip")); - auto cell_clip = static_cast(ctx.Attr("cell_clip")); - - auto* batch_gate = ctx.Output("BatchGate"); - batch_gate->mutable_data(ctx.GetPlace()); - auto* proj_out = ctx.Output("Projection"); - proj_out->mutable_data(ctx.GetPlace()); - auto* cell_out = ctx.Output("Cell"); - cell_out->mutable_data(ctx.GetPlace()); - - bool is_reverse = ctx.Attr("is_reverse"); - phi::funcs::LoDTensor2BatchFunctor to_batch; - auto& device_ctx = ctx.template device_context(); - to_batch(device_ctx, *input, batch_gate, true, is_reverse); - - auto in_dims = input->dims(); - int frame_size = static_cast(in_dims[1] / 4); - framework::DDim dims({in_dims[0], frame_size}); - framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]}); - - if (bias) { - phi::DenseTensor b = *bias; - b.Resize({bias->numel(), 1}); - phi::DenseTensor gate_bias = b.Slice(0, 4 * frame_size); - phi::funcs::RowwiseAdd add_bias; - add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); - } - - phi::funcs::LstmMetaValue lstmp_value; - if (bias && ctx.Attr("use_peepholes")) { - T* bias_data = const_cast(bias->data()); - // the code style in LstmpMetaValue will be updated later. - - lstmp_value.check_ig = bias_data + 4 * frame_size; - lstmp_value.check_fg = lstmp_value.check_ig + frame_size; - lstmp_value.check_og = lstmp_value.check_fg + frame_size; - } else { - lstmp_value.check_ig = nullptr; - lstmp_value.check_fg = nullptr; - lstmp_value.check_og = nullptr; - } - lstmp_value.prev_state_value = nullptr; - phi::DenseTensor ordered_c0; - phi::DenseTensor ordered_h0; - - phi::Vector order(batch_gate->lod()[2]); - - if (cell_t0) { - // Since the batch computing for LSTMP reorders the input sequence - // according to their length. The initialized cell state also needs - // to reorder. - ReorderInitState( - device_ctx, *cell_t0, order, &ordered_c0, true); - lstmp_value.prev_state_value = ordered_c0.data(); - } - - // Use the local variable as here. 
- phi::DenseTensor batch_proj, batch_cell; - auto* batch_cell_pre_act = ctx.Output("BatchCellPreAct"); - batch_cell_pre_act->mutable_data(dims, ctx.GetPlace()); - auto* batch_hidden = ctx.Output("BatchHidden"); - batch_hidden->mutable_data(dims, ctx.GetPlace()); // T x D - batch_proj.mutable_data(proj_dims, ctx.GetPlace()); // T x P - batch_cell.mutable_data(dims, ctx.GetPlace()); // T x D - - auto batch_starts = batch_gate->lod()[0]; - size_t num_batch = batch_starts.size() - 1; - auto gate_act = phi::funcs::detail::GetActivationType( - ctx.Attr("gate_activation")); - auto cell_act = phi::funcs::detail::GetActivationType( - ctx.Attr("cell_activation")); - auto cand_act = phi::funcs::detail::GetActivationType( - ctx.Attr("candidate_activation")); - auto proj_act = phi::funcs::detail::GetActivationType( - ctx.Attr("proj_activation")); - auto& place = *ctx.template device_context().eigen_device(); - auto blas = phi::funcs::GetBlas(device_ctx); - for (size_t n = 0; n < num_batch; n++) { - int bstart = static_cast(batch_starts[n]); - int bend = static_cast(batch_starts[n + 1]); - - phi::DenseTensor gate_t = batch_gate->Slice(bstart, bend); - phi::DenseTensor hidden_t = batch_hidden->Slice(bstart, bend); - phi::DenseTensor proj_t = batch_proj.Slice(bstart, bend); - phi::DenseTensor cell_t = batch_cell.Slice(bstart, bend); - phi::DenseTensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend); - - int cur_batch_size = bend - bstart; - - if (n > 0) { - int pre_h_start = static_cast(batch_starts[n - 1]); - int pre_h_end = pre_h_start + cur_batch_size; - auto pre_proj_t = batch_proj.Slice(pre_h_start, pre_h_end); - blas.MatMul(pre_proj_t, - false, - *weight, - false, - static_cast(1.0), - &gate_t, - static_cast(1.0)); - } else if (hidden_t0) { - // If n == 0 and there is no initialized hidden state, that is to say - // the H0 is zeros, the calculation W_h * H0 will be skiped. - // If n == 0 and there is initialized hidden state, calculate W_h * H0. - - // Since the batch computing for LSTMP reorders the input sequence - // according to their length. The initialized hidden state also needs - // to reorder. 
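// Illustration with hypothetical lengths: given three sequences of lengths
// {3, 1, 2}, LoDTensor2BatchFunctor processes them longest-first, so order
// would be {0, 2, 1}; ReorderInitState then shuffles H0's rows with that same
// index vector so row k of ordered_h0 matches the k-th reordered sequence.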
- ReorderInitState( - device_ctx, *hidden_t0, order, &ordered_h0, true); - blas.MatMul(ordered_h0, - false, - *weight, - false, - static_cast(1.0), - &gate_t, - static_cast(1.0)); - } - - lstmp_value.gate_value = gate_t.data(); - lstmp_value.output_value = hidden_t.data(); - lstmp_value.state_value = cell_t.data(); - lstmp_value.state_active_value = cell_pre_act_t.data(); - phi::funcs::LstmUnitFunctor::compute(device_ctx, - lstmp_value, - frame_size, - cur_batch_size, - cell_clip, - gate_act, - cell_act, - cand_act); - lstmp_value.prev_state_value = lstmp_value.state_value; - blas.MatMul(hidden_t, - false, - *proj_weight, - false, - static_cast(1.0), - &proj_t, - static_cast(0.0)); - if (proj_act != phi::funcs::detail::ActivationType::kIdentity) { - auto proj_t_dev = EigenMatrix::From(proj_t); - ActCompute(cell_act, place, proj_t_dev, proj_t_dev, ctx.GetPlace()); - } - if (proj_clip && proj_clip > 0.0) { - T* x_data = proj_t.data(); - int64_t numel = proj_t.numel(); - Transform trans; - trans(ctx.template device_context(), - x_data, - x_data + numel, - x_data, - _ClipFunctor(-1.0 * proj_clip, proj_clip)); - } - } - - phi::funcs::Batch2LoDTensorFunctor to_seq; - batch_proj.set_lod(batch_gate->lod()); - // restore the output hidden in phi::DenseTensor from the batch hidden - to_seq(device_ctx, batch_proj, proj_out); - - batch_cell.set_lod(batch_gate->lod()); - // restore the output cell state in phi::DenseTensor from the batch cell - to_seq(device_ctx, batch_cell, cell_out); - } -}; - -template -class LSTMPGradKernel : public framework::OpKernel { - public: - template - void ActGradCompute(const phi::funcs::detail::ActivationType act_type, - const Device& d, - X x, - Y y, - DX dx, - DY dy) const { - // x is dummy and won't be used even in Relu(use y instead) - if (act_type == phi::funcs::detail::ActivationType::kIdentity) - dx.device(d) = dy; - else if (act_type == phi::funcs::detail::ActivationType::kSigmoid) - SigmoidGradFunctor()(d, x, y, dy, dx); - else if (act_type == phi::funcs::detail::ActivationType::kTanh) - TanhGradFunctor()(d, x, y, dy, dx); - else if (act_type == phi::funcs::detail::ActivationType::kReLU) - ReluGradFunctor()(d, x, y, dy, dx); - else - PADDLE_THROW( - platform::errors::InvalidArgument("unsupported activation type")); - } - - void Compute(const framework::ExecutionContext& ctx) const override { - auto* weight = ctx.Input("Weight"); - auto* proj_weight = ctx.Input("ProjWeight"); - auto* bias = ctx.Input("Bias"); - - auto* proj_out = ctx.Input("Projection"); - auto* cell_out = ctx.Input("Cell"); - - auto proj_clip = static_cast(ctx.Attr("proj_clip")); - auto cell_clip = static_cast(ctx.Attr("cell_clip")); - - auto* batch_gate = ctx.Input("BatchGate"); - auto* batch_cell_pre_act = ctx.Input("BatchCellPreAct"); - auto* batch_hidden = ctx.Input("BatchHidden"); - - auto* projection_g = - ctx.Input(framework::GradVarName("Projection")); - - auto* in_g = ctx.Output(framework::GradVarName("Input")); - auto* weight_g = - ctx.Output(framework::GradVarName("Weight")); - auto* proj_weight_g = - ctx.Output(framework::GradVarName("ProjWeight")); - auto* bias_g = ctx.Output(framework::GradVarName("Bias")); - - auto* h0 = ctx.Input("H0"); - auto* c0 = ctx.Input("C0"); - - auto* h0_g = ctx.Output(framework::GradVarName("H0")); - auto* c0_g = ctx.Output(framework::GradVarName("C0")); - - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; - if (weight_g) { - weight_g->mutable_data(ctx.GetPlace()); - zero(device_ctx, weight_g, static_cast(0.0)); - } - if 
(proj_weight_g) { - proj_weight_g->mutable_data(ctx.GetPlace()); - zero(device_ctx, proj_weight_g, static_cast(0.0)); - } - - // ordered_h0/c0 is the reordered hidden/cell initialization. - // ordered_h0_g/c0_g is the reordered gradient of hidden/cell - // initialization. - phi::DenseTensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; - - phi::Vector order(batch_gate->lod()[2]); - - if (c0) { - ReorderInitState( - device_ctx, *c0, order, &ordered_c0, true); - } - if (c0 && c0_g) { - ordered_c0_g.mutable_data(c0_g->dims(), ctx.GetPlace()); - } - - // batch_gate dims equal to input dims - auto in_dims = batch_gate->dims(); - auto out_dims = cell_out->dims(); - framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]}); - int frame_size = static_cast(in_dims[1] / 4); - PADDLE_ENFORCE_EQ(frame_size, - out_dims[1], - platform::errors::InvalidArgument( - "The second dimension of Input(Cell) should be %d, " - "but received %d in LSTMP@Grad operator.", - frame_size, - out_dims[1])); - - phi::funcs::LstmMetaValue lstmp_value; - if (bias && ctx.Attr("use_peepholes")) { - T* bias_data = const_cast(bias->data()); - lstmp_value.check_ig = bias_data + 4 * frame_size; - lstmp_value.check_fg = lstmp_value.check_ig + frame_size; - lstmp_value.check_og = lstmp_value.check_fg + frame_size; - } else { - lstmp_value.check_ig = nullptr; - lstmp_value.check_fg = nullptr; - lstmp_value.check_og = nullptr; - } - - phi::funcs::LstmMetaGrad lstmp_grad; - - if (bias && bias_g) { - bias_g->mutable_data(ctx.GetPlace()); - zero(device_ctx, bias_g, static_cast(0.0)); - } - if (bias && bias_g && ctx.Attr("use_peepholes")) { - T* bias_g_data = bias_g->data(); - lstmp_grad.check_ig_grad = bias_g_data + 4 * frame_size; - lstmp_grad.check_fg_grad = lstmp_grad.check_ig_grad + frame_size; - lstmp_grad.check_og_grad = lstmp_grad.check_fg_grad + frame_size; - } else { - lstmp_grad.check_ig_grad = nullptr; - lstmp_grad.check_fg_grad = nullptr; - lstmp_grad.check_og_grad = nullptr; - } - - phi::funcs::LoDTensor2BatchFunctor to_batch; - - auto ToBatch = [&batch_gate, &to_batch](const DeviceContext& ctx, - const phi::DenseTensor& src, - const framework::DDim& dims, - phi::DenseTensor& dst) { - dst.mutable_data(dims, ctx.GetPlace()); - dst.set_lod(batch_gate->lod()); - to_batch(ctx, src, &dst, false); - }; - - phi::DenseTensor batch_hidden_g, batch_proj, batch_proj_g, batch_cell; - batch_hidden_g.mutable_data(out_dims, ctx.GetPlace()); - ToBatch(device_ctx, *proj_out, proj_dims, batch_proj); // T x P - ToBatch(device_ctx, *projection_g, proj_dims, batch_proj_g); // T x P - ToBatch(device_ctx, *cell_out, out_dims, batch_cell); // T x D - - phi::DenseTensor batch_cell_g, batch_gate_g; - batch_cell_g.mutable_data(out_dims, ctx.GetPlace()); - // TODO(qingqing) support the case output cell has gradient. 
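// Until that TODO is addressed, the incoming cell-state gradient is simply
// zero-filled (see the zero(...) call just below), so only the gradient of
// Projection drives this backward pass.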
- // to_batch(device_ctx, *cell_g, batch_cell_g, false); - zero(device_ctx, &batch_cell_g, static_cast(0.0)); - batch_gate_g.mutable_data(batch_gate->dims(), ctx.GetPlace()); - batch_gate_g.set_lod(batch_gate->lod()); - - auto gate_act = phi::funcs::detail::GetActivationType( - ctx.Attr("gate_activation")); - auto cell_act = phi::funcs::detail::GetActivationType( - ctx.Attr("cell_activation")); - auto cand_act = phi::funcs::detail::GetActivationType( - ctx.Attr("candidate_activation")); - auto proj_act = phi::funcs::detail::GetActivationType( - ctx.Attr("proj_activation")); - auto& place = *ctx.template device_context().eigen_device(); - - auto batch_starts = batch_gate->lod()[0]; - size_t num_batch = batch_starts.size() - 1; - auto blas = phi::funcs::GetBlas(device_ctx); - for (int n = static_cast(num_batch) - 1; n >= 0; n--) { - int bstart = static_cast(batch_starts[n]); - int bend = static_cast(batch_starts[n + 1]); - - phi::DenseTensor cur_proj = batch_proj.Slice(bstart, bend); - phi::DenseTensor proj_g = batch_proj_g.Slice(bstart, bend); - - if (proj_clip && proj_clip > 0.0) { - T* dx_data = proj_g.data(); - T* x_data = cur_proj.data(); - int64_t numel = proj_g.numel(); - Transform trans; - trans(ctx.template device_context(), - dx_data, - dx_data + numel, - x_data, - dx_data, - _ClipGradFunctor(-1.0 * proj_clip, proj_clip)); - } - - if (proj_act != phi::funcs::detail::ActivationType::kIdentity) { - auto cur_proj_dev = EigenMatrix::From(cur_proj); - auto proj_g_dev = EigenMatrix::From(proj_g); - ActGradCompute(cell_act, - place, - cur_proj_dev, - cur_proj_dev, - proj_g_dev, - proj_g_dev); - } - /* hidden state backwarad */ - phi::DenseTensor out_g = batch_hidden_g.Slice(bstart, bend); - blas.MatMul(proj_g, - false, - *proj_weight, - true, - static_cast(1.0), - &out_g, - static_cast(0.0)); - /* projection weight backward*/ - if (proj_weight_g) { - phi::DenseTensor hidden_t = batch_hidden->Slice(bstart, bend); - blas.MatMul(hidden_t, - true, - proj_g, - false, - static_cast(1.0), - proj_weight_g, - static_cast(1.0)); - } - - phi::DenseTensor gate = batch_gate->Slice(bstart, bend); - phi::DenseTensor cell = batch_cell.Slice(bstart, bend); - phi::DenseTensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend); - lstmp_value.gate_value = gate.data(); - lstmp_value.state_value = cell.data(); - lstmp_value.state_active_value = cell_pre_act.data(); - - phi::DenseTensor gate_g = batch_gate_g.Slice(bstart, bend); - phi::DenseTensor cell_g = batch_cell_g.Slice(bstart, bend); - lstmp_grad.state_grad = cell_g.data(); - lstmp_grad.gate_grad = gate_g.data(); - lstmp_grad.output_grad = out_g.data(); - - if (n > 0) { - int bstart_pre = static_cast(batch_starts[n - 1]); - phi::DenseTensor cell_pre = batch_cell.Slice(bstart_pre, bstart); - phi::DenseTensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart); - lstmp_value.prev_state_value = cell_pre.data(); - lstmp_grad.prev_state_grad = cell_pre_g.data(); - } else { - lstmp_value.prev_state_value = c0 ? ordered_c0.data() : nullptr; - lstmp_grad.prev_state_grad = c0_g ? 
ordered_c0_g.data() : nullptr; - } - - int cur_batch_size = bend - bstart; - // lstmp_value.output_value not used in bp, set to null - // lstmp_grad.state_active_grad not used in bp, set to null - lstmp_value.output_value = nullptr; - lstmp_grad.state_active_grad = nullptr; - - phi::funcs::LstmUnitGradFunctor::compute(device_ctx, - lstmp_value, - lstmp_grad, - frame_size, - cur_batch_size, - cell_clip, - gate_act, - cell_act, - cand_act); - - if (n > 0) { - int pre_h_start = static_cast(batch_starts[n - 1]); - int pre_h_end = pre_h_start + cur_batch_size; - auto pre_proj_g = batch_proj_g.Slice(pre_h_start, pre_h_end); - blas.MatMul(gate_g, - false, - *weight, - true, - static_cast(1.0), - &pre_proj_g, - static_cast(1.0)); - if (weight_g) { - /* weight backward*/ - auto pre_proj = batch_proj.Slice(pre_h_start, pre_h_end); - blas.MatMul(pre_proj, - true, - gate_g, - false, - static_cast(1.0), - weight_g, - static_cast(1.0)); - } - } else { - if (h0 && weight_g) { - ReorderInitState( - device_ctx, *h0, order, &ordered_h0, true); - if (weight_g) { - blas.MatMul(ordered_h0, - true, - gate_g, - false, - static_cast(1.0), - weight_g, - static_cast(1.0)); - } - } - if (h0 && (h0_g || proj_weight_g)) { - ordered_h0_g.mutable_data(h0_g->dims(), ctx.GetPlace()); - blas.MatMul(gate_g, - false, - *weight, - true, - static_cast(1.0), - &ordered_h0_g, - static_cast(0.0)); - } - } - } - - phi::funcs::Batch2LoDTensorFunctor to_seq; - if (in_g) { - /* backward data */ - in_g->mutable_data(ctx.GetPlace()); - to_seq(device_ctx, batch_gate_g, in_g); - } - if (bias && bias_g) { - /* backward bias */ - phi::DenseTensor b_g = *bias_g; - b_g.Resize({bias_g->numel(), 1}); - phi::DenseTensor gate_bias_g = b_g.Slice(0, 4 * frame_size); - phi::funcs::ColwiseSum col_sum; - col_sum(device_ctx, batch_gate_g, &gate_bias_g); - } - - if (h0 && h0_g) { - ReorderInitState( - device_ctx, ordered_h0_g, order, h0_g, false); - } - if (c0 && c0_g) { - ReorderInitState( - device_ctx, ordered_c0_g, order, c0_g, false); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc deleted file mode 100644 index c8ce5475e545b..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc +++ /dev/null @@ -1,149 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h" - -#include -#include - -namespace paddle { -namespace operators { - -class SequenceTopkAvgPoolingOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SequenceTopkAvgPooling"); - OP_INOUT_CHECK( - ctx->HasInput("ROW"), "Input", "ROW", "SequenceTopkAvgPooling"); - OP_INOUT_CHECK( - ctx->HasInput("COLUMN"), "Input", "COLUMN", "SequenceTopkAvgPooling"); - OP_INOUT_CHECK( - ctx->HasOutput("Out"), "Output", "Out", "SequenceTopkAvgPooling"); - OP_INOUT_CHECK( - ctx->HasOutput("pos"), "Output", "pos", "SequenceTopkAvgPooling"); - - auto attr = ctx->Attrs(); - auto channel_num = attr.Get("channel_num"); - PADDLE_ENFORCE_GT( - channel_num, - 0, - platform::errors::InvalidArgument( - "Expected channel_num > 0, but received %d.", channel_num)); - - auto topks = attr.Get>("topks"); - auto num_k = topks.size(); - PADDLE_ENFORCE_GT( - num_k, - 0, - platform::errors::InvalidArgument( - "Expected topks.size() > 0, but received %zu.", num_k)); - - auto row_dim = ctx->GetInputDim("ROW"); - auto row_shape_0 = row_dim[0]; - - std::vector vec_out_shape; - vec_out_shape.push_back(row_shape_0); // NOLINT - vec_out_shape.push_back(channel_num * num_k); // NOLINT - - ctx->SetOutputDim("Out", phi::make_ddim(vec_out_shape)); - ctx->ShareLoD("ROW", "Out"); - } -}; - -class SequenceTopkAvgPoolingOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(LoDTensor) The variable-length input of SequenceTopkPoolingOp"); - AddInput("ROW", "(LoDTensor) the row info"); - AddInput("COLUMN", "(LoDTensor) the column info"); - AddOutput( - "Out", - "(Tensor) The output of SequenceTopkPoolingOp does not contain LoD " - "information."); - AddOutput("pos", "(Tensor) store the topk index ").AsIntermediate(); - AddAttr>("topks", "topks"); - AddAttr("channel_num", "channel number"); - AddComment(R"DOC( - sequecen topk average pooling op - )DOC"); - } -}; - -class SequenceTopkAvgPoolingGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "SequenceTopkAvgPoolingGrad"); - OP_INOUT_CHECK( - ctx->HasInput("X"), "Input", "X", "SequenceTopkAvgPoolingGrad"); - - ctx->ShareDim("X", /*->*/ framework::GradVarName("X")); - ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return phi::KernelKey(data_type, ctx.GetPlace()); - } -}; - -template -class SequenceTopkAvgPoolGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op_desc_ptr) const override { - op_desc_ptr->SetType("sequence_topk_avg_pooling_grad"); - op_desc_ptr->SetInput("X", this->Input("X")); - op_desc_ptr->SetInput("ROW", this->Input("ROW")); - op_desc_ptr->SetInput("COLUMN", this->Input("COLUMN")); - op_desc_ptr->SetInput("pos", this->Output("pos")); - op_desc_ptr->SetInput(framework::GradVarName("Out"), - this->OutputGrad("Out")); - 
op_desc_ptr->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op_desc_ptr->SetAttrMap(this->Attrs()); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - sequence_topk_avg_pooling, - ops::SequenceTopkAvgPoolingOp, - ops::SequenceTopkAvgPoolingOpMaker, - ops::SequenceTopkAvgPoolGradOpMaker, - ops::SequenceTopkAvgPoolGradOpMaker); -REGISTER_OPERATOR(sequence_topk_avg_pooling_grad, - ops::SequenceTopkAvgPoolingGradOp); -PD_REGISTER_STRUCT_KERNEL(sequence_topk_avg_pooling, - CPU, - ALL_LAYOUT, - ops::SequenceTopkAvgPoolingKernel, - float) {} -PD_REGISTER_STRUCT_KERNEL(sequence_topk_avg_pooling_grad, - CPU, - ALL_LAYOUT, - ops::SequenceTopkAvgPoolingGradKernel, - float) {} diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h deleted file mode 100644 index df69acc748872..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h +++ /dev/null @@ -1,247 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = phi::DenseTensor; -using LoDTensor = phi::DenseTensor; -static constexpr int TopKPosPaddingId = -1; - -namespace details { - -template -static void get_topk_pos(const T* data, int length, int k, int* pos) { - VLOG(3) << "length: " << length << " , k : " << k; - - std::priority_queue, - std::vector>, - std::greater>> - topk_queue; - - for (int i = 0; i < length; ++i) { - T elem = data[i]; - if (topk_queue.size() < static_cast(k)) { - topk_queue.emplace(elem, i); - } else { - if (elem >= topk_queue.top().first) { - // replace top node if found a bigger value - topk_queue.pop(); - topk_queue.emplace(elem, i); - } - } - } - // reversely assign value - int real_k = topk_queue.size(); - for (int i = real_k - 1; i >= 0; --i) { - pos[i] = topk_queue.top().second; - topk_queue.pop(); - } - // if length of data is less than k, fill TopKPosPaddingId at the end of pos. 
- for (int i = real_k; i < k; ++i) { - pos[i] = TopKPosPaddingId; - } -} -} // namespace details - -template -class SequenceTopkAvgPoolingKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* row = context.Input("ROW"); - auto* col = context.Input("COLUMN"); - auto* out = context.Output("Out"); - auto* pos = context.Output("pos"); - - PADDLE_ENFORCE_EQ( - in->lod().empty(), - false, - platform::errors::InvalidArgument( - "Input(X) Tensor of SequenceTopkAvgPoolingOp does not " - "contain LoD information.")); - PADDLE_ENFORCE_EQ( - row->lod().empty(), - false, - platform::errors::InvalidArgument( - "Input(ROW) Tensor of SequenceTopkAvgPoolingOp does not " - "contain LoD information.")); - PADDLE_ENFORCE_EQ( - col->lod().empty(), - false, - platform::errors::InvalidArgument( - "Input(COLUMN) Tensor of SequenceTopkAvgPoolingOp does " - "not contain LoD information.")); - - auto channel_num = context.Attr("channel_num"); - auto topks = context.Attr>("topks"); - auto k_num = topks.size(); - auto max_k = topks[topks.size() - 1]; - PADDLE_ENFORCE_GE(max_k, - 0, - platform::errors::InvalidArgument( - "Expected max_k >= 0, but received %d.", max_k)); - std::vector vec_pos_shape; - auto in_lod = in->lod()[0]; - - auto row_lod = row->lod()[0]; - auto col_lod = col->lod()[0]; - int batch_size = row_lod.size() - 1; - int pos_total_size = row_lod[batch_size] * channel_num * max_k; - vec_pos_shape.push_back(pos_total_size); - pos->Resize({phi::make_ddim(vec_pos_shape)}); - auto pos_data = pos->mutable_data(context.GetPlace()); - - int offset = 0; - phi::Vector vec_out_lod; - vec_out_lod.reserve(batch_size + 1); - for (int i = 0; i <= batch_size; ++i) { - offset = row_lod[i]; - vec_out_lod.push_back(offset); - } - - framework::LoD lod_temp; - lod_temp.push_back(vec_out_lod); - out->set_lod(lod_temp); - - auto din_data = in->data(); - auto dout_data = out->mutable_data(context.GetPlace()); - - T* sum_data = new T[max_k]; - for (int i = 0; i < batch_size; ++i) { - int total_size = in_lod[i + 1] - in_lod[i]; - int row_size = row_lod[i + 1] - row_lod[i]; - int col_size = col_lod[i + 1] - col_lod[i]; - PADDLE_ENFORCE_EQ(total_size, - channel_num * row_size * col_size, - platform::errors::PreconditionNotMet( - "Expected total_size == channel_num * row_size * " - "col_size, but got %d != %d.", - total_size, - channel_num * row_size * col_size)); - - int feature_num = row_size * col_size; - for (int j = 0; j < channel_num; ++j) { - auto input_offset_feature_data = din_data + in_lod[i] + j * feature_num; - - for (int r = 0; r < row_size; ++r) { - auto row_data = input_offset_feature_data + r * col_size; - - auto pos_slice_data = pos_data + row_lod[i] * channel_num * max_k + - r * channel_num * max_k + j * max_k; - auto out_slice_data = dout_data + row_lod[i] * channel_num * k_num + - r * channel_num * k_num + j * k_num; - - details::get_topk_pos(row_data, col_size, max_k, pos_slice_data); - if (pos_slice_data[0] == TopKPosPaddingId) { - sum_data[0] = 0.0; - } else { - sum_data[0] = row_data[pos_slice_data[0]]; - } - for (int k = 1; k < max_k; ++k) { - if (pos_slice_data[k] == TopKPosPaddingId) { - sum_data[k] = sum_data[k - 1]; - } else { - sum_data[k] = sum_data[k - 1] + row_data[pos_slice_data[k]]; - } - } - for (size_t k = 0; k < k_num; ++k) { - out_slice_data[k] = sum_data[topks[k] - 1] / topks[k]; - } - } - } - } - delete[] sum_data; - } -}; - -template -class SequenceTopkAvgPoolingGradKernel : public 
framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* d_in = context.Output(framework::GradVarName("X")); - auto* pos_input = context.Input("pos"); - auto* row_input = context.Input("ROW"); - auto* col_input = context.Input("COLUMN"); - auto* forward_input = context.Input("X"); - - int batch_size = row_input->lod()[0].size() - 1; - auto channel_num = context.Attr("channel_num"); - auto topks = context.Attr>("topks"); - auto k_num = topks.size(); - auto max_k = topks[k_num - 1]; - - auto out_lod = forward_input->lod(); - d_in->set_lod(out_lod); - - d_in->mutable_data(context.GetPlace()); - auto pos_data = pos_input->data(); - auto dout_data = d_out->data(); - - auto& dev_ctx = context.template device_context(); - phi::funcs::SetConstant zero; - zero(dev_ctx, d_in, static_cast(0.0)); - - auto din_data = d_in->data(); - - auto out_offset = out_lod[0]; - auto row_lod = row_input->lod()[0]; - auto col_lod = col_input->lod()[0]; - - for (int i = 0; i < batch_size; ++i) { - int row_size = row_lod[i + 1] - row_lod[i]; - int col_size = col_lod[i + 1] - col_lod[i]; - int feature_num = row_size * col_size; - - for (int j = 0; j < channel_num; ++j) { - auto in_offset_feature_data = - din_data + out_offset[i] + j * feature_num; - - for (int r = 0; r < row_size; r++) { - auto row_data = dout_data + row_lod[i] * channel_num * k_num + - r * channel_num * k_num + j * k_num; - auto pos_slice_data = pos_data + row_lod[i] * channel_num * max_k + - r * channel_num * max_k + j * max_k; - auto in_slice_data = in_offset_feature_data + r * col_size; - - for (size_t m = 0; m < k_num; ++m) { - for (int k = 0; k < topks[m]; ++k) { - if (pos_slice_data[k] == TopKPosPaddingId) { - break; - } else { - in_slice_data[pos_slice_data[k]] += row_data[m] / topks[m]; - } - } - } - } - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake index 3cfb53ea14b65..6669d2bfe2235 100644 --- a/paddle/fluid/operators/unity_build_rule.cmake +++ b/paddle/fluid/operators/unity_build_rule.cmake @@ -148,8 +148,7 @@ register_unity_group( lookup_table_dequant_op.cc lrn_op.cc mkldnn/lrn_mkldnn_op.cc - lstm_unit_op.cc - lstmp_op.cc) + lstm_unit_op.cc) register_unity_group( cc log_loss_op.cc @@ -439,7 +438,6 @@ register_unity_group( lookup_table_v2_op.cu margin_rank_loss_op.cu masked_select_op.cu - lstmp_op.cu shuffle_channel_op.cu softmax_cudnn_op.cu squared_l2_distance_op.cu) diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt index 6c5bb2801a6fc..c7dac2d5cb0c0 100644 --- a/test/legacy_test/CMakeLists.txt +++ b/test/legacy_test/CMakeLists.txt @@ -1072,7 +1072,6 @@ set_tests_properties(test_pool2d_op PROPERTIES TIMEOUT 120) set_tests_properties(test_transpose_op PROPERTIES TIMEOUT 120) set_tests_properties(test_activation_op PROPERTIES TIMEOUT 270) set_tests_properties(test_normal PROPERTIES TIMEOUT 120) -set_tests_properties(test_lstmp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_bilinear_interp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_decoupled_py_reader PROPERTIES TIMEOUT 120) set_tests_properties(test_fuse_bn_act_pass PROPERTIES TIMEOUT 120) diff --git a/test/legacy_test/test_lstmp_op.py b/test/legacy_test/test_lstmp_op.py deleted file mode 100644 index f1af219140935..0000000000000 --- a/test/legacy_test/test_lstmp_op.py +++ /dev/null @@ -1,379 +0,0 @@ -# 
Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -import test_lstm_op as LstmTest - -ACTIVATION = { - 'identity': LstmTest.identity, - 'sigmoid': LstmTest.sigmoid, - 'tanh': LstmTest.tanh, - 'relu': LstmTest.relu, -} - - -# LSTM with recurrent projection Layer -def lstmp( - input, # T x 4D - lod, # 1 x N - h0=None, # N x D - c0=None, # N x D - w_r=None, # P x 4D - w_rh=None, # D x P - w_b=None, # 1 x 4D - w_c=None, # 1 x 3D - is_reverse=False, - proj_clip=0.0, - cell_clip=0.0, - act_gate=None, - act_cell=None, - act_cand=None, - act_proj=None, -): - def _step( - x, - w_r, - w_rh, - w_c, - r_pre, - c_pre, - proj_clip, - cell_clip, - act_gate, - act_cell, - act_cand, - act_proj, - ): - g = np.dot(r_pre, w_r) # 1 x 4D - g = g + x - g = np.reshape(g, (1, g.size)) - c, g_i, g_f, g_o = np.split(g, 4, axis=1) - if w_c is None: - g_i = act_gate(g_i) # 1 x D - g_f = act_gate(g_f) # 1 x D - else: - w_ic, w_fc, _ = np.split(w_c, 3, axis=1) - g_i = act_gate(g_i + w_ic * c_pre) # 1 x D - g_f = act_gate(g_f + w_fc * c_pre) # 1 x D - c = g_f * c_pre + g_i * act_cand(c) # 1 x D - - def array_clip(a, clip): - size = np.prod(a.shape) - new_a = np.reshape(a, (size)) - for i in range(size): - new_a[i] = max(new_a[i], -1.0 * clip) - new_a[i] = min(new_a[i], clip) - new_a = np.reshape(new_a, a.shape) - return new_a - - if cell_clip > 0.0: - c = array_clip(c, cell_clip) - if w_c is None: - g_o = act_gate(g_o) # 1 x D - else: - _, _, w_oc = np.split(w_c, 3, axis=1) - g_o = act_gate(g_o + w_oc * c) # 1 x D - h = g_o * act_cell(c) - # projection - r = np.dot(h, w_rh) - r = act_proj(r) - if proj_clip > 0.0: - r = array_clip(r, proj_clip) - return r, c - - def _reverse(x, offset): - y = np.zeros_like(x) - for i in range(len(offset) - 1): - b, e = offset[i], offset[i + 1] - y[b:e, :] = np.flip(x[b:e, :], 0) - return y - - offset = [0] - for l in lod[0]: - offset.append(offset[-1] + l) - batch_size = len(lod[0]) - # recurrent projection state - projection = [] - cell = [] - input = _reverse(input, offset) if is_reverse else input - if w_b is not None: - input = input + np.tile(w_b, (offset[-1], 1)) - for i in range(batch_size): - # compute one sequence - seq_len = lod[0][i] - x = input[offset[i] : offset[i + 1], :] - r_pre = h0[i] - c_pre = c0[i] # 1 x D - for j in range(seq_len): - # compute one step - r_pre, c_pre = _step( - x[j], - w_r, - w_rh, - w_c, - r_pre, - c_pre, - proj_clip, - cell_clip, - act_gate, - act_cell, - act_cand, - act_proj, - ) - projection.append(r_pre.flatten()) - cell.append(c_pre.flatten()) - - projection = np.array(projection).astype('float64') - cell = np.array(cell).astype('float64') - - projection = _reverse(projection, offset) if is_reverse else projection - cell = _reverse(cell, offset) if is_reverse else cell - - assert projection.shape == (input.shape[0], w_r.shape[0]) # T x P - assert cell.shape == (input.shape[0], input.shape[1] / 4) # T x D - return projection, cell - - -class 
TestLstmpOp(LstmTest.TestLstmOp): - def reset_argument(self): - pass - - def setUp(self): - self.set_argument() - # projection size - self.P = 10 - self.act_proj = self.act_cell - - self.reset_argument() - self.op_type = 'lstmp' - - T = sum(self.lod[0]) - N = len(self.lod[0]) - x = np.random.normal(size=(T, 4 * self.D)).astype('float64') - if self.has_initial_state: - h0 = np.random.normal(size=(N, self.P)).astype('float64') - c0 = np.random.normal(size=(N, self.D)).astype('float64') - else: - h0 = np.zeros((N, self.P)).astype('float64') - c0 = np.zeros((N, self.D)).astype('float64') - w = np.random.normal(size=(self.P, 4 * self.D)).astype('float64') - if self.use_peepholes: - b = np.random.normal(size=(1, 7 * self.D)).astype('float64') - else: - b = np.random.normal(size=(1, 4 * self.D)).astype('float64') - - w_b = b[:, 0 : 4 * self.D] - w_c = b[:, 4 * self.D :] if self.use_peepholes else None - w_rh = np.random.normal(size=(self.D, self.P)).astype('float64') - proj_clip = 0.1 - cell_clip = 0.1 - r, c = lstmp( - x, - self.lod, - h0, - c0, - w, - w_rh, - w_b, - w_c, - self.is_reverse, - proj_clip, - cell_clip, - ACTIVATION[self.act_gate], - ACTIVATION[self.act_cell], - ACTIVATION[self.act_cand], - ACTIVATION[self.act_proj], - ) - - self.inputs = {'Input': (x, self.lod), 'Weight': w, 'ProjWeight': w_rh} - - self.inputs['Bias'] = b - - if self.has_initial_state: - self.inputs['H0'] = h0 - self.inputs['C0'] = c0 - - self.outputs = { - 'Projection': (r, self.lod), - 'Cell': (c, self.lod), - } - self.attrs = { - 'use_peepholes': self.use_peepholes, - 'is_reverse': self.is_reverse, - 'proj_clip': proj_clip, - 'cell_clip': cell_clip, - 'gate_activation': self.act_gate, - 'cell_activation': self.act_cell, - 'candidate_activation': self.act_cand, - 'proj_activation': self.act_proj, - } - - def test_check_output(self): - self.check_output(atol=1e-8, check_dygraph=False) - - def test_check_grad(self): - # TODO(qingqing) remove folowing lines after the check_grad is refined. - N = len(self.lod[0]) - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros((N, self.D)).astype( - 'float64' - ) - self.check_grad( - ['Input', 'Weight', 'ProjWeight', 'Bias'], - ['Projection'], - numeric_grad_delta=0.0000005, - check_dygraph=False, - ) - - -class TestLstmpOpHasInitial(TestLstmpOp): - def reset_argument(self): - self.has_initial_state = True - - def test_check_grad(self): - # TODO(qingqing) remove folowing lines after the check_grad is refined. 
- N = len(self.lod[0]) - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros((N, self.D)).astype( - 'float64' - ) - self.check_grad( - ['Input', 'Weight', 'ProjWeight', 'Bias', 'H0', 'C0'], - ['Projection'], - numeric_grad_delta=0.0000005, - check_dygraph=False, - ) - - def test_check_grad_ingore_bias(self): - N = len(self.lod[0]) - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros((N, self.D)).astype( - 'float64' - ) - self.check_grad( - ['Input', 'ProjWeight', 'Weight'], - ['Projection'], - numeric_grad_delta=0.0000005, - no_grad_set=set('Bias'), - check_dygraph=False, - ) - - def test_check_grad_ingore_weight(self): - N = len(self.lod[0]) - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros((N, self.D)).astype( - 'float64' - ) - self.check_grad( - ['Input', 'ProjWeight', 'Bias'], - ['Projection'], - numeric_grad_delta=0.0000005, - no_grad_set=set('Weight'), - check_dygraph=False, - ) - - def test_check_grad_ingore_proj_weight(self): - N = len(self.lod[0]) - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros((N, self.D)).astype( - 'float64' - ) - self.check_grad( - ['Input', 'Weight', 'Bias'], - ['Projection'], - numeric_grad_delta=0.0000005, - no_grad_set=set('ProjWeight'), - check_dygraph=False, - ) - - def test_check_grad_ingore_input(self): - N = len(self.lod[0]) - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros((N, self.D)).astype( - 'float64' - ) - self.check_grad( - ['Weight', 'ProjWeight', 'Bias'], - ['Projection'], - numeric_grad_delta=0.0000005, - no_grad_set=set('Input'), - check_dygraph=False, - ) - - def test_check_grad_ingore_h0(self): - N = len(self.lod[0]) - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros((N, self.D)).astype( - 'float64' - ) - self.check_grad( - ['Input', 'Weight', 'ProjWeight', 'Bias', 'C0'], - ['Projection'], - numeric_grad_delta=0.0000005, - no_grad_set=set('H0'), - check_dygraph=False, - ) - - def test_check_grad_ingore_c0(self): - N = len(self.lod[0]) - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros((N, self.D)).astype( - 'float64' - ) - self.check_grad( - ['Input', 'Weight', 'ProjWeight', 'Bias', 'H0'], - ['Projection'], - numeric_grad_delta=0.0000005, - no_grad_set=set('C0'), - check_dygraph=False, - ) - - -class TestLstmpOpRerverse(TestLstmpOp): - def reset_argument(self): - self.is_reverse = True - - -class TestLstmpOpNotUsePeepholes(TestLstmpOp): - def reset_argument(self): - self.use_peepholes = False - - -class TestLstmpOpLinearProjection(TestLstmpOp): - def reset_argument(self): - self.act_proj = 'identity' - - -class TestLstmpOpLen0Case1(TestLstmpOp): - def reset_argument(self): - 
self.lod = [[0, 4, 0]] - - -class TestLstmpOpLen0Case2(TestLstmpOp): - def reset_argument(self): - self.lod = [[2, 0, 3]] - - -if __name__ == '__main__': - unittest.main() diff --git a/test/sequence/test_sequence_topk_avg_pooling.py b/test/sequence/test_sequence_topk_avg_pooling.py deleted file mode 100644 index 470b3029ab9ed..0000000000000 --- a/test/sequence/test_sequence_topk_avg_pooling.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import unittest -from copy import deepcopy - -import numpy as np -from op_test import OpTest - - -class TestSequenceTopkAvgPoolingOp(OpTest): - def setUp(self): - self.init_op_type() - self.set_data() - self.compute() - - def init_op_type(self): - self.op_type = "sequence_topk_avg_pooling" - - def set_data(self): - topks = [1, 3, 5] - channel_num = 3 - dim = 10 - row = [30, 45] - col = [25, 36] - self.init_data(topks, channel_num, row, col, dim) - self.init_data(topks, channel_num, row, col, dim) - - def init_data(self, topks, channel_num, row, col, dim=10): - self.attrs = {"topks": topks, "channel_num": channel_num} - feature = [row[i] * col[i] for i in range(len(row))] - numel = sum(feature) * channel_num - x_data = np.arange(numel).astype('float32') - x_lod = [[x * channel_num for x in feature]] - row_data = np.random.random((sum(row), dim)).astype('float32') - col_data = np.random.random((sum(col), dim)).astype('float32') - self.inputs = { - 'X': (x_data, x_lod), - 'ROW': (row_data, [row]), - 'COLUMN': (col_data, [col]), - } - - def calc_gradient(self, pos_data, topks, channel_num, row, col): - max_k = topks[-1] - pos_data = pos_data.flatten() - in_numel = sum([row[i] * col[i] for i in range(len(row))]) * channel_num - out_numel = sum(row) * len(topks) * channel_num - gradient = np.zeros(shape=(in_numel), dtype="float32") - dout_val = 1.0 / out_numel - pos_offset, in_offset = 0, 0 - for bs_idx in range(len(row)): # batch - row_size = row[bs_idx] - col_size = col[bs_idx] - for ch in range(channel_num): # channel - for row_idx in range(row_size): # row - in_idx = in_offset + row_idx * col_size - pos_idx = pos_offset + row_idx * max_k - for k_idx in range(len(topks)): - for k in range(topks[k_idx]): - if pos_data[pos_idx + k] != -1: - gradient[in_idx + pos_data[pos_idx + k]] += ( - dout_val / topks[k_idx] - ) - in_offset += row_size * col_size - pos_offset += row_size * max_k - return gradient - - def compute(self): - topks = self.attrs['topks'] - max_k = topks[-1] - x_data, x_lod = self.inputs['X'] - row_data, row_lod = self.inputs['ROW'] - col_data, col_lod = self.inputs['COLUMN'] - channel_num = self.attrs['channel_num'] - out = np.zeros((0, len(topks) * channel_num), dtype=x_data.dtype) - pos = np.zeros((0,), dtype='int32') - out_lod = deepcopy(row_lod) - - offset = 0 - for idx in range(len(x_lod[0])): - x_len = x_lod[0][idx] - self.assertTrue( - x_len == channel_num * row_lod[0][idx] * col_lod[0][idx], - f"x_len: {x_len} can't mod channel_num: 
{channel_num}", - ) - out_tmp = np.zeros((0,), dtype=x_data.dtype) - pos_tmp = np.zeros((0,), dtype='int32') - for ch in range(channel_num): - for r_id in range(row_lod[0][idx]): - x_sub = x_data[offset : (offset + col_lod[0][idx])] - topk_val, topk_pos = self.get_topk(x_sub, max_k) - sum_data = self.topk_sum(topk_val, topk_pos, max_k) - new_feature = np.array( - [sum_data[topk] / topk for topk in topks] - ) - out_tmp = np.hstack((out_tmp, new_feature)) - pos_tmp = np.hstack((pos_tmp, topk_pos)) - - offset += col_lod[0][idx] - - out_tmp = out_tmp.reshape([channel_num, -1, len(topks)]).transpose( - 1, 0, 2 - ) - pos_tmp = pos_tmp.reshape([channel_num, -1, max_k]).transpose( - 1, 0, 2 - ) - out = np.vstack( - (out, out_tmp.reshape([-1, len(topks) * channel_num])) - ) - pos = np.hstack((pos, pos_tmp.flatten())) - - self.outputs = {'Out': (out.astype('float32'), out_lod), 'pos': pos} - self.gradient = self.calc_gradient( - pos, topks, channel_num, row_lod[0], col_lod[0] - ) - - def get_topk(self, x, topk): - real_topk = topk if topk < len(x) else len(x) - topk_pos = np.array(x).argsort()[-topk:][::-1] - topk_val = np.array(x)[topk_pos] - if real_topk < topk: - topk_pos = np.hstack((topk_pos, np.full((topk - real_topk,), -1))) - topk_val = np.hstack((topk_val, np.full((topk - real_topk,), 0.0))) - - return topk_val, topk_pos - - def topk_sum(self, x, pos, max_k): - sum_data = [0.0] * (max_k + 1) - for i in range(1, max_k + 1): - if pos[i - 1] == -1: - sum_data[i] = sum_data[i - 1] - else: - sum_data[i] = sum_data[i - 1] + x[i - 1] - return sum_data - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out', user_defined_grads=[self.gradient]) - - -class TestSequenceTopkAvgPoolingOpCase1(TestSequenceTopkAvgPoolingOp): - def set_data(self): - topks = [2, 3] - channel_num = 5 - dim = 10 - row = [36] - col = [48] - self.init_data(topks, channel_num, row, col, dim) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/white_list/check_op_sequence_batch_1_input_white_list.py b/test/white_list/check_op_sequence_batch_1_input_white_list.py index f98fcbd586e65..2506c557e6e63 100644 --- a/test/white_list/check_op_sequence_batch_1_input_white_list.py +++ b/test/white_list/check_op_sequence_batch_1_input_white_list.py @@ -30,7 +30,6 @@ 'sequence_scatter', 'sequence_slice', 'sequence_softmax', - 'sequence_topk_avg_pooling', 'sequence_unpad', ] diff --git a/test/white_list/check_op_sequence_instance_0_input_white_list.py b/test/white_list/check_op_sequence_instance_0_input_white_list.py index 5b222c56e8dde..b4f9d16317e16 100644 --- a/test/white_list/check_op_sequence_instance_0_input_white_list.py +++ b/test/white_list/check_op_sequence_instance_0_input_white_list.py @@ -36,7 +36,6 @@ 'sequence_scatter', 'sequence_slice', 'sequence_softmax', - 'sequence_topk_avg_pooling', 'sequence_unpad', ] diff --git a/test/white_list/check_shape_white_list.py b/test/white_list/check_shape_white_list.py index db5a710867277..0994b18973059 100644 --- a/test/white_list/check_shape_white_list.py +++ b/test/white_list/check_shape_white_list.py @@ -18,7 +18,6 @@ 'conv2d_transpose', 'depthwise_conv2d_transpose', 'grid_sampler', - 'lstmp', 'margin_rank_loss', 'matmul', 'scatter', diff --git a/test/white_list/no_grad_set_white_list.py b/test/white_list/no_grad_set_white_list.py index 525cce49df3dc..ade5ea12f6654 100644 --- a/test/white_list/no_grad_set_white_list.py +++ b/test/white_list/no_grad_set_white_list.py @@ -58,7 +58,6 @@ 'lookup_table', 
'lookup_table_v2', 'lstm', - 'lstmp', 'margin_rank_loss', 'matmul', 'matmul_v2', diff --git a/test/white_list/op_accuracy_white_list.py b/test/white_list/op_accuracy_white_list.py index 8565f6c7892e5..063b0dcffcc0c 100644 --- a/test/white_list/op_accuracy_white_list.py +++ b/test/white_list/op_accuracy_white_list.py @@ -61,7 +61,6 @@ 'sequence_pool', 'sequence_reverse', 'sequence_slice', - 'sequence_topk_avg_pooling', 'shuffle_channel', 'sigmoid', 'smooth_l1_loss', diff --git a/test/white_list/op_threshold_white_list.py b/test/white_list/op_threshold_white_list.py index fa151bfb07257..c5eb0862cb717 100644 --- a/test/white_list/op_threshold_white_list.py +++ b/test/white_list/op_threshold_white_list.py @@ -27,7 +27,6 @@ 'gru_unit', 'kldiv_loss', 'lstm', - 'lstmp', 'max_pool2d_with_index', 'max_pool3d_with_index', 'norm', From 076b5e923a5ce8cccd546020f39baa0b5bf66bd0 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Mon, 9 Oct 2023 10:51:08 +0800 Subject: [PATCH 15/62] fix unittest bus (#57855) --- test/legacy_test/test_assign_value_op.py | 2 +- test/legacy_test/test_stack_op.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/test/legacy_test/test_assign_value_op.py b/test/legacy_test/test_assign_value_op.py index b0963b51b2485..e682835162870 100644 --- a/test/legacy_test/test_assign_value_op.py +++ b/test/legacy_test/test_assign_value_op.py @@ -54,7 +54,7 @@ def init_data(self): self.attrs["fp32_values"] = [float(v) for v in self.value.flat] def test_forward(self): - self.check_output(check_cinn=True, check_new_ir=True) + self.check_output(check_cinn=True, check_new_ir=False) class TestAssignValueOp2(TestAssignValueOp): diff --git a/test/legacy_test/test_stack_op.py b/test/legacy_test/test_stack_op.py index 44abff4dafeb5..0b0d73e9fafc1 100644 --- a/test/legacy_test/test_stack_op.py +++ b/test/legacy_test/test_stack_op.py @@ -63,11 +63,11 @@ def setUp(self): self.attrs = {'axis': self.axis} def test_check_output(self): - self.check_output(check_prim=True, check_new_ir=True) + self.check_output(check_prim=True, check_new_ir=False) def test_check_grad(self): self.check_grad( - self.get_x_names(), 'Y', check_prim=True, check_new_ir=True + self.get_x_names(), 'Y', check_prim=True, check_new_ir=False ) @@ -189,11 +189,11 @@ def setUp(self): self.attrs = {'axis': self.axis} def test_check_output(self): - self.check_output(check_prim=True, check_new_ir=True) + self.check_output(check_prim=True, check_new_ir=False) def test_check_grad(self): self.check_grad( - self.get_x_names(), 'Y', check_prim=True, check_new_ir=True + self.get_x_names(), 'Y', check_prim=True, check_new_ir=False ) From 5454994f84051c4697640ba1336a7e4537115eb8 Mon Sep 17 00:00:00 2001 From: tianhaodongbd <137985359+tianhaodongbd@users.noreply.github.com> Date: Mon, 9 Oct 2023 11:02:34 +0800 Subject: [PATCH 16/62] compilation optimization for kron_grad_kernel (#57822) --- paddle/phi/kernels/impl/kron_grad_kernel_impl.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/phi/kernels/impl/kron_grad_kernel_impl.h b/paddle/phi/kernels/impl/kron_grad_kernel_impl.h index 4829ae0a9f0c9..352e4d3006719 100644 --- a/paddle/phi/kernels/impl/kron_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/kron_grad_kernel_impl.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/phi/kernels/impl/kron_kernel_impl.h" +#include "paddle/phi/kernels/reduce_sum_kernel.h" namespace phi { @@ -234,12 +235,12 @@ struct KronGradOpFunctor { #if defined(__NVCC__) || defined(__HIPCC__) auto stream = dev_ctx.stream(); 
  // it is a cuda device_context
  if (dx) {
-    funcs::ReduceKernel>(
-        dev_ctx, dout_x, dx, kps::IdentityFunctor(), {1});
+    phi::SumKernel(
+        dev_ctx, dout_x, {1}, dout_x.dtype(), false, dx);
  }
  if (dy) {
-    funcs::ReduceKernel>(
-        dev_ctx, dout_y, dy, kps::IdentityFunctor(), {1});
+    phi::SumKernel(
+        dev_ctx, dout_y, {1}, dout_y.dtype(), false, dy);
  }
#else
  auto *place = dev_ctx.eigen_device();

From 8a42a34a0b1c2b83ace7f4066b51e7d7c16eb5ce Mon Sep 17 00:00:00 2001
From: BiynXu <62832681+BiynXu@users.noreply.github.com>
Date: Mon, 9 Oct 2023 11:03:47 +0800
Subject: [PATCH 18/62] [CINN] Add FactorizeReduction schedule primitive (#57777)

Add the FactorizeReduction schedule primitive. The differences between the
FactorizeReduction primitive and the original RFactor primitive are:

1. FactorizeReduction supports complex iter_value subscripts, which means it
   can be used after primitives such as Fuse and Split; RFactor does not
   support this.
2. FactorizeReduction does not change the order of the original loops, while
   RFactor may perform an implicit Reorder.
3. FactorizeReduction supports transforming one reduce block inside a complex
   AST, while RFactor only supports the case where the AST consists entirely
   of one reduce block.
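Before the diff itself, the semantics of the primitive can be illustrated with a minimal numpy sketch. This is an illustration of the reduction-factorization semantics only, not the CINN API; the shapes and the names A, B, B_rf simply mirror the tensors in the first test below (rf loop j, rf_axis = 0):

import numpy as np

A = np.random.rand(3, 4, 5)  # axes (i, j, k), as in the first test below

# Original reduction block: B[i] = sum_{j,k} A[i, j, k]
B = A.sum(axis=(1, 2))

# Step 1, the reduction-factorized (rf) block: the rf loop j becomes a
# serial axis of a new rf tensor; with rf_axis == 0 it is placed first:
#   B_rf[j, i] = sum_k A[i, j, k]
B_rf = A.sum(axis=2).T  # shape (4, 3)

# Step 2, the write-back block: reduce the factorized axis back into B:
#   B[i] = sum_j B_rf[j, i]
assert np.allclose(B_rf.sum(axis=0), B)

In the expected IR dumps of the tests below, step 1 corresponds to the B_rf block (loop j becomes the serial block iter vj0) and step 2 to the write-back B block that sums over vj0; exposing j as a non-accumulating axis is what makes the bulk of the reduction parallelizable, leaving only the short write-back reduction serial.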
--- paddle/cinn/backends/ir_schedule_test.cc | 264 ++++++++++++ paddle/cinn/ir/schedule/factorize_reduction.h | 408 ++++++++++++++++++ paddle/cinn/ir/schedule/ir_schedule.cc | 84 ++++ paddle/cinn/ir/schedule/ir_schedule.h | 40 ++ paddle/cinn/ir/schedule/ir_schedule_util.cc | 8 + paddle/cinn/ir/schedule/ir_schedule_util.h | 11 +- paddle/cinn/ir/schedule/schedule_desc.cc | 6 + 7 files changed, 820 insertions(+), 1 deletion(-) create mode 100644 paddle/cinn/ir/schedule/factorize_reduction.h diff --git a/paddle/cinn/backends/ir_schedule_test.cc b/paddle/cinn/backends/ir_schedule_test.cc index 5ea30c6951d24..2923c8dc9fe7a 100644 --- a/paddle/cinn/backends/ir_schedule_test.cc +++ b/paddle/cinn/backends/ir_schedule_test.cc @@ -2310,6 +2310,270 @@ void test_rfactor(void* _args, int32_t num_args) ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code)); } +TEST(IrSchedule, factorize_reduction) { + Context::Global().ResetNameId(); + Expr M(3); + Expr N(4); + Expr K(5); + + Target target = common::DefaultHostTarget(); + + Placeholder A("A", {M, N, K}); + Var j(4, "j0"); + Var k(5, "k0"); + auto B = Compute( + {M}, + [&](Var i) { + return lang::ReduceSum(A(i, j, k), {j, k}); + }, + "B"); + + auto stages = CreateStages({A, B}); + auto func = cinn::lang::LowerVec("test_factorize_reduction", + stages, + {A, B}, + {}, + {}, + nullptr, + target, + true); + CHECK(!func.empty()); + auto ast_expr = func[0]->body; + std::vector vec_ast{ast_expr}; + ir::ModuleExpr mod_expr(vec_ast); + ir::IRSchedule ir_sch(mod_expr); + auto loops = ir_sch.GetLoops("B"); + CHECK_EQ(loops.size(), 3U); + auto new_rf_tensor = ir_sch.FactorizeReduction(loops[1], 0); + auto* new_rf_tensor_ref = new_rf_tensor.As(); + CHECK(new_rf_tensor_ref); + CHECK(new_rf_tensor_ref->buffer.defined()); + func[0]->temp_bufs.push_back(new_rf_tensor_ref->buffer); + func[0]->PrepareBufferCastExprs(); + std::string origin = utils::GetStreamCnt(func[0]); + LOG(INFO) << origin; + EXPECT_EQ(origin, utils::Trim(R"ROC( +function test_factorize_reduction (_A, _B) +{ + ScheduleBlock(root) + { + { + serial for (i, 0, 3) + { + serial for (j0, 0, 4) + { + ScheduleBlock(B_rf__reduce_init) + { + vj0, i0_0 = axis.bind(j0, i) + B_rf__reduce_init[vj0, i0_0] = 0.00000000f + } + serial for (k0, 0, 5) + { + ScheduleBlock(B_rf) + { + vj0, i0_0, i2 = axis.bind(j0, i, k0) + B_rf[vj0, i0_0] = (B_rf[vj0, i0_0] + A[i0_0, vj0, i2]) + } + } + } + } + serial for (i, 0, 3) + { + ScheduleBlock(B__reduce_init) + { + i0_0 = axis.bind(i) + B__reduce_init[i0_0] = 0.00000000f + } + serial for (j0, 0, 4) + { + ScheduleBlock(B) + { + vj0, i0_0 = axis.bind(j0, i) + B[i0_0] = (B[i0_0] + B_rf[vj0, i0_0]) + } + } + } + } + } +} +)ROC")); +} + +TEST(IrSchedule, factorize_reduction1) { + Context::Global().ResetNameId(); + Expr M(3); + Expr N(4); + Expr K(5); + + Target target = common::DefaultHostTarget(); + + Placeholder A("A", {M, N, K}); + Var j(4, "j0"); + Var k(5, "k0"); + auto B = Compute( + {M}, + [&](Var i) { + return lang::ReduceSum(A(i, j, k), {j, k}); + }, + "B"); + + auto stages = CreateStages({A, B}); + auto func = cinn::lang::LowerVec("test_factorize_reduction", + stages, + {A, B}, + {}, + {}, + nullptr, + target, + true); + CHECK(!func.empty()); + auto ast_expr = func[0]->body; + std::vector vec_ast{ast_expr}; + ir::ModuleExpr mod_expr(vec_ast); + ir::IRSchedule ir_sch(mod_expr); + auto loops = ir_sch.GetLoops("B"); + CHECK_EQ(loops.size(), 3U); + auto new_rf_tensor = ir_sch.FactorizeReduction(loops[1], 1); + auto* new_rf_tensor_ref = new_rf_tensor.As(); + 
CHECK(new_rf_tensor_ref); + CHECK(new_rf_tensor_ref->buffer.defined()); + func[0]->temp_bufs.push_back(new_rf_tensor_ref->buffer); + func[0]->PrepareBufferCastExprs(); + std::string origin = utils::GetStreamCnt(func[0]); + LOG(INFO) << origin; + EXPECT_EQ(origin, utils::Trim(R"ROC( +function test_factorize_reduction (_A, _B) +{ + ScheduleBlock(root) + { + { + serial for (i, 0, 3) + { + serial for (j0, 0, 4) + { + ScheduleBlock(B_rf__reduce_init) + { + vj0, i0_0 = axis.bind(j0, i) + B_rf__reduce_init[i0_0, vj0] = 0.00000000f + } + serial for (k0, 0, 5) + { + ScheduleBlock(B_rf) + { + vj0, i0_0, i2 = axis.bind(j0, i, k0) + B_rf[i0_0, vj0] = (B_rf[i0_0, vj0] + A[i0_0, vj0, i2]) + } + } + } + } + serial for (i, 0, 3) + { + ScheduleBlock(B__reduce_init) + { + i0_0 = axis.bind(i) + B__reduce_init[i0_0] = 0.00000000f + } + serial for (j0, 0, 4) + { + ScheduleBlock(B) + { + vj0, i0_0 = axis.bind(j0, i) + B[i0_0] = (B[i0_0] + B_rf[i0_0, vj0]) + } + } + } + } + } +} +)ROC")); +} + +TEST(IrSchedule, factorize_reduction2) { + Context::Global().ResetNameId(); + Expr M(3); + Expr N(4); + Expr K(5); + + Target target = common::DefaultHostTarget(); + + Placeholder A("A", {M, N * K}); + Var j(4 * 5, "j0"); + auto B = Compute( + {M}, [&](Var i) { return lang::ReduceSum(A(i, j), {j}); }, "B"); + + auto stages = CreateStages({A, B}); + auto func = cinn::lang::LowerVec("test_factorize_reduction", + stages, + {A, B}, + {}, + {}, + nullptr, + target, + true); + CHECK(!func.empty()); + auto ast_expr = func[0]->body; + std::vector vec_ast{ast_expr}; + ir::ModuleExpr mod_expr(vec_ast); + ir::IRSchedule ir_sch(mod_expr); + auto loops = ir_sch.GetLoops("B"); + CHECK_EQ(loops.size(), 2U); + auto splited_loops = ir_sch.Split(loops[1], {4, 5}); + CHECK_EQ(splited_loops.size(), 2U); + auto new_rf_tensor = ir_sch.FactorizeReduction(splited_loops[0], 1); + auto* new_rf_tensor_ref = new_rf_tensor.As(); + CHECK(new_rf_tensor_ref); + CHECK(new_rf_tensor_ref->buffer.defined()); + func[0]->temp_bufs.push_back(new_rf_tensor_ref->buffer); + func[0]->PrepareBufferCastExprs(); + std::string origin = utils::GetStreamCnt(func[0]); + LOG(INFO) << origin; + EXPECT_EQ(origin, utils::Trim(R"ROC( +function test_factorize_reduction (_A, _B) +{ + ScheduleBlock(root) + { + { + serial for (i, 0, 3) + { + serial for (j0, 0, 4) + { + ScheduleBlock(B_rf__reduce_init) + { + vj0, i0_0 = axis.bind(j0, i) + B_rf__reduce_init[i0_0, vj0] = 0.00000000f + } + serial for (j0_0, 0, 5) + { + ScheduleBlock(B_rf) + { + vj0, i0_0, vj0_0 = axis.bind(j0, i, j0_0) + B_rf[i0_0, vj0] = (B_rf[i0_0, vj0] + A[i0_0, ((5 * vj0) + vj0_0)]) + } + } + } + } + serial for (i, 0, 3) + { + ScheduleBlock(B__reduce_init) + { + i0_0 = axis.bind(i) + B__reduce_init[i0_0] = 0.00000000f + } + serial for (j0, 0, 4) + { + ScheduleBlock(B) + { + vj0, i0_0 = axis.bind(j0, i) + B[i0_0] = (B[i0_0] + B_rf[i0_0, vj0]) + } + } + } + } + } +} +)ROC")); +} + TEST(IrSchedule, compute_inline1) { Context::Global().ResetNameId(); Expr M(32); diff --git a/paddle/cinn/ir/schedule/factorize_reduction.h b/paddle/cinn/ir/schedule/factorize_reduction.h new file mode 100644 index 0000000000000..0973d123fd40c --- /dev/null +++ b/paddle/cinn/ir/schedule/factorize_reduction.h @@ -0,0 +1,408 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Used in FactorizeReduction + +#pragma once +#include "paddle/cinn/ir/ir.h" +#include "paddle/cinn/ir/ir_base.h" +#include "paddle/cinn/ir/ir_printer.h" +#include "paddle/cinn/ir/schedule/ir_schedule_util.h" +#include "paddle/cinn/ir/tensor.h" +#include "paddle/cinn/ir/utils/ir_copy.h" +#include "paddle/cinn/lang/compute.h" +#include "paddle/cinn/optim/replace_var_with_expr.h" +#include "paddle/cinn/utils/error.h" + +namespace cinn { +namespace ir { + +// Create the new Reduction-Factorized tensor, +// only used for FactorizeReduction schedule primitive. +Tensor CreateRFTensor(const Tensor& original_tensor, + const Expr& rf_loop, + int rf_axis) { + std::string name = original_tensor->name + "_rf"; + std::vector new_shape = original_tensor->shape; + new_shape.insert(new_shape.begin() + rf_axis, rf_loop.As()->extent); + Tensor rf_tensor = _Tensor_::Make(name, + original_tensor->type(), + new_shape, + new_shape, + original_tensor->operation, + original_tensor->reduce_axis); + rf_tensor->WithBuffer("global", name, original_tensor->type()); + return rf_tensor; +} + +// Base class to create a new reduce block, +// only used for FactorizeReduction schedule primitive. +class ReduceBlockCreater { + public: + ReduceBlockCreater(const Expr& original_block, + const std::vector& original_loops, + const Expr& rf_loop, + const Expr& original_update_stmt, + const ir::Tensor& rf_tensor, + bool is_rf_block) + : original_block_(original_block), + original_loops_(original_loops), + rf_loop_(rf_loop), + original_update_stmt_(original_update_stmt), + rf_tensor_(rf_tensor), + is_rf_block_(is_rf_block) { + const ScheduleBlockRealize* block_real = + original_block_.As(); + CHECK_NOTNULL(block_real); + num_block_iters_ = block_real->iter_values.size(); + } + + void CreateBlock() { + CreateRFIter(); + for (int i = 0; i < num_block_iters_; ++i) { + CreateNormalIter(i); + } + CreateUpdateStmt(); + + std::string new_update_block_name = + original_block_.As() + ->schedule_block.As() + ->name; + if (is_rf_block_) { + new_update_block_name += "_rf"; + } + std::string new_init_block_name = + ir::GenReduceInitTensorNameOf(new_update_block_name); + VLOG(5) << "new_init_block_name = " << new_init_block_name; + + Expr init_value = rf_tensor_->GetReduceInitVal(); + const std::vector& domain = rf_tensor_->domain_without_reduce_axis(); + ir::Tensor init_tensor = lang::Compute( + domain, + [=](const std::vector& axis) { return init_value; }, + new_init_block_name); + init_tensor->Bind(rf_tensor_->buffer); + Expr init_stmt = ir::Store::Make( + init_tensor, init_value, new_update_stmt_.As()->indices); + new_init_sch_block_ = ScheduleBlock::Make( + new_init_iter_vars_, {}, {}, new_init_block_name, init_stmt); + new_init_block_realize_ = + ScheduleBlockRealize::Make(new_init_iter_values_, new_init_sch_block_); + + new_update_sch_block_ = ScheduleBlock::Make( + new_iter_vars_, {}, {}, new_update_block_name, new_update_stmt_); + new_update_block_realize_ = + ScheduleBlockRealize::Make(new_iter_values_, new_update_sch_block_); + VLOG(4) << "new_update_block_realize:\n" << new_update_block_realize_; + } + + Expr 
CreateLoops() { + int num_loops = original_loops_.size(); + std::vector new_loops(num_loops); + Expr body = new_update_block_realize_; + bool has_add_init_block = false; + for (int i = num_loops - 1; i >= 0; --i) { + bool is_spatial_loop = + new_spatial_loop_var_names_.count( + original_loops_[i].As()->loop_var->name) > 0; + bool is_rf_loop = rf_loop_.As()->loop_var->name == + original_loops_[i].As()->loop_var->name; + // Skip non rf reduction loops of write back block. + if (!is_rf_block_ && !is_spatial_loop && !is_rf_loop) { + continue; + } + // Add reduce init block. + if (!has_add_init_block && is_spatial_loop) { + body = Block::Make({new_init_block_realize_, body}); + has_add_init_block = true; + } + // Add loops + Var loop_var = ir_utils::IRCopy(original_loops_[i].As()->loop_var); + Expr min = ir_utils::IRCopy(original_loops_[i].As()->min); + Expr extent = ir_utils::IRCopy(original_loops_[i].As()->extent); + body = For::Make(loop_var, + min, + extent, + original_loops_[i].As()->for_type(), + original_loops_[i].As()->device_api, + body, + original_loops_[i].As()->vectorize_info(), + original_loops_[i].As()->bind_info()); + VLOG(5) << "new body:\n" << body; + } + VLOG(4) << "new loop nest:\n" << body; + return body; + } + + private: + virtual void CreateRFIter() = 0; + virtual void CreateNormalIter(int idx) = 0; + virtual void CreateUpdateStmt() = 0; + + public: + Var rf_var_; + std::vector rf_tensor_access_indices_; + + protected: + const Expr& original_block_; + const std::vector& original_loops_; + const Expr& rf_loop_; + const Expr& original_update_stmt_; + const ir::Tensor& rf_tensor_; + std::map original_indice2new_expr_; + int num_block_iters_; + bool is_rf_block_; + + std::vector new_iter_vars_; + std::vector new_iter_values_; + std::vector new_init_iter_vars_; + std::vector new_init_iter_values_; + std::unordered_set new_spatial_loop_var_names_; + Expr new_update_stmt_; + + Expr new_update_sch_block_; + Expr new_update_block_realize_; + Expr new_init_sch_block_; + Expr new_init_block_realize_; +}; + +// Implement class for building Reduction-Factorized block, +// only used for FactorizeReduction schedule primitive. 
+class RFBlockCreater : public ReduceBlockCreater { + public: + RFBlockCreater(const Expr& original_block, + const std::vector& original_loops, + const Expr& rf_loop, + const Expr& original_update_stmt, + const ir::Tensor& rf_tensor, + const std::map& var2loops, + int rf_axis) + : ReduceBlockCreater(original_block, + original_loops, + rf_loop, + original_update_stmt, + rf_tensor, + true), + var2loops_(var2loops), + rf_axis_(rf_axis) {} + + private: + void CreateRFIter() override { + std::string loop_var_name = rf_loop_.As()->loop_var->name; + std::string rf_var_name = "v" + loop_var_name; + rf_var_ = Var(rf_loop_.As()->min, + rf_loop_.As()->extent, + rf_var_name, + /* is_reduce = */ false); + loop_var2block_iters_[rf_loop_.As()->loop_var] = rf_var_; + new_iter_vars_.push_back(rf_var_); + new_iter_values_.push_back(rf_loop_.As()->loop_var); + new_init_iter_vars_.push_back(rf_var_); + new_init_iter_values_.push_back(rf_loop_.As()->loop_var); + new_spatial_loop_var_names_.insert(rf_loop_.As()->loop_var->name); + VLOG(4) << "create new_rf_var = " << rf_var_ + << ", with iter value = " << new_iter_values_.back(); + } + + void CreateNormalIter(int idx) override { + Var original_iter_var = original_block_.As() + ->schedule_block.As() + ->iter_vars[idx]; + Expr original_iter_value = + original_block_.As()->iter_values[idx]; + // The original iter is either a spatial iter, or a reduction iter that + // doesn't touch the rf loop. In this case reuse the old iter var and its + // corresponding iter value. + if (!original_iter_var->is_reduce_axis) { + new_iter_vars_.push_back(original_iter_var); + new_iter_values_.push_back(original_iter_value); + new_init_iter_vars_.push_back(original_iter_var); + new_init_iter_values_.push_back(original_iter_value); + ir_utils::CollectIRNodesWithoutTensor( + original_iter_value, [&](const Expr* x) { + if (x->as_var()) { + new_spatial_loop_var_names_.insert(x->as_var()->name); + } + return false; + }); + return; + } else if (!ContainVar({original_iter_value}, + rf_loop_.As()->loop_var->name)) { + new_iter_vars_.push_back(original_iter_var); + new_iter_values_.push_back(original_iter_value); + return; + } + CHECK(original_iter_var->is_reduce_axis); + + // This iter is a reduction iter and touches the rfactor loop. So we try to + // create a new iter for each loop var that appear in the original iter + // value. + std::vector vars_in_original_iter_values; + ir_utils::CollectIRNodesWithoutTensor( + original_iter_value, [&](const Expr* x) { + if (x->as_var()) { + vars_in_original_iter_values.push_back(x->as_var_ref()); + } + return false; + }); + for (const Var& loop_var : vars_in_original_iter_values) { + if (var2loops_.count(loop_var) == 0) { + continue; + } + Expr loop = var2loops_.at(loop_var); + if (loop_var2block_iters_.count(loop_var) == 0) { + Var new_iter_var(loop.As()->min, + loop.As()->extent, + "v" + loop_var->name, + /* is_reduce = */ true); + new_iter_vars_.push_back(new_iter_var); + new_iter_values_.emplace_back(loop_var); + loop_var2block_iters_[loop_var] = new_iter_var; + } + } + // Substitute the original iter values with new iter vars, + // and store the new iter values in original_indice2new_expr_, + // it will be used in Load/Store indices. 
+    Expr new_iters = ir_utils::IRCopy(original_iter_value);
+    ReplaceExpr(&new_iters, loop_var2block_iters_);
+    original_indice2new_expr_[original_iter_var] = new_iters;
+    VLOG(4) << "original_indice2new_expr_[" << original_iter_var
+            << "] = " << new_iters;
+  }
+
+  void CreateUpdateStmt() override {
+    rf_tensor_access_indices_ = original_update_stmt_.As<Store>()->indices;
+    rf_tensor_access_indices_.insert(
+        rf_tensor_access_indices_.begin() + rf_axis_, rf_var_);
+    Expr original_store_body = original_update_stmt_.As<Store>()->value;
+    Expr new_store_body = ir_utils::IRCopy(original_store_body);
+#define REPLACE_RF_TENSOR(Op)                                      \
+  if (new_store_body.As<Op>()) {                                   \
+    auto* node = new_store_body.As<Op>();                          \
+    CHECK(node);                                                   \
+    auto& operand = node->a();                                     \
+    operand = Load::Make(rf_tensor_, rf_tensor_access_indices_);   \
+  }
+
+    REPLACE_RF_TENSOR(Add)
+    REPLACE_RF_TENSOR(Mul)
+    REPLACE_RF_TENSOR(Max)
+    REPLACE_RF_TENSOR(Min)
+#undef REPLACE_RF_TENSOR
+
+    new_update_stmt_ =
+        ir::Store::Make(rf_tensor_, new_store_body, rf_tensor_access_indices_);
+    ReplaceExpr(&new_update_stmt_, original_indice2new_expr_);
+    VLOG(4) << "new_update_stmt of rf block: \n" << new_update_stmt_;
+  }
+
+ private:
+  const std::map<Var, Expr, CompVar>& var2loops_;
+  int rf_axis_;
+
+  std::map<Var, Expr, CompVar> loop_var2block_iters_;
+};
+
+// Implement class for building Writing-Back block,
+// only used for FactorizeReduction schedule primitive.
+class RBBlockCreater : public ReduceBlockCreater {
+ public:
+  RBBlockCreater(const Expr& original_block,
+                 const std::vector<Expr>& original_loops,
+                 const Expr& rf_loop,
+                 const Expr& original_update_stmt,
+                 const ir::Tensor& rf_tensor,
+                 const std::vector<Expr>& rf_tensor_access_indices,
+                 const Var& rf_block_rf_iter_var)
+      : ReduceBlockCreater(original_block,
+                           original_loops,
+                           rf_loop,
+                           original_update_stmt,
+                           rf_tensor,
+                           false),
+        rf_tensor_access_indices_(rf_tensor_access_indices),
+        rf_block_rf_iter_var_(rf_block_rf_iter_var) {}
+
+ private:
+  void CreateRFIter() override {
+    std::string loop_var_name = rf_loop_.As<For>()->loop_var->name;
+    std::string rf_var_name = "v" + loop_var_name;
+    rf_var_ = Var(rf_loop_.As<For>()->min,
+                  rf_loop_.As<For>()->extent,
+                  rf_var_name,
+                  /* is_reduce = */ true);
+    new_iter_vars_.push_back(rf_var_);
+    new_iter_values_.push_back(rf_loop_.As<For>()->loop_var);
+    original_indice2new_expr_[rf_block_rf_iter_var_] = Expr(rf_var_);
+    VLOG(4) << "create new_rf_var = " << rf_var_
+            << ", with iter value = " << new_iter_values_.back();
+  }
+
+  void CreateNormalIter(int idx) override {
+    Var original_iter_var = original_block_.As<ScheduleBlockRealize>()
+                                ->schedule_block.As<ScheduleBlock>()
+                                ->iter_vars[idx];
+    Expr original_iter_value =
+        original_block_.As<ScheduleBlockRealize>()->iter_values[idx];
+    if (!original_iter_var->is_reduce_axis) {
+      new_iter_vars_.push_back(original_iter_var);
+      new_iter_values_.push_back(original_iter_value);
+      new_init_iter_vars_.push_back(original_iter_var);
+      new_init_iter_values_.push_back(original_iter_value);
+      ir_utils::CollectIRNodesWithoutTensor(
+          original_iter_value, [&](const Expr* x) {
+            if (x->as_var()) {
+              new_spatial_loop_var_names_.insert(x->as_var()->name);
+            }
+            return false;
+          });
+      // original_indice2new_expr_[original_iter_var] = new_iter_vars_.back();
+      VLOG(4) << "create new iter var = " << new_iter_vars_.back()
+              << ", with iter value = " << new_iter_values_.back();
+    }
+  }
+
+  void CreateUpdateStmt() override {
+    Expr original_store_body = original_update_stmt_.As<Store>()->value;
+    Expr new_store_body = ir_utils::IRCopy(original_store_body);
+#define REPLACE_RF_TENSOR(Op)                                      \
+  if (new_store_body.As<Op>()) {                                   \
+    auto* node = new_store_body.As<Op>();                          \
+    CHECK(node);                                                   \
+    auto& operand = node->b();                                     \
+    operand = Load::Make(rf_tensor_, rf_tensor_access_indices_);   \
+  }
+
+    REPLACE_RF_TENSOR(Add)
+    REPLACE_RF_TENSOR(Mul)
+    REPLACE_RF_TENSOR(Max)
+    REPLACE_RF_TENSOR(Min)
+#undef REPLACE_RF_TENSOR
+
+    Expr original_store_tensor = original_update_stmt_.As<Store>()->tensor;
+    std::vector<Expr> original_store_indices =
+        original_update_stmt_.As<Store>()->indices;
+    new_update_stmt_ = ir::Store::Make(
+        original_store_tensor, new_store_body, original_store_indices);
+    ReplaceExpr(&new_update_stmt_, original_indice2new_expr_);
+    VLOG(4) << "new_update_stmt of write back block: \n" << new_update_stmt_;
+  }
+
+ private:
+  const std::vector<Expr>& rf_tensor_access_indices_;
+  const Var& rf_block_rf_iter_var_;
+};
+
+}  // namespace ir
+}  // namespace cinn
diff --git a/paddle/cinn/ir/schedule/ir_schedule.cc b/paddle/cinn/ir/schedule/ir_schedule.cc
index f17e17b73019d..24f97b6e03d1e 100644
--- a/paddle/cinn/ir/schedule/ir_schedule.cc
+++ b/paddle/cinn/ir/schedule/ir_schedule.cc
@@ -33,6 +33,7 @@
 #include "paddle/cinn/ir/ir_printer.h"
 #include "paddle/cinn/ir/ir_visitor.h"
 #include "paddle/cinn/ir/op/ir_operators.h"
+#include "paddle/cinn/ir/schedule/factorize_reduction.h"
 #include "paddle/cinn/ir/schedule/ir_schedule_error.h"
 #include "paddle/cinn/ir/schedule/ir_schedule_util.h"
 #include "paddle/cinn/ir/utils/ir_copy.h"
@@ -120,6 +121,7 @@ class ScheduleImpl {
   void ReverseComputeInline(const Expr& schedule_block);
   void Bind(const Expr& loop, const std::string& thread_axis);
   Expr Rfactor(const Expr& rf_loop, int rf_axis);
+  Expr FactorizeReduction(const Expr& rf_loop, int rf_axis);
   Expr AddUnitLoop(const Expr& block) const;
   void Annotate(const Expr& block, const std::string& key, const attr_t& value);
   void Unannotate(Expr& block, const std::string& key);  // NOLINT
@@ -717,6 +719,79 @@ Expr ScheduleImpl::Rfactor(const Expr& rf_loop, int rf_axis) {
   return rf_create.CreateRfAllStmts();
 }

+Expr ScheduleImpl::FactorizeReduction(const Expr& rf_loop, int rf_axis) {
+  std::string primitive = "FactorizeReduction";
+  // Get child block of the rf_loop and check.
+  std::vector<Expr> blocks = GetChildBlocks(rf_loop);
+  if (blocks.size() != 1) {
+    std::ostringstream os;
+    os << "The rf_loop is required to have only one child block, but got "
+       << blocks.size() << std::endl;
+    throw IRScheduleErrorHandler(primitive, os.str(), this->module_expr_);
+  }
+  Expr original_block = blocks.at(0);
+  Expr root_block = GetRootBlock(original_block);
+  // TODO(BiynXu): Add CheckReductionBlock()
+
+  // Collect the loops of the block.
+  // Construct a map from loop var names to corresponding loops.
+  std::vector<Expr> original_loops = this->GetLoops(original_block);
+  CHECK_GT(original_loops.size(), 0);
+  VLOG(3) << "before FactorizeReduction, original computational body of the "
+             "reduction is:\n"
+          << original_loops[0];
+  std::map<Var, Expr, CompVar> var2loops;
+  for (const Expr& loop : original_loops) {
+    var2loops[loop.As<For>()->loop_var] = loop;
+  }
+
+  // Get original stmt of reduction update and original store tensor.
+  Expr original_update_body = original_block.As<ScheduleBlockRealize>()
+                                  ->schedule_block.As<ScheduleBlock>()
+                                  ->body;
+  Expr original_update_stmt;
+  CHECK(original_update_body.As<Block>() || original_update_body.As<Store>());
+  if (original_update_body.As<Block>()) {
+    CHECK_EQ(original_update_body.As<Block>()->stmts.size(), 1);
+    original_update_stmt = original_update_body.As<Block>()->stmts[0];
+  } else if (original_update_body.As<Store>()) {
+    original_update_stmt = original_update_body;
+  }
+  Tensor original_tensor =
+      original_update_stmt.As<Store>()->tensor.as_tensor_ref();
+
+  // Create new blocks and loops.
+  Tensor rf_tensor = CreateRFTensor(original_tensor, rf_loop, rf_axis);
+  RFBlockCreater rf_block_creater(original_block,
+                                  original_loops,
+                                  rf_loop,
+                                  original_update_stmt,
+                                  rf_tensor,
+                                  var2loops,
+                                  rf_axis);
+  rf_block_creater.CreateBlock();
+  RBBlockCreater wb_block_creater(original_block,
+                                  original_loops,
+                                  rf_loop,
+                                  original_update_stmt,
+                                  rf_tensor,
+                                  rf_block_creater.rf_tensor_access_indices_,
+                                  rf_block_creater.rf_var_);
+  wb_block_creater.CreateBlock();
+
+  Expr rf_body = rf_block_creater.CreateLoops();
+  Expr wb_body = wb_block_creater.CreateLoops();
+
+  Expr new_computational_body = Block::Make({rf_body, wb_body});
+
+  // Replace and update the AST.
+  this->Replace(original_loops[0], new_computational_body);
+  VLOG(3) << "After FactorizeReduction, new computational body of the "
+             "reduction is:\n"
+          << new_computational_body;
+  return rf_tensor;
+}
+
 struct CacheReadRewriter : public ir::IRMutator<> {
  public:
   static Expr Rewrite(const Expr& root, CacheBlockInfo* info) {
@@ -2647,6 +2722,15 @@ Expr IRSchedule::Rfactor(const Expr& rf_loop, int rf_axis) {
   return result;
 }

+Expr IRSchedule::FactorizeReduction(const Expr& rf_loop, int rf_axis) {
+  auto result = impl_->FactorizeReduction(rf_loop, rf_axis);
+  trace_.Append(ScheduleDesc::Step("FactorizeReduction",
+                                   {{"rf_loop", std::vector<Expr>({rf_loop})}},
+                                   {{"rf_axis", rf_axis}},
+                                   {result}));
+  return result;
+}
+
 void IRSchedule::Annotate(const Expr& block,
                           const std::string& key,
                           const attr_t& value) {
diff --git a/paddle/cinn/ir/schedule/ir_schedule.h b/paddle/cinn/ir/schedule/ir_schedule.h
index ce341c502b1fb..4c5fc1d10f1b6 100644
--- a/paddle/cinn/ir/schedule/ir_schedule.h
+++ b/paddle/cinn/ir/schedule/ir_schedule.h
@@ -381,6 +381,46 @@ class IRSchedule {
    */
   Expr Rfactor(const Expr& rf_loop, int rf_axis);

+  /**
+   * \brief Factorize the reduction block by the given loop. The block will be
+   * split into two blocks: reduction-factorized block and write-back block.
+   * @param rf_loop The reduce loop to be factorized.
+   * @param rf_axis The position where the new dimension is placed in the new
+   * rf tensor.
+   * @return The newly created rf tensor.
+   *
+   * For example, input the block:
+   * \code
+   * for (i, 0, 10)      // serial loop
+   *   B_init[i] = 0
+   *   for (j, 0, 20)    // reduce loop
+   *     for (k, 0, 30)  // reduce loop
+   *       B[i] = B[i] + A[i, j, k]
+   * \endcode
+   *
+   * If the rf loop is j and rf_axis is 0, the transformation is
+   * divided into 2 steps:
+   * 1. Get the rf block, in which the reduce loop j is transformed into a
+   * serial loop with no accumulation and a new rf tensor is created.
+   * The axis j will be placed at position rf_axis of the new rf_tensor.
+   * The rf_block is as follows:
+   * \code
+   * for (i, 0, 10)    // serial loop
+   *   for (j, 0, 20)  // rf loop j is transformed into a serial loop
+   *     rf_B_init[j, i] = 0
+   *     for (k, 0, 30)  // reduce loop
+   *       rf_B[j, i] = rf_B[j, i] + A[i, j, k]
+   * \endcode
+   * 2. Do the reduction over the rf loop j to get the final result block:
+   * \code
+   * for (i, 0, 10)    // serial loop
+   *   B_init[i] = 0
+   *   for (j, 0, 20)  // rf reduction loop
+   *     B[i] = B[i] + rf_B[j, i]
+   * \endcode
+   */
+  Expr FactorizeReduction(const Expr& rf_loop, int rf_axis);
+
   /*!
    * \brief Annotate a block with a key-value pair to set as its attribute
    * \param block The block to be annotated
diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.cc b/paddle/cinn/ir/schedule/ir_schedule_util.cc
index 7144e1484a58c..7a2daa3106612 100644
--- a/paddle/cinn/ir/schedule/ir_schedule_util.cc
+++ b/paddle/cinn/ir/schedule/ir_schedule_util.cc
@@ -221,6 +221,14 @@ void ReplaceExpr(Expr* source,
   return;
 }

+void ReplaceExpr(Expr* source,
+                 const std::map<Var, Expr, CompVar>& replacing_map) {
+  if (replacing_map.empty()) return;
+  MappingVarToExprMutator mapper(replacing_map);
+  mapper(source);
+  return;
+}
+
 std::vector<int> ValidateFactors(const std::vector<int>& factors,
                                  int total_extent,
                                  const ModuleExpr& module_expr) {
diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.h b/paddle/cinn/ir/schedule/ir_schedule_util.h
index 50515e5f3cfa9..9c9418b4d577e 100644
--- a/paddle/cinn/ir/schedule/ir_schedule_util.h
+++ b/paddle/cinn/ir/schedule/ir_schedule_util.h
@@ -193,7 +193,7 @@ Tensor GetReadTensor(const Expr& block, int index);
 int GetLoopExtent(const Expr& loop);

 /**
- * \brief Given a vector of Exors, return whether they contain a var with
+ * \brief Given a vector of Exprs, return whether they contain a var with
  * specific name.
  * @param exprs The given vector of Exprs
  * @param var_name The name of specific var
@@ -241,6 +241,15 @@ void ReplaceExpr(Expr* source,
                  const std::vector<Var>& replaced,
                  const std::vector<Expr>& candidates);

+/**
+ * Replace the Vars in source with the corresponding Exprs in replacing_map.
+ * @param source The Expr in which we will implement the change.
+ * @param replacing_map The one-to-one mapping from each Var to be replaced to
+ * the Expr that replaces it.
+ */
+void ReplaceExpr(Expr* source,
+                 const std::map<Var, Expr, CompVar>& replacing_map);
+
 /**
  * Validate the factors param of Split. We will check if the factors are valid
  * and change -1 to a positive integer.
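The rf stage and write-back stage that FactorizeReduction emits (see the
ir_schedule.h comment above) can be checked numerically. Below is a minimal
NumPy sketch of the two generated loop nests -- not the CINN API -- with the
array names, extents, and the [j, i] rf layout taken from the doc comment
(rf_loop = j, rf_axis = 0):

import numpy as np

A = np.random.rand(10, 20, 30).astype(np.float32)

# Stage 1: rf block -- the factorized loop j becomes a serial axis of rf_B,
# so only k is still accumulated.
rf_B = np.zeros((20, 10), dtype=np.float32)
for i in range(10):
    for j in range(20):
        rf_B[j, i] = 0.0  # rf_B_init
        for k in range(30):
            rf_B[j, i] += A[i, j, k]

# Stage 2: write-back block -- reduce the factorized axis j into B.
B = np.zeros(10, dtype=np.float32)
for i in range(10):
    B[i] = 0.0  # B_init
    for j in range(20):
        B[i] += rf_B[j, i]

# The two-stage form reproduces the direct reduction.
assert np.allclose(B, A.sum(axis=(1, 2)), atol=1e-3)
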
diff --git a/paddle/cinn/ir/schedule/schedule_desc.cc b/paddle/cinn/ir/schedule/schedule_desc.cc
index a3ef7e72a1bc9..e0d5f4ab21701 100644
--- a/paddle/cinn/ir/schedule/schedule_desc.cc
+++ b/paddle/cinn/ir/schedule/schedule_desc.cc
@@ -474,6 +474,12 @@ CINN_BUILD_STEP_KIND(Rfactor)
     .SetApplyFn(
         APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(&IRSchedule::Rfactor)));

+CINN_BUILD_STEP_KIND(FactorizeReduction)
+    .Inputs({"rf_loop"})
+    .Attrs({"rf_axis"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(
+        FREE_FUNCTION_CONVERTER(&IRSchedule::FactorizeReduction)));
+
 CINN_BUILD_STEP_KIND(MergeExprs)
     .SetApplyFn(
         APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(&IRSchedule::MergeExprs)));

From 20c7e89ba3faaa39242340a771b3838878877184 Mon Sep 17 00:00:00 2001
From: tianhaodongbd <137985359+tianhaodongbd@users.noreply.github.com>
Date: Mon, 9 Oct 2023 11:03:54 +0800
Subject: [PATCH 19/62] compilation optimization for logsumexp_kernel (#57817)

---
 paddle/phi/kernels/gpu/logsumexp_kernel.cu | 32 ++++------------------
 1 file changed, 5 insertions(+), 27 deletions(-)

diff --git a/paddle/phi/kernels/gpu/logsumexp_kernel.cu b/paddle/phi/kernels/gpu/logsumexp_kernel.cu
index 72f878c38dd36..ef2c29bbb2da0 100644
--- a/paddle/phi/kernels/gpu/logsumexp_kernel.cu
+++ b/paddle/phi/kernels/gpu/logsumexp_kernel.cu
@@ -18,12 +18,15 @@
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/activation_kernel.h"
 #include "paddle/phi/kernels/elementwise_add_kernel.h"
 #include "paddle/phi/kernels/elementwise_subtract_kernel.h"
 #include "paddle/phi/kernels/funcs/activation_functor.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
 #include "paddle/phi/kernels/funcs/transpose_function.cu.h"
 #include "paddle/phi/kernels/gpu/reduce.h"
+#include "paddle/phi/kernels/reduce_max_kernel.h"
+#include "paddle/phi/kernels/transpose_kernel.h"

 namespace phi {

@@ -42,27 +45,6 @@ struct ComputeType {
   using type = float;
 };

-template <typename T>
-struct LogCUDAFunctor {
-  HOSTDEVICE inline T operator()(const T x) const { return std::log(x); }
-};
-
-template <>
-struct LogCUDAFunctor<float16> {
-  HOSTDEVICE inline float16 operator()(const float16 x) const {
-    auto x_ = static_cast<float>(x);
-    return static_cast<float16>(std::log(x_));
-  }
-};
-
-template <>
-struct LogCUDAFunctor<bfloat16> {
-  HOSTDEVICE inline bfloat16 operator()(const bfloat16 x) const {
-    auto x_ = static_cast<float>(x);
-    return static_cast<bfloat16>(std::log(x_));
-  }
-};
-
 template <typename T, typename Context>
 void LogsumexpFallbackKernel(const Context& dev_ctx,
                              const DenseTensor& x,
@@ -84,18 +66,14 @@ void LogsumexpFallbackKernel(const Context& dev_ctx,
   max_x.Resize(outdim);
   dev_ctx.template Alloc<T>(&max_x);

-  phi::funcs::ReduceKernel<T, T, kps::MaxFunctor, kps::IdentityFunctor<T>>(
-      dev_ctx, *in_x, &max_x, kps::IdentityFunctor<T>(), axis_vec);
+  phi::MaxKernel<T, Context>(dev_ctx, *in_x, axis_vec, false, &max_x);

   max_x.Resize(keeped_outdim);
   DenseTensor temp_x = Subtract<T, Context>(dev_ctx, *in_x, max_x);
   phi::funcs::ReduceKernel<T, T, kps::AddFunctor, kps::ExpFunctor<T>>(
       dev_ctx, temp_x, out_y, kps::ExpFunctor<T>(), axis_vec);

-  const std::vector<const DenseTensor*> inputs = {out_y};
-  std::vector<DenseTensor*> outputs = {&temp_x};
-  phi::funcs::ElementwiseKernel<T>(
-      dev_ctx, inputs, &outputs, LogCUDAFunctor<T>());
+  phi::LogKernel<T, Context>(dev_ctx, *out_y, &temp_x);
   temp_x.Resize(outdim);
   out->Resize(outdim);
   phi::AddKernel<T, Context>(dev_ctx, temp_x, max_x, out);

From 1e1fddcaf0536c17f26c07d045d0d11ae6fe138a Mon Sep 17 00:00:00 2001
From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com>
Date: Mon, 9 Oct 2023 11:23:57 +0800
Subject: [PATCH 20/62] fix mp sync params when dp_degree=1 (#57918)

---
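Broadcasting dp parameters is a collective over the data-parallel group, so
when dp_degree is 1 the group has only one rank and there is nothing to
synchronize. A minimal sketch of the guarded control flow this patch
introduces; the FakeHCG stub, prepare_for_model helper, and log list below
are illustrative stand-ins, not the fleet API:

class FakeHCG:
    """Stub for the hybrid communication group topology (illustrative)."""

    def __init__(self, dp_world_size):
        self._dp = dp_world_size

    def get_data_parallel_world_size(self):
        return self._dp


def prepare_for_model(hcg, log):
    log.append("broadcast sharding parameters")
    # The fix: only broadcast dp parameters when dp_degree > 1.
    if hcg.get_data_parallel_world_size() > 1:
        log.append("broadcast dp parameters")


log = []
prepare_for_model(FakeHCG(dp_world_size=1), log)
assert log == ["broadcast sharding parameters"]  # dp broadcast skipped
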
.../distributed/fleet/meta_parallel/tensor_parallel.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_parallel/tensor_parallel.py b/python/paddle/distributed/fleet/meta_parallel/tensor_parallel.py index 6da9dae096d46..9876446ed3a80 100755 --- a/python/paddle/distributed/fleet/meta_parallel/tensor_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/tensor_parallel.py @@ -41,8 +41,9 @@ def _prepare_for_model(self): logger.info("start broadcast sharding parameters") broadcast_sharding_parameters(self._layers, self._hcg) - logger.info("start broadcast dp parameters") - broadcast_dp_parameters(self._layers, self._hcg) + if self._hcg.get_data_parallel_world_size() > 1: + logger.info("start broadcast dp parameters") + broadcast_dp_parameters(self._layers, self._hcg) logger.info("mp's parameters is ready") From f4877d47610175f6083e0641e8cb509092799f5d Mon Sep 17 00:00:00 2001 From: WangZhen <23097963+0x45f@users.noreply.github.com> Date: Mon, 9 Oct 2023 11:29:56 +0800 Subject: [PATCH 21/62] [PIR]Migrate clip into pir (#57907) --- python/paddle/tensor/math.py | 2 +- test/legacy_test/test_clip_op.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index d56bb8ab0768b..5a60e6884b890 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -3509,7 +3509,7 @@ def clip(x, min=None, max=None, name=None): min_ = float(np.finfo(np.float32).min) max_ = float(np.finfo(np.float32).max) - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): if isinstance(min, Variable): min = min.item(0) if isinstance(max, Variable): diff --git a/test/legacy_test/test_clip_op.py b/test/legacy_test/test_clip_op.py index 1c8366bbdf5ef..2de2d94047363 100644 --- a/test/legacy_test/test_clip_op.py +++ b/test/legacy_test/test_clip_op.py @@ -55,12 +55,12 @@ def setUp(self): def test_check_output(self): paddle.enable_static() - self.check_output(check_cinn=self.check_cinn) + self.check_output(check_cinn=self.check_cinn, check_new_ir=True) paddle.disable_static() def test_check_grad_normal(self): paddle.enable_static() - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_new_ir=True) paddle.disable_static() def initTestCase(self): @@ -194,14 +194,14 @@ def test_check_output(self): if paddle.is_compiled_with_cuda(): place = paddle.CUDAPlace(0) paddle.enable_static() - self.check_output_with_place(place) + self.check_output_with_place(place, check_new_ir=True) paddle.disable_static() def test_check_grad_normal(self): if paddle.is_compiled_with_cuda(): place = paddle.CUDAPlace(0) paddle.enable_static() - self.check_grad_with_place(place, ['X'], 'Out') + self.check_grad_with_place(place, ['X'], 'Out', check_new_ir=True) paddle.disable_static() def initTestCase(self): From b6de5eae17bd8754d73415a01b3b2e92a138c5db Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Mon, 9 Oct 2023 12:38:34 +0800 Subject: [PATCH 22/62] [CleanOps]del unseful op2 (#57732) * del unseful op --- paddle/fluid/operators/expand_as_op.cc | 170 ------------------- paddle/fluid/operators/expand_as_op.h | 219 ------------------------- test/legacy_test/test_expand_as_op.py | 104 ------------ 3 files changed, 493 deletions(-) delete mode 100644 paddle/fluid/operators/expand_as_op.cc delete mode 100644 paddle/fluid/operators/expand_as_op.h delete mode 100755 test/legacy_test/test_expand_as_op.py diff --git a/paddle/fluid/operators/expand_as_op.cc 
b/paddle/fluid/operators/expand_as_op.cc deleted file mode 100644 index 107fe9f6174b6..0000000000000 --- a/paddle/fluid/operators/expand_as_op.cc +++ /dev/null @@ -1,170 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/expand_as_op.h" - -#include -#include - -namespace paddle { -namespace operators { - -class ExpandAsOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExpandAs"); - OP_INOUT_CHECK( - ctx->HasInput("target_tensor"), "Input", "target_tensor", "ExpandAs"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ExpandAs"); - auto x_dims = ctx->GetInputDim("X"); - auto target_tensor_dims = ctx->GetInputDim("target_tensor"); - PADDLE_ENFORCE_EQ( - static_cast(x_dims.size()), - target_tensor_dims.size(), - platform::errors::InvalidArgument( - "The rank of Input(target_tensor) must be equal " - "to the rank of Input(X). But received Input(X): input " - "rank %u, input shape [%s]; received Input(target_tensor): " - "input rank %u, input shape [%s].", - x_dims.size(), - x_dims, - target_tensor_dims.size(), - target_tensor_dims)); - PADDLE_ENFORCE_LE( - x_dims.size(), - 6, - platform::errors::InvalidArgument( - "The rank of Input(X) must not be greater than 6. But " - "received: input rank %u, input shape [%s].", - x_dims.size(), - x_dims)); - std::vector out_shape(x_dims.size()); - ctx->SetOutputDim("Out", phi::make_ddim(out_shape)); - } -}; - -class ExpandAsOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(Tensor, default Tensor). A tensor with rank in [1, 6]." - "X is the input to be expanded."); - AddOutput("Out", - "(Tensor, default Tensor). A tensor with rank in [1, 6]." - "The rank of Output(Out) have the same with Input(X). " - "After expanding, size of each dimension of Output(Out) is equal " - "to size of the corresponding dimension of Input(X) multiplying " - "the corresponding value given by Attr(expand_times)."); - AddInput("target_tensor", "Expand tensor's shape for each dimension."); - AddComment(R"DOC( -Expand as operator tiles the input by given times number. You should set times -number for each dimension by providing tensor 'expend_tensor'. The rank of X -should be in [1, 6]. Please note that size of 'expend_tensor' must be the same -with X's rank. 
Following is a using case: -Input(X) is a 3-D tensor with shape [2, 3, 1]: - [ - [[1], [2], [3]], - [[4], [5], [6]] - ] -target_tensors'shape: [2, 6, 2] -Output(Out) is a 3-D tensor with shape [2, 6, 2]: - [ - [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]], - [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]] - ] -)DOC"); - } -}; - -class ExpandAsGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExpandAs"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "ExpandAs"); - - auto x_dims = ctx->GetInputDim("X"); - auto x_grad_name = framework::GradVarName("X"); - if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); - } - } - - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.device_context().GetPlace()); - } -}; - -template -class ExpandAsGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("expand_as_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("target_tensor", this->Input("target_tensor")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetAttrMap(this->Attrs()); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandAsGradNoNeedBufVarsInferer, "X"); - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(expand_as, - ops::ExpandAsOp, - ops::ExpandAsOpMaker, - ops::ExpandAsGradOpMaker, - ops::ExpandAsGradOpMaker); -REGISTER_OPERATOR(expand_as_grad, - ops::ExpandAsGradOp, - ops::ExpandAsGradNoNeedBufVarsInferer); -REGISTER_OP_CPU_KERNEL(expand_as, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel); -REGISTER_OP_CPU_KERNEL(expand_as_grad, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -REGISTER_OP_CUDA_KERNEL(expand_as, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel); -REGISTER_OP_CUDA_KERNEL(expand_as_grad, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel); -#endif diff --git a/paddle/fluid/operators/expand_as_op.h b/paddle/fluid/operators/expand_as_op.h deleted file mode 100644 index a3462a00bcfb1..0000000000000 --- a/paddle/fluid/operators/expand_as_op.h +++ /dev/null @@ -1,219 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" - -#define MAX_RANK_SUPPORTED 6 - -namespace paddle { -namespace operators { - -template -using EigenVector = framework::EigenVector; -template -using EigenTensor = framework::EigenTensor; - -template -class ExpandAsKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); - switch (rank) { - case 1: - ExpandAs<1>(context); - break; - case 2: - ExpandAs<2>(context); - break; - case 3: - ExpandAs<3>(context); - break; - case 4: - ExpandAs<4>(context); - break; - case 5: - ExpandAs<5>(context); - break; - case 6: - ExpandAs<6>(context); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Only support tensor with rank being between 1 and 6. But received " - "tensor X's rank = %d.", - rank)); - } - } - - protected: - template - void ExpandAs(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); - auto in_dims = in0->dims(); - auto* target_tensor = context.Input("target_tensor"); - auto* out0 = context.Output("Out"); - Eigen::DSizes bcast_dims; - int bcast_dims_remainder = 0; - auto x_dims = in0->dims(); - auto y_dims = target_tensor->dims(); - for (int i = 0; i < y_dims.size(); ++i) { - PADDLE_ENFORCE_NE( - x_dims[i], - 0UL, - platform::errors::InvalidArgument( - "X(input) should not have 0 dim. But received x_dims[%d] = 0.", - i)); - bcast_dims[i] = y_dims[i] / x_dims[i]; - bcast_dims_remainder += y_dims[i] % x_dims[i]; - } - PADDLE_ENFORCE_EQ( - bcast_dims_remainder, - 0UL, - platform::errors::InvalidArgument( - "X(input) could not be broadcast together with remapped " - "shape(expand tensor's shape)")); - framework::DDim out_dims(in_dims); - for (size_t i = 0; i < bcast_dims.size(); ++i) { - out_dims[i] *= bcast_dims[i]; - } - - out0->Resize(out_dims); - auto x = EigenTensor::From(*in0); - out0->mutable_data(context.GetPlace()); - auto y = EigenTensor::From(*out0); - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcast, T, Rank>::Eval( - place, y, x, bcast_dims); - } -}; - -template -class ExpandAsGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("X"); - auto* target_tensor = context.Input("target_tensor"); - auto x_dims = in0->dims(); - auto y_dims = target_tensor->dims(); - std::vector bcast_dims; - for (int i = 0; i < y_dims.size(); ++i) { - bcast_dims.push_back(y_dims[i] / x_dims[i]); - } - std::vector reshape_dims_vec; - std::vector reduce_dims_vec; - for (size_t i = 0; i < bcast_dims.size(); ++i) { - reduce_dims_vec.push_back(reshape_dims_vec.size()); - reshape_dims_vec.push_back(bcast_dims[i]); - reshape_dims_vec.push_back(x_dims[i]); - } - int dims = reduce_dims_vec.size(); - bool just_copy = true; - for (size_t i = 0; i < bcast_dims.size(); i++) { - if (bcast_dims[i] != 1) { - just_copy = false; - break; - } - } - // no need reduce, just copy - if (just_copy) { - auto* in0 = - context.Input(framework::GradVarName("Out")); - auto* out0 = - context.Output(framework::GradVarName("X")); - out0->mutable_data(context.GetPlace()); - framework::TensorCopy( - *in0, 
context.GetPlace(), context.device_context(), out0); - } else { - PADDLE_ENFORCE_GE(dims, - 1, - platform::errors::InvalidArgument( - "The rank of the input 'Out@GRAD' for " - "expand_as_grad op must be greater than or " - "equal to 1, but the value received is %d.", - dims)); - PADDLE_ENFORCE_LE(dims, - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'Out@GRAD' for " - "expand_as_grad op must be less than or equal " - "to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, - dims)); - switch (dims) { - case 1: - ExpandAsBackward<1>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 2: - ExpandAsBackward<2>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 3: - ExpandAsBackward<3>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 4: - ExpandAsBackward<4>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 5: - ExpandAsBackward<5>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 6: - ExpandAsBackward<6>(context, reshape_dims_vec, reduce_dims_vec); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Only support tensor with rank being between 1 and 6. But " - "received tensor's rank = %d.", - dims)); - } - } - } - - protected: - template - void ExpandAsBackward(const framework::ExecutionContext& context, - const std::vector& reshape_dims_vec, - const std::vector& reduce_dims_vec) const { - size_t reshape_size = reshape_dims_vec.size(); - size_t reduce_size = reduce_dims_vec.size(); - auto* in0 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); - out0->mutable_data(context.GetPlace()); - auto x_grad = EigenVector::Flatten(*out0); - Eigen::DSizes reshape_dims; - for (size_t i = 0; i < reshape_size; ++i) { - reshape_dims[i] = reshape_dims_vec[i]; - } - Eigen::DSizes reduce_dims; - for (size_t i = 0; i < reduce_size; ++i) { - reduce_dims[i] = reduce_dims_vec[i]; - } - auto out_grad = EigenVector::Flatten(*in0); - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcastGrad, T, Dims>::Eval( - place, x_grad, out_grad, reduce_dims, reshape_dims); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/test/legacy_test/test_expand_as_op.py b/test/legacy_test/test_expand_as_op.py deleted file mode 100755 index b5b8013a2c9c6..0000000000000 --- a/test/legacy_test/test_expand_as_op.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -from op_test import OpTest - - -def bcast(x, target_tensor): - x_dims = x.shape - y_dims = target_tensor.shape - bcast_dims = [] - for i in range(len(x_dims)): - bcast_dims.append(int(y_dims[i] / x_dims[i])) - bcast_dims = np.array(bcast_dims).astype("int64") - return bcast_dims - - -class TestExpandAsOpRank1(OpTest): - def setUp(self): - self.op_type = "expand_as" - x = np.random.rand(100).astype("float64") - target_tensor = np.random.rand(200).astype("float64") - self.inputs = {'X': x, 'target_tensor': target_tensor} - self.attrs = {} - bcast_dims = bcast(x, target_tensor) - output = np.tile(self.inputs['X'], bcast_dims) - self.outputs = {'Out': output} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - -class TestExpandAsOpRank2(OpTest): - def setUp(self): - self.op_type = "expand_as" - x = np.random.rand(10, 12).astype("float64") - target_tensor = np.random.rand(20, 24).astype("float64") - self.inputs = {'X': x, 'target_tensor': target_tensor} - self.attrs = {} - bcast_dims = bcast(x, target_tensor) - output = np.tile(self.inputs['X'], bcast_dims) - self.outputs = {'Out': output} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - -class TestExpandAsOpRank3(OpTest): - def setUp(self): - self.op_type = "expand_as" - x = np.random.rand(2, 3, 20).astype("float64") - target_tensor = np.random.rand(4, 6, 40).astype("float64") - self.inputs = {'X': x, 'target_tensor': target_tensor} - self.attrs = {} - bcast_dims = bcast(x, target_tensor) - output = np.tile(self.inputs['X'], bcast_dims) - self.outputs = {'Out': output} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - -class TestExpandAsOpRank4(OpTest): - def setUp(self): - self.op_type = "expand_as" - x = np.random.rand(1, 1, 7, 16).astype("float64") - target_tensor = np.random.rand(4, 6, 14, 32).astype("float64") - self.inputs = {'X': x, 'target_tensor': target_tensor} - self.attrs = {} - bcast_dims = bcast(x, target_tensor) - output = np.tile(self.inputs['X'], bcast_dims) - self.outputs = {'Out': output} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - -if __name__ == "__main__": - unittest.main() From 5b4c0a608a35e3960c291dbecf277cebb1faca94 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Mon, 9 Oct 2023 12:39:02 +0800 Subject: [PATCH 23/62] [CleanOps]del_unuseful_op10 (#57810) * del_unuseful_op10 --- paddle/fluid/operators/flatten_op.cc | 167 ++----- paddle/fluid/operators/flatten_op.cu.cc | 14 - paddle/fluid/operators/flatten_op.h | 50 +- paddle/fluid/operators/flatten_op_xpu.cc | 12 - .../operators/mkldnn/reshape_mkldnn_op.cc | 2 +- paddle/fluid/operators/unsqueeze_op.cc | 311 ------------ paddle/fluid/operators/unsqueeze_op.cu.cc | 48 -- paddle/fluid/operators/unsqueeze_op.h | 145 ------ test/ir/inference/test_trt_convert_flatten.py | 447 ------------------ test/legacy_test/test_flatten_op.py | 97 ---- test/legacy_test/test_unsqueeze_op.py | 423 ----------------- test/xpu/test_flatten_op_xpu.py | 88 ---- test/xpu/test_unsqueeze_op_xpu.py | 104 ---- tools/parallel_UT_rule.py | 3 - tools/windows/run_unittests.sh | 1 - 15 files changed, 37 insertions(+), 1875 deletions(-) delete mode 100644 paddle/fluid/operators/unsqueeze_op.cc delete mode 100644 paddle/fluid/operators/unsqueeze_op.cu.cc delete mode 100644 
paddle/fluid/operators/unsqueeze_op.h delete mode 100644 test/ir/inference/test_trt_convert_flatten.py delete mode 100644 test/legacy_test/test_flatten_op.py delete mode 100755 test/legacy_test/test_unsqueeze_op.py delete mode 100644 test/xpu/test_flatten_op_xpu.py delete mode 100644 test/xpu/test_unsqueeze_op_xpu.py diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc index 7bacc5f9de3e0..ddb67eef4a3fa 100644 --- a/paddle/fluid/operators/flatten_op.cc +++ b/paddle/fluid/operators/flatten_op.cc @@ -27,35 +27,53 @@ limitations under the License. */ namespace paddle { namespace operators { - -class FlattenOp : public framework::OperatorWithKernel { +// FIXME(zcd): flatten2 adds an intermediate output(XShape) based on flatten, +// the XShape is used to carry the shape and lod of X which will be used in +// flatten_grad, in this way, the framework can reuse the memory of X +// immediately the flatten2_op is finished. +// Considering compatibility issues, we could not fix flatten2_op +class Flatten2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Flatten"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Flatten"); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Flatten2"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Flatten2"); const auto &axis = ctx->Attrs().Get("axis"); const auto &in_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_GE(axis, 0, platform::errors::InvalidArgument( "The axis should be greater than or equal to 0.")); - if (in_dims.size() > 0) { - PADDLE_ENFORCE_LE( - axis, - in_dims.size(), - platform::errors::InvalidArgument( - "The axis should be less than or equal to input tensor's rank.")); - } + PADDLE_ENFORCE_LE( + axis, + in_dims.size(), + platform::errors::InvalidArgument( + "The axis should be less than or equal to input tensor's rank")); - const auto &out_dims = GetOutputShape(axis, in_dims); + const auto &out_dims = Flatten2Op::GetOutputShape(axis, in_dims); ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); if (in_dims[0] == out_dims[0]) { // Only pass LoD when the first dimension of output and Input(X) // are the same. 
ctx->ShareLoD("X", "Out"); } + if (!ctx->HasOutput("XShape")) return; + // OP_INOUT_CHECK(ctx->HasOutput("XShape"), "Output", "XShape", "Flatten2"); + std::vector xshape_dims(in_dims.size() + 1); + xshape_dims[0] = 0; + for (int i = 0; i < in_dims.size(); ++i) { + xshape_dims[i + 1] = in_dims[i]; + } + ctx->SetOutputDim("XShape", phi::make_ddim(xshape_dims)); + ctx->ShareLoD("X", "XShape"); + } + + phi::KernelKey GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return phi::KernelKey(input_data_type, ctx.GetPlace()); } static std::vector GetOutputShape(const int axis, @@ -85,17 +103,9 @@ class FlattenOp : public framework::OperatorWithKernel { out_shape[1] = static_cast(inner); return out_shape; } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - auto input_data_type = - framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return phi::KernelKey(input_data_type, ctx.GetPlace()); - } }; -class FlattenOpMaker : public framework::OpProtoAndCheckerMaker { +class Flatten2OpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", "(Tensor) A tensor of rank >= axis."); @@ -145,96 +155,6 @@ Case 2: We get: Out.shape = (1, 3 * 100 * 100 * 4) )DOC"); - } -}; - -class FlattenGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *context) const override { - context->SetOutputDim(framework::GradVarName("X"), - context->GetInputDim("X")); - context->ShareLoD("X", framework::GradVarName("X")); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); - return phi::KernelKey(input_data_type, ctx.GetPlace()); - } -}; - -template -class FlattenGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("flatten_grad"); - grad_op->SetInput("X", this->Input("X")); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -// FIXME(zcd): flatten2 adds an intermediate output(XShape) based on flatten, -// the XShape is used to carry the shape and lod of X which will be used in -// flatten_grad, in this way, the framework can reuse the memory of X -// immediately the flatten2_op is finished. 
-// Considering compatibility issues, we could not fix flatten2_op -class Flatten2Op : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Flatten2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Flatten2"); - const auto &axis = ctx->Attrs().Get("axis"); - const auto &in_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_GE(axis, - 0, - platform::errors::InvalidArgument( - "The axis should be greater than or equal to 0.")); - PADDLE_ENFORCE_LE( - axis, - in_dims.size(), - platform::errors::InvalidArgument( - "The axis should be less than or equal to input tensor's rank")); - - const auto &out_dims = FlattenOp::GetOutputShape(axis, in_dims); - ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); - if (in_dims[0] == out_dims[0]) { - // Only pass LoD when the first dimension of output and Input(X) - // are the same. - ctx->ShareLoD("X", "Out"); - } - if (!ctx->HasOutput("XShape")) return; - // OP_INOUT_CHECK(ctx->HasOutput("XShape"), "Output", "XShape", "Flatten2"); - std::vector xshape_dims(in_dims.size() + 1); - xshape_dims[0] = 0; - for (int i = 0; i < in_dims.size(); ++i) { - xshape_dims[i + 1] = in_dims[i]; - } - ctx->SetOutputDim("XShape", phi::make_ddim(xshape_dims)); - ctx->ShareLoD("X", "XShape"); - } - - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - auto input_data_type = - framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return phi::KernelKey(input_data_type, ctx.GetPlace()); - } -}; - -class Flatten2OpMaker : public FlattenOpMaker { - public: - void Make() override { - FlattenOpMaker::Make(); AddOutput("XShape", "XShape is just used to store the shape and lod of X, which will " "be used in FlattenGradOp.") @@ -293,17 +213,6 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(FlattenGradNoNeedBufferVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(flatten, - ops::FlattenOp, - ops::FlattenOpMaker, - ops::FlattenGradOpMaker, - ops::FlattenGradOpMaker, - ops::FlattenOpInplaceInferer); -REGISTER_OPERATOR(flatten_grad, - ops::FlattenGradOp, - ops::FlattenGradInplaceInferer, - ops::FlattenGradNoNeedBufferVarsInferer); - REGISTER_OPERATOR(flatten2, ops::Flatten2Op, ops::Flatten2OpMaker, @@ -314,20 +223,6 @@ REGISTER_OPERATOR(flatten2_grad, ops::Flatten2GradOp, ops::FlattenGradInplaceInferer); -REGISTER_OP_CPU_KERNEL(flatten, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel); -REGISTER_OP_CPU_KERNEL(flatten_grad, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel); REGISTER_OP_CPU_KERNEL(flatten2, ops::Flatten2Kernel, ops::Flatten2Kernel, diff --git a/paddle/fluid/operators/flatten_op.cu.cc b/paddle/fluid/operators/flatten_op.cu.cc index 6fe28c4a7b689..59b9271f0af3c 100644 --- a/paddle/fluid/operators/flatten_op.cu.cc +++ b/paddle/fluid/operators/flatten_op.cu.cc @@ -16,20 +16,6 @@ limitations under the License. 
*/ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(flatten, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel); -REGISTER_OP_CUDA_KERNEL(flatten_grad, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel); REGISTER_OP_CUDA_KERNEL(flatten2, ops::Flatten2Kernel, ops::Flatten2Kernel, diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h index 513716047ed77..6942a0f7db2da 100644 --- a/paddle/fluid/operators/flatten_op.h +++ b/paddle/fluid/operators/flatten_op.h @@ -28,14 +28,16 @@ namespace paddle { namespace operators { template -class FlattenKernel : public framework::OpKernel { +class Flatten2Kernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { + auto &axes = context.Attr("axis"); + auto *in = context.Input("X"); + auto x_dims = in->dims(); + auto *out = context.Output("Out"); - auto &axes = context.Attr("axis"); - auto x_dims = in->dims(); auto out_dims = phi::make_ddim(GetOutputShape(axes, x_dims)); out->mutable_data(context.GetPlace(), in->type()); @@ -68,48 +70,6 @@ class FlattenKernel : public framework::OpKernel { } }; -template -class FlattenGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_out = ctx.Input(framework::GradVarName("Out")); - auto in_dims = ctx.Input("X")->dims(); - - d_x->mutable_data(ctx.GetPlace(), d_out->type()); - framework::TensorCopy( - *d_out, - ctx.GetPlace(), - ctx.template device_context(), - d_x); - d_x->Resize(in_dims); - } -}; - -template -class Flatten2Kernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto &axes = context.Attr("axis"); - - auto *in = context.Input("X"); - auto x_dims = in->dims(); - - auto *out = context.Output("Out"); - - auto out_dims = phi::make_ddim( - FlattenKernel::GetOutputShape(axes, x_dims)); - - out->mutable_data(context.GetPlace(), in->type()); - framework::TensorCopy( - *in, - context.GetPlace(), - context.template device_context(), - out); - out->Resize(out_dims); - } -}; - template class Flatten2GradKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/flatten_op_xpu.cc b/paddle/fluid/operators/flatten_op_xpu.cc index 0e79e7b7dda8c..ec54a8f815ab4 100644 --- a/paddle/fluid/operators/flatten_op_xpu.cc +++ b/paddle/fluid/operators/flatten_op_xpu.cc @@ -19,18 +19,6 @@ limitations under the License. 
*/ namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_XPU_KERNEL( - flatten, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel); -REGISTER_OP_XPU_KERNEL( - flatten_grad, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel); REGISTER_OP_XPU_KERNEL( flatten2, ops::Flatten2Kernel, diff --git a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc index 3c53b05152b7e..d1bbfe4229372 100644 --- a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc @@ -161,7 +161,7 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { x_dims = x->dims(); auto axes = ctx.Attr("axis"); out_dims = phi::make_ddim( - FlattenKernel::GetOutputShape(axes, x_dims)); + Flatten2Kernel::GetOutputShape(axes, x_dims)); } protected: diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc deleted file mode 100644 index 5c6816a171fbc..0000000000000 --- a/paddle/fluid/operators/unsqueeze_op.cc +++ /dev/null @@ -1,311 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/unsqueeze_op.h" - -#include -#include -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace operators { - -class UnsqueezeOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), - true, - platform::errors::InvalidArgument( - "Input(X) of " - "Unsqueeze operator should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), - true, - platform::errors::InvalidArgument( - "Output(Out) of " - "Unsqueeze operator should not be null.")); - - const auto &axes = ctx->Attrs().Get>("axes"); - const auto &x_dims = ctx->GetInputDim("X"); - // Validity Check: input tensor dims (<6). - PADDLE_ENFORCE_LE(x_dims.size(), - 6, - platform::errors::InvalidArgument( - "Invalid " - "dimensions, the rank of Input(X) " - "should be in the range of [1, 6] (Eigen limit)")); - if (!axes.empty()) { - auto out_dims = GetOutputShape(axes, x_dims); - ctx->SetOutputDim("Out", out_dims); - if (x_dims[0] == out_dims[0]) { - // Only pass LoD when the first dimension of output and Input(X) - // are the same. 
- ctx->ShareLoD("X", "Out"); - } - } else if (ctx->HasInputs("AxesTensorList")) { - auto AxesTensorList = ctx->Inputs("AxesTensorList"); - int output_size = x_dims.size() + static_cast(AxesTensorList.size()); - PADDLE_ENFORCE_LE(output_size, - 6, - platform::errors::InvalidArgument( - "The output tensor's rank should be less than 6.")); - std::vector vec_out_dims(output_size, -1); - ctx->SetOutputDim("Out", phi::make_ddim(vec_out_dims)); - } else if (ctx->HasInput("AxesTensor")) { - auto axes_dims = ctx->GetInputDim("AxesTensor"); - PADDLE_ENFORCE_EQ(axes_dims.size(), - 1, - platform::errors::InvalidArgument( - "Input(AxesTensor)'s dimension of " - "Op(unsqueeze) must be 1. " - "But received AxesTensor's shape = [%s], " - "AxesTensor's dimension = %d.", - axes_dims, - axes_dims.size())); - PADDLE_ENFORCE_GE( - axes_dims[0], - 0, - platform::errors::InvalidArgument( - "Input(AxesTensor)'s shape must be known. But received " - "AxesTensor's shape = [%s]", - axes_dims)); - int output_size = x_dims.size() + static_cast(axes_dims[0]); - PADDLE_ENFORCE_LE(output_size, - 6, - platform::errors::InvalidArgument( - "The output tensor's rank should be less than 6.")); - std::vector vec_out_dims(output_size, -1); - ctx->SetOutputDim("Out", phi::make_ddim(vec_out_dims)); - } - } - - static framework::DDim GetOutputShape(const std::vector unsqz_dims, - const framework::DDim &in_dims) { - int output_size = in_dims.size() + static_cast(unsqz_dims.size()); - int cur_output_size = in_dims.size(); - std::vector output_shape(output_size, 0); - - // Validity Check: rank range. - PADDLE_ENFORCE_LE(output_size, - 6, - platform::errors::InvalidArgument( - "The output tensor's rank should be less than 6.")); - - for (int axis : unsqz_dims) { - int cur = axis < 0 ? axis + cur_output_size + 1 : axis; - // Vaildity Check: the axis bound - PADDLE_ENFORCE_GE( - cur, - 0, - platform::errors::InvalidArgument("The insert dimension value should " - "not be less than 0")); - PADDLE_ENFORCE_LE(cur, - cur_output_size, - platform::errors::InvalidArgument( - "The insert dimension value shoud not be larger " - "than the dimension size of input tensor")); - // Move old axis, and insert new axis - for (int i = cur_output_size; i >= cur; --i) { - if (output_shape[i] == 1) { - // Move axis - output_shape[i + 1] = 1; - output_shape[i] = 0; - } - } - output_shape[cur] = 1; - // Add the output size. - cur_output_size++; - } - - // Make output shape - for (int in_idx = 0, out_idx = 0; out_idx < output_size; ++out_idx) { - if (output_shape[out_idx] == 0) { - output_shape[out_idx] = in_dims[in_idx++]; - } - } - - return phi::make_ddim(output_shape); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey(framework::TransToProtoVarType( - ctx.Input("X")->type()), - ctx.GetPlace()); - } - - phi::KernelKey GetKernelTypeForVar( - const std::string &var_name, - const phi::DenseTensor &tensor, - const phi::KernelKey &expected_kernel_type) const override { - if (var_name == "AxesTensor" || var_name == "AxesTensorList") { - return phi::KernelKey(phi::Backend::ALL_BACKEND, - expected_kernel_type.layout(), - expected_kernel_type.dtype()); - } - return phi::KernelKey( - tensor.place(), tensor.layout(), expected_kernel_type.dtype()); - } -}; - -class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor). The input tensor of unsqueeze operator."); - AddInput("AxesTensor", - "(Tensor, optional). 
The dimensions to be inserted. " - "If it exists, it will replace Attr(axes).") - .AsDispensable(); - AddInput( - "AxesTensorList", - "(vector>, optional). The dimensions to be inserted. " - "If it exists, it will replace Attr(axes)." - "The shape of the element in vector must be [1].") - .AsDuplicable() - .AsDispensable(); - AddOutput("Out", "(Tensor). The output tensor of unsqueeze operator."); - AddAttr>("axes", - "(std::vector). List of integers," - " indicating the dimensions to be inserted") - .SetDefault({}) - .AddCustomChecker([](const std::vector &axes) { - // Validity Check: axes dims (<6). - PADDLE_ENFORCE_LT(static_cast(axes.size()), - 6, - platform::errors::InvalidArgument( - "Invalid " - "dimensions, dynamic dimensions should be " - "within [1, 6] dimensions (Eigen limit).")); - // Validity Check: the range of unsqueeze axis. - for (int axis : axes) { - PADDLE_ENFORCE_LT(axis, - 6, - platform::errors::InvalidArgument( - "Invalid " - "dimensions, input axis should be" - "within [1, 6] dimensions (Eigen limit).")); - } - }); - AddComment(R"DOC( - Unsqueeze Operator. - - Insert single-dimensional entries to the shape of a tensor. - Takes one required argument axes, a list of dimensions that will be inserted. - Dimension indices in axes are as seen in the output tensor. - - For example: - Given a tensor such that tensor with shape [3, 4, 5], - then Unsqueeze(tensor, axes=[0, 4]) has shape [1, 3, 4, 5, 1] - )DOC"); - } -}; - -class UnsqueezeGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - ctx->ShareLoD("X", framework::GradVarName("X")); - } - - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.GetPlace()); - } -}; - -template -class UnsqueezeGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("unsqueeze_grad"); - grad_op->SetInput("X", this->Input("X")); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -template -class UnsqueezeDoubleGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("unsqueeze"); - grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); - grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -DECLARE_INPLACE_OP_INFERER(UnsqueezeInplaceInferer, {"X", "Out"}); -DECLARE_INPLACE_OP_INFERER(UnsqueezeGradInplaceInferer, - {framework::GradVarName("Out"), - framework::GradVarName("X")}); -DECLARE_NO_NEED_BUFFER_VARS_INFERER(UnsqueezeGradOpNoNeedBufferVarInferer, "X"); -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(unsqueeze, - ops::UnsqueezeOp, - ops::UnsqueezeOpMaker, - ops::UnsqueezeGradOpMaker, - ops::UnsqueezeGradOpMaker); - -REGISTER_OPERATOR(unsqueeze_grad, - ops::UnsqueezeGradOp, - ops::UnsqueezeDoubleGradOpMaker, - 
ops::UnsqueezeDoubleGradOpMaker, - ops::UnsqueezeGradOpNoNeedBufferVarInferer); - -REGISTER_OP_CPU_KERNEL( - unsqueeze, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel>, - ops::UnsqueezeKernel>, - ops::UnsqueezeKernel); -REGISTER_OP_CPU_KERNEL( - unsqueeze_grad, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel>, - ops::UnsqueezeGradKernel>, - ops::UnsqueezeGradKernel); diff --git a/paddle/fluid/operators/unsqueeze_op.cu.cc b/paddle/fluid/operators/unsqueeze_op.cu.cc deleted file mode 100644 index 3a98a64d858a5..0000000000000 --- a/paddle/fluid/operators/unsqueeze_op.cu.cc +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/unsqueeze_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - unsqueeze, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel>, - ops::UnsqueezeKernel>); -REGISTER_OP_CUDA_KERNEL( - unsqueeze_grad, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel>, - ops::UnsqueezeGradKernel>); diff --git a/paddle/fluid/operators/unsqueeze_op.h b/paddle/fluid/operators/unsqueeze_op.h deleted file mode 100644 index 94d8ede8e134c..0000000000000 --- a/paddle/fluid/operators/unsqueeze_op.h +++ /dev/null @@ -1,145 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class UnsqueezeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto axes = context.Attr>("axes"); - auto *in = context.Input("X"); - auto *out = context.Output("Out"); - auto x_dims = in->dims(); - - bool need_resize_out_dims = false; - if (axes.empty()) { - auto axes_tensor_list = - context.MultiInput("AxesTensorList"); - if (axes_tensor_list.size() > 0) { - axes = GetDataFromTensorList(axes_tensor_list); - } else if (context.HasInput("AxesTensor")) { - auto *axes_tensor = context.Input("AxesTensor"); - axes = phi::GetVectorFromTensor(axes_tensor); - } - need_resize_out_dims = true; - } - framework::DDim out_dims = out->dims(); - if (need_resize_out_dims) { - out_dims = GetOutputShape(axes, x_dims); - out->Resize(out_dims); - } - out->mutable_data(context.GetPlace(), in->type()); - framework::TensorCopy( - *in, - context.GetPlace(), - context.template device_context(), - out); - out->Resize(out_dims); - } - - static framework::DDim GetOutputShape(const std::vector unsqz_dims, - const framework::DDim &in_dims) { - int output_size = in_dims.size() + static_cast(unsqz_dims.size()); - int cur_output_size = in_dims.size(); - std::vector output_shape(output_size, 0); - - // Validity Check: rank range. - PADDLE_ENFORCE_LE(output_size, - 6, - platform::errors::InvalidArgument( - "The output " - "tensor's rank should be less than 6.")); - - for (int axis : unsqz_dims) { - int cur = axis < 0 ? axis + cur_output_size + 1 : axis; - // Vaildity Check: the axis bound - PADDLE_ENFORCE_GE( - cur, - 0, - platform::errors::InvalidArgument("The insert dimension value should " - "not be less than 0")); - PADDLE_ENFORCE_LE(cur, - cur_output_size, - platform::errors::InvalidArgument( - "The insert dimension value shoule not be larger " - "than the dimension size of input tensor")); - // Move old axis, and insert new axis - for (int i = cur_output_size; i >= cur; --i) { - if (output_shape[i] == 1) { - // Move axis - output_shape[i + 1] = 1; - output_shape[i] = 0; - } - } - output_shape[cur] = 1; - // Add the output size. 
- cur_output_size++; - } - - // Make output shape - for (int in_idx = 0, out_idx = 0; out_idx < output_size; ++out_idx) { - if (output_shape[out_idx] == 0) { - output_shape[out_idx] = in_dims[in_idx++]; - } - } - - return phi::make_ddim(output_shape); - } -}; - -template -class UnsqueezeGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *d_out = ctx.Input(framework::GradVarName("Out")); - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto in_dims = ctx.Input("X")->dims(); - - d_x->mutable_data(ctx.GetPlace(), d_out->type()); - framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x); - d_x->Resize(in_dims); - } -}; - -template -class Unsqueeze2GradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *d_out = ctx.Input(framework::GradVarName("Out")); - auto *d_x = ctx.Output(framework::GradVarName("X")); - // auto in_dims = d_x->dims(); - - auto xshape_dims = ctx.Input("XShape")->dims(); - auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); - - d_x->mutable_data(ctx.GetPlace(), d_out->type()); - framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x); - d_x->Resize(x_dims); - } -}; -} // namespace operators -} // namespace paddle diff --git a/test/ir/inference/test_trt_convert_flatten.py b/test/ir/inference/test_trt_convert_flatten.py deleted file mode 100644 index 636502aa23113..0000000000000 --- a/test/ir/inference/test_trt_convert_flatten.py +++ /dev/null @@ -1,447 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
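# The four converter tests below gate their expected (trt_engine, op) counts on
# the compiled TensorRT version, folded into a single integer as
# major * 1000 + minor * 100 + patch * 10, so 7130 stands for TensorRT 7.1.3.
# A minimal sketch of that encoding (hedged: get_trt_compile_version is assumed
# to return a (major, minor, patch) tuple, as the calls below suggest):
#
#     ver = paddle_infer.get_trt_compile_version()
#     version_code = ver[0] * 1000 + ver[1] * 100 + ver[2] * 10
#     use_new_expectation = version_code >= 7130
#
# Note that the deleted tests derive the tens digit from ver[0] rather than
# ver[2]; the hunks below preserve that expression as it appeared.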
- -import unittest -from functools import partial -from typing import List - -import numpy as np -from program_config import ProgramConfig, TensorConfig -from trt_layer_auto_scan_test import TrtLayerAutoScanTest - -import paddle.inference as paddle_infer - - -class TrtConvertFlattenTest_dim_2(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: - return True - - def sample_program_configs(self): - def generate_input(batch): - return np.random.random([batch, 32]).astype(np.float32) - - for batch in [1, 4]: - for axis in [0, 1]: - for type in ["flatten", "flatten2"]: - if type == "flatten": - op_outputs = {"Out": ["output_data"]} - else: - op_outputs = { - "Out": ["output_data"], - "XShape": ["xshape_data"], - } - dics = [{"axis": axis}] - ops_config = [ - { - "op_type": "flatten", - "op_inputs": {"X": ["input_data"]}, - "op_outputs": op_outputs, - "op_attrs": dics[0], - } - ] - ops = self.generate_op_config(ops_config) - - program_config = ProgramConfig( - ops=ops, - weights={}, - inputs={ - "input_data": TensorConfig( - data_gen=partial(generate_input, batch) - ) - }, - outputs=["output_data"], - ) - - yield program_config - - def sample_predictor_configs( - self, program_config - ) -> (paddle_infer.Config, List[int], float): - def generate_dynamic_shape(attrs): - self.dynamic_shape.min_input_shape = {"input_data": [1, 8]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 64]} - self.dynamic_shape.opt_input_shape = {"input_data": [2, 32]} - - def clear_dynamic_shape(): - self.dynamic_shape.max_input_shape = {} - self.dynamic_shape.min_input_shape = {} - self.dynamic_shape.opt_input_shape = {} - - def generate_trt_nodes_num(attrs, dynamic_shape): - ver = paddle_infer.get_trt_compile_version() - if ver[0] * 1000 + ver[1] * 100 + ver[0] * 10 >= 7130: - if attrs[0]['axis'] == 1: - return 1, 2 - else: - return 0, 3 - else: - if dynamic_shape: - return 0, 3 - - if attrs[0]['axis'] == 1: - return 1, 2 - else: - return 0, 3 - - attrs = [ - program_config.ops[i].attrs for i in range(len(program_config.ops)) - ] - - # for static_shape - clear_dynamic_shape() - self.trt_param.precision = paddle_infer.PrecisionType.Float32 - program_config.set_input_type(np.float32) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False - ), 1e-5 - self.trt_param.precision = paddle_infer.PrecisionType.Half - program_config.set_input_type(np.float16) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False - ), (1e-3, 1e-3) - - # for dynamic_shape - generate_dynamic_shape(attrs) - self.trt_param.precision = paddle_infer.PrecisionType.Float32 - program_config.set_input_type(np.float32) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True - ), 1e-5 - self.trt_param.precision = paddle_infer.PrecisionType.Half - program_config.set_input_type(np.float16) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True - ), (1e-3, 1e-3) - - def test(self): - self.run_test() - - -class TrtConvertFlattenTest_dim_3(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: - return True - - def sample_program_configs(self): - def generate_input(batch): - return np.random.random([batch, 32, 64]).astype(np.float32) - - for batch in [1, 4]: - for axis in [0, 1, 2]: - for type in ["flatten", "flatten2"]: - if type == "flatten": - op_outputs = {"Out": ["output_data"]} - else: - op_outputs = { - "Out": ["output_data"], - "XShape": ["xshape_data"], - } - dics = [{"axis": 
axis}] - ops_config = [ - { - "op_type": "flatten", - "op_inputs": {"X": ["input_data"]}, - "op_outputs": op_outputs, - "op_attrs": dics[0], - } - ] - ops = self.generate_op_config(ops_config) - - program_config = ProgramConfig( - ops=ops, - weights={}, - inputs={ - "input_data": TensorConfig( - data_gen=partial(generate_input, batch) - ) - }, - outputs=["output_data"], - ) - - yield program_config - - def sample_predictor_configs( - self, program_config - ) -> (paddle_infer.Config, List[int], float): - def generate_dynamic_shape(attrs): - self.dynamic_shape.min_input_shape = {"input_data": [1, 8, 8]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 32, 64]} - self.dynamic_shape.opt_input_shape = {"input_data": [2, 32, 64]} - - def clear_dynamic_shape(): - self.dynamic_shape.max_input_shape = {} - self.dynamic_shape.min_input_shape = {} - self.dynamic_shape.opt_input_shape = {} - - def generate_trt_nodes_num(attrs, dynamic_shape): - ver = paddle_infer.get_trt_compile_version() - if ver[0] * 1000 + ver[1] * 100 + ver[0] * 10 >= 7130: - if attrs[0]['axis'] == 1: - return 1, 2 - else: - return 0, 3 - else: - if dynamic_shape: - return 0, 3 - - if attrs[0]['axis'] == 1: - return 1, 2 - else: - return 0, 3 - - attrs = [ - program_config.ops[i].attrs for i in range(len(program_config.ops)) - ] - - # for static_shape - clear_dynamic_shape() - self.trt_param.precision = paddle_infer.PrecisionType.Float32 - program_config.set_input_type(np.float32) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False - ), 1e-5 - self.trt_param.precision = paddle_infer.PrecisionType.Half - program_config.set_input_type(np.float16) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False - ), (1e-3, 1e-3) - - # for dynamic_shape - generate_dynamic_shape(attrs) - self.trt_param.precision = paddle_infer.PrecisionType.Float32 - program_config.set_input_type(np.float32) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True - ), 1e-5 - self.trt_param.precision = paddle_infer.PrecisionType.Half - program_config.set_input_type(np.float16) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True - ), (1e-3, 1e-3) - - def test(self): - self.run_test() - - -class TrtConvertFlattenTest_dim_4(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: - return True - - def sample_program_configs(self): - def generate_input(batch): - return np.random.random([batch, 8, 8, 8]).astype(np.float32) - - for batch in [1, 4]: - for axis in [0, 1, 2, 3]: - for type in ["flatten", "flatten2"]: - if type == "flatten": - op_outputs = {"Out": ["output_data"]} - else: - op_outputs = { - "Out": ["output_data"], - "XShape": ["xshape_data"], - } - dics = [{"axis": axis}] - ops_config = [ - { - "op_type": "flatten", - "op_inputs": {"X": ["input_data"]}, - "op_outputs": op_outputs, - "op_attrs": dics[0], - } - ] - ops = self.generate_op_config(ops_config) - - program_config = ProgramConfig( - ops=ops, - weights={}, - inputs={ - "input_data": TensorConfig( - data_gen=partial(generate_input, batch) - ) - }, - outputs=["output_data"], - ) - - yield program_config - - def sample_predictor_configs( - self, program_config - ) -> (paddle_infer.Config, List[int], float): - def generate_dynamic_shape(attrs): - self.dynamic_shape.min_input_shape = {"input_data": [1, 4, 4, 4]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 32, 32, 32]} - self.dynamic_shape.opt_input_shape = {"input_data": [2, 16, 16, 8]} - - def 
clear_dynamic_shape(): - self.dynamic_shape.max_input_shape = {} - self.dynamic_shape.min_input_shape = {} - self.dynamic_shape.opt_input_shape = {} - - def generate_trt_nodes_num(attrs, dynamic_shape): - ver = paddle_infer.get_trt_compile_version() - if ver[0] * 1000 + ver[1] * 100 + ver[0] * 10 >= 7130: - if attrs[0]['axis'] == 1: - return 1, 2 - else: - return 0, 3 - else: - if dynamic_shape: - return 0, 3 - - if attrs[0]['axis'] == 1: - return 1, 2 - else: - return 0, 3 - - attrs = [ - program_config.ops[i].attrs for i in range(len(program_config.ops)) - ] - - # for static_shape - clear_dynamic_shape() - self.trt_param.precision = paddle_infer.PrecisionType.Float32 - program_config.set_input_type(np.float32) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False - ), 1e-5 - self.trt_param.precision = paddle_infer.PrecisionType.Half - program_config.set_input_type(np.float16) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False - ), (1e-3, 1e-3) - - # for dynamic_shape - generate_dynamic_shape(attrs) - self.trt_param.precision = paddle_infer.PrecisionType.Float32 - program_config.set_input_type(np.float32) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True - ), 1e-5 - self.trt_param.precision = paddle_infer.PrecisionType.Half - program_config.set_input_type(np.float16) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True - ), (1e-3, 1e-3) - - def test(self): - self.run_test() - - -class TrtConvertFlattenTest_dim_5(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: - return True - - def sample_program_configs(self): - def generate_input(batch): - return np.random.random([batch, 8, 8, 8]).astype(np.float32) - - for batch in [1, 4]: - for axis in [0, 1, 2, 3, 4]: - for type in ["flatten", "flatten2"]: - if type == "flatten": - op_outputs = {"Out": ["output_data"]} - else: - op_outputs = { - "Out": ["output_data"], - "XShape": ["xshape_data"], - } - dics = [{"axis": axis}] - ops_config = [ - { - "op_type": "flatten", - "op_inputs": {"X": ["input_data"]}, - "op_outputs": op_outputs, - "op_attrs": dics[0], - } - ] - ops = self.generate_op_config(ops_config) - - program_config = ProgramConfig( - ops=ops, - weights={}, - inputs={ - "input_data": TensorConfig( - data_gen=partial(generate_input, batch) - ) - }, - outputs=["output_data"], - ) - - yield program_config - - def sample_predictor_configs( - self, program_config - ) -> (paddle_infer.Config, List[int], float): - def generate_dynamic_shape(attrs): - self.dynamic_shape.min_input_shape = {"input_data": [1, 4, 4, 4]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 16, 16, 8]} - self.dynamic_shape.opt_input_shape = {"input_data": [2, 16, 16, 8]} - - def clear_dynamic_shape(): - self.dynamic_shape.max_input_shape = {} - self.dynamic_shape.min_input_shape = {} - self.dynamic_shape.opt_input_shape = {} - - def generate_trt_nodes_num(attrs, dynamic_shape): - ver = paddle_infer.get_trt_compile_version() - if ver[0] * 1000 + ver[1] * 100 + ver[0] * 10 >= 7130: - if attrs[0]['axis'] == 1: - return 1, 2 - else: - return 0, 3 - else: - if dynamic_shape: - return 0, 3 - - if attrs[0]['axis'] == 1: - return 1, 2 - else: - return 0, 3 - - attrs = [ - program_config.ops[i].attrs for i in range(len(program_config.ops)) - ] - - # for static_shape - clear_dynamic_shape() - self.trt_param.precision = paddle_infer.PrecisionType.Float32 - program_config.set_input_type(np.float32) - yield 
self.create_inference_config(), generate_trt_nodes_num( - attrs, False - ), 1e-5 - self.trt_param.precision = paddle_infer.PrecisionType.Half - program_config.set_input_type(np.float16) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False - ), (1e-3, 1e-3) - - # for dynamic_shape - generate_dynamic_shape(attrs) - self.trt_param.precision = paddle_infer.PrecisionType.Float32 - program_config.set_input_type(np.float32) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True - ), 1e-5 - self.trt_param.precision = paddle_infer.PrecisionType.Half - program_config.set_input_type(np.float16) - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True - ), (1e-3, 1e-3) - - def test(self): - self.run_test() - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_flatten_op.py b/test/legacy_test/test_flatten_op.py deleted file mode 100644 index f59c6a91028d1..0000000000000 --- a/test/legacy_test/test_flatten_op.py +++ /dev/null @@ -1,97 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest, paddle_static_guard - -import paddle - - -class TestFlattenOp(OpTest): - def setUp(self): - self.op_type = "flatten" - self.init_test_case() - self.inputs = {"X": np.random.random(self.in_shape).astype("float64")} - self.init_attrs() - self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X"], "Out") - - def init_test_case(self): - self.in_shape = (3, 2, 2, 10) - self.axis = 1 - self.new_shape = (3, 40) - - def init_attrs(self): - self.attrs = {"axis": self.axis} - - -class TestFlattenOp1(TestFlattenOp): - def init_test_case(self): - self.in_shape = (3, 2, 2, 10) - self.axis = 0 - self.new_shape = (1, 120) - - -class TestFlattenOpWithDefaultAxis(TestFlattenOp): - def init_test_case(self): - self.in_shape = (10, 2, 2, 3) - self.new_shape = (10, 12) - - def init_attrs(self): - self.attrs = {} - - -class TestFlattenOpSixDims(TestFlattenOp): - def init_test_case(self): - self.in_shape = (3, 2, 3, 2, 4, 4) - self.axis = 4 - self.new_shape = (36, 16) - - -class TestFlattenOpFP16(unittest.TestCase): - def test_fp16_with_gpu(self): - if paddle.base.core.is_compiled_with_cuda(): - with paddle_static_guard(): - place = paddle.CUDAPlace(0) - with paddle.static.program_guard( - paddle.static.Program(), paddle.static.Program() - ): - input = np.random.random([12, 14]).astype("float16") - x = paddle.static.data( - name="x", shape=[12, 14], dtype="float16" - ) - - y = paddle.flatten(x) - - exe = paddle.static.Executor(place) - res = exe.run( - paddle.static.default_main_program(), - feed={ - "x": input, - }, - fetch_list=[y], - ) - - np.testing.assert_array_equal(res[0].shape, [12 * 14]) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_unsqueeze_op.py 
b/test/legacy_test/test_unsqueeze_op.py deleted file mode 100755 index 39aec97e23ecd..0000000000000 --- a/test/legacy_test/test_unsqueeze_op.py +++ /dev/null @@ -1,423 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import gradient_checker -import numpy as np -from decorator_helper import prog_scope -from op_test import OpTest, convert_float_to_uint16 - -import paddle -from paddle import base -from paddle.base import core - -paddle.enable_static() - - -# Correct: General. -class TestUnsqueezeOp(OpTest): - def setUp(self): - self.init_test_case() - self.op_type = "unsqueeze" - self.inputs = {"X": np.random.random(self.ori_shape).astype("float64")} - self.init_attrs() - self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X"], "Out") - - def init_test_case(self): - self.ori_shape = (3, 40) - self.axes = (1, 2) - self.new_shape = (3, 1, 1, 40) - - def init_attrs(self): - self.attrs = {"axes": self.axes} - - -class TestUnsqueezeFP16Op(OpTest): - def setUp(self): - self.init_test_case() - self.op_type = "unsqueeze" - self.inputs = {"X": np.random.random(self.ori_shape).astype("float16")} - self.init_attrs() - self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X"], "Out") - - def init_test_case(self): - self.ori_shape = (3, 40) - self.axes = (1, 2) - self.new_shape = (3, 1, 1, 40) - - def init_attrs(self): - self.attrs = {"axes": self.axes} - - -class TestUnsqueezeBF16Op(OpTest): - def setUp(self): - self.init_test_case() - self.op_type = "unsqueeze" - self.dtype = np.uint16 - x = np.random.random(self.ori_shape).astype("float32") - out = x.reshape(self.new_shape) - self.inputs = {"X": convert_float_to_uint16(x)} - self.init_attrs() - self.outputs = {"Out": convert_float_to_uint16(out)} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X"], "Out") - - def init_test_case(self): - self.ori_shape = (3, 40) - self.axes = (1, 2) - self.new_shape = (3, 1, 1, 40) - - def init_attrs(self): - self.attrs = {"axes": self.axes} - - -# Correct: Single input index. -class TestUnsqueezeOp1(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = (20, 5) - self.axes = (-1,) - self.new_shape = (20, 5, 1) - - -# Correct: Mixed input axis. -class TestUnsqueezeOp2(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = (20, 5) - self.axes = (0, -1) - self.new_shape = (1, 20, 5, 1) - - -# Correct: There is duplicated axis. -class TestUnsqueezeOp3(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = (10, 2, 5) - self.axes = (0, 3, 3) - self.new_shape = (1, 10, 2, 1, 1, 5) - - -# Correct: Reversed axes. 
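# (A hedged walk-through of the sequential-insertion rule these cases rely on:
#  axes are applied one at a time against the partially expanded shape, so for
#  x.shape == (10, 2, 5) the duplicated axes (0, 3, 3) give
#  (10, 2, 5) -> (1, 10, 2, 5) -> (1, 10, 2, 1, 5) -> (1, 10, 2, 1, 1, 5),
#  and the reversed axes (3, 1, 1) below give (10, 1, 1, 2, 5, 1).)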
-class TestUnsqueezeOp4(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = (10, 2, 5) - self.axes = (3, 1, 1) - self.new_shape = (10, 1, 1, 2, 5, 1) - - -# axis is empty, x is 0D -class TestUnsqueezeOp5(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = () - self.axes = () - self.new_shape = () - - -# axis is empty, x is ND -class TestUnsqueezeOp6(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = (10, 2, 5) - self.axes = () - self.new_shape = (10, 2, 5) - - -class TestUnsqueezeOp_ZeroDim1(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = () - self.axes = (-1,) - self.new_shape = 1 - - -class TestUnsqueezeOp_ZeroDim2(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = () - self.axes = (-1, 1) - self.new_shape = (1, 1) - - -class TestUnsqueezeOp_ZeroDim3(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = () - self.axes = (0, 1, 2) - self.new_shape = (1, 1, 1) - - -class API_TestUnsqueeze(unittest.TestCase): - def test_out(self): - paddle.enable_static() - with paddle.static.program_guard( - paddle.static.Program(), paddle.static.Program() - ): - data1 = paddle.static.data('data1', shape=[-1, 10], dtype='float64') - result_squeeze = paddle.unsqueeze(data1, axis=[1]) - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - input1 = np.random.random([5, 1, 10]).astype('float64') - input = np.squeeze(input1, axis=1) - (result,) = exe.run( - feed={"data1": input}, fetch_list=[result_squeeze] - ) - np.testing.assert_allclose(input1, result, rtol=1e-05) - - -class TestUnsqueezeOpError(unittest.TestCase): - def test_errors(self): - paddle.enable_static() - with paddle.static.program_guard( - paddle.static.Program(), paddle.static.Program() - ): - # The type of axis in unsqueeze should be int or Variable.
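# (e.g. paddle.unsqueeze(x, axis=3.2) is expected to raise TypeError, which is
#  exactly what the check below asserts; float axes are rejected.)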
- def test_axes_type(): - x6 = paddle.static.data( - shape=[-1, 10], dtype='float16', name='x3' - ) - paddle.unsqueeze(x6, axis=3.2) - - self.assertRaises(TypeError, test_axes_type) - - -class API_TestUnsqueeze2(unittest.TestCase): - def test_out(self): - paddle.enable_static() - with paddle.static.program_guard( - paddle.static.Program(), paddle.static.Program() - ): - data1 = paddle.static.data('data1', shape=[-1, 10], dtype='float64') - data2 = paddle.static.data('data2', shape=[1], dtype='int32') - result_squeeze = paddle.unsqueeze(data1, axis=data2) - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - input1 = np.random.random([5, 1, 10]).astype('float64') - input2 = np.array([1]).astype('int32') - input = np.squeeze(input1, axis=1) - (result1,) = exe.run( - feed={"data1": input, "data2": input2}, - fetch_list=[result_squeeze], - ) - np.testing.assert_allclose(input1, result1, rtol=1e-05) - - -class API_TestUnsqueeze3(unittest.TestCase): - def test_out(self): - paddle.enable_static() - with paddle.static.program_guard( - paddle.static.Program(), paddle.static.Program() - ): - data1 = paddle.static.data('data1', shape=[-1, 10], dtype='float64') - data2 = paddle.static.data('data2', shape=[1], dtype='int32') - result_squeeze = paddle.unsqueeze(data1, axis=[data2, 3]) - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - input1 = np.random.random([5, 1, 10, 1]).astype('float64') - input2 = np.array([1]).astype('int32') - input = np.squeeze(input1) - (result1,) = exe.run( - feed={"data1": input, "data2": input2}, - fetch_list=[result_squeeze], - ) - np.testing.assert_array_equal(input1, result1) - self.assertEqual(input1.shape, result1.shape) - - -class API_TestDyUnsqueeze(unittest.TestCase): - def test_out(self): - paddle.disable_static() - input_1 = np.random.random([5, 1, 10]).astype("int32") - input1 = np.expand_dims(input_1, axis=1) - input = paddle.to_tensor(input_1) - output = paddle.unsqueeze(input, axis=[1]) - out_np = output.numpy() - np.testing.assert_array_equal(input1, out_np) - self.assertEqual(input1.shape, out_np.shape) - - -class API_TestDyUnsqueeze2(unittest.TestCase): - def test_out(self): - paddle.disable_static() - input1 = np.random.random([5, 10]).astype("int32") - out1 = np.expand_dims(input1, axis=1) - input = paddle.to_tensor(input1) - output = paddle.unsqueeze(input, axis=1) - out_np = output.numpy() - np.testing.assert_array_equal(out1, out_np) - self.assertEqual(out1.shape, out_np.shape) - - -class API_TestDyUnsqueezeAxisTensor(unittest.TestCase): - def test_out(self): - paddle.disable_static() - input1 = np.random.random([5, 10]).astype("int32") - out1 = np.expand_dims(input1, axis=1) - out1 = np.expand_dims(out1, axis=2) - input = paddle.to_tensor(input1) - output = paddle.unsqueeze(input, axis=paddle.to_tensor([1, 2])) - out_np = output.numpy() - np.testing.assert_array_equal(out1, out_np) - self.assertEqual(out1.shape, out_np.shape) - - -class API_TestDyUnsqueezeAxisTensorList(unittest.TestCase): - def test_out(self): - paddle.disable_static() - input1 = np.random.random([5, 10]).astype("int32") - # Actually, expand_dims supports tuple since version 1.18.0 - out1 = np.expand_dims(input1, axis=1) - out1 = np.expand_dims(out1, axis=2) - input = paddle.to_tensor(input1) - output = paddle.unsqueeze( - paddle.to_tensor(input1), - axis=[paddle.to_tensor([1]), paddle.to_tensor([2])], - ) - out_np = output.numpy() - np.testing.assert_array_equal(out1, out_np) - self.assertEqual(out1.shape, out_np.shape) - - -class 
API_TestDygraphUnSqueeze(unittest.TestCase): - def setUp(self): - self.executed_api() - - def executed_api(self): - self.unsqueeze = paddle.unsqueeze - - def test_out(self): - paddle.disable_static() - input_1 = np.random.random([5, 1, 10]).astype("int32") - input = paddle.to_tensor(input_1) - output = self.unsqueeze(input, axis=[1]) - out_np = output.numpy() - expected_out = np.expand_dims(input_1, axis=1) - np.testing.assert_allclose(expected_out, out_np, rtol=1e-05) - - def test_out_int8(self): - paddle.disable_static() - input_1 = np.random.random([5, 1, 10]).astype("int8") - input = paddle.to_tensor(input_1) - output = self.unsqueeze(input, axis=[1]) - out_np = output.numpy() - expected_out = np.expand_dims(input_1, axis=1) - np.testing.assert_allclose(expected_out, out_np, rtol=1e-05) - - def test_out_uint8(self): - paddle.disable_static() - input_1 = np.random.random([5, 1, 10]).astype("uint8") - input = paddle.to_tensor(input_1) - output = self.unsqueeze(input, axis=1) - out_np = output.numpy() - expected_out = np.expand_dims(input_1, axis=1) - np.testing.assert_allclose(expected_out, out_np, rtol=1e-05) - - def test_axis_not_list(self): - paddle.disable_static() - input_1 = np.random.random([5, 1, 10]).astype("int32") - input = paddle.to_tensor(input_1) - output = self.unsqueeze(input, axis=1) - out_np = output.numpy() - expected_out = np.expand_dims(input_1, axis=1) - np.testing.assert_allclose(expected_out, out_np, rtol=1e-05) - - def test_dimension_not_1(self): - paddle.disable_static() - input_1 = np.random.random([5, 1, 10]).astype("int32") - input = paddle.to_tensor(input_1) - output = self.unsqueeze(input, axis=(1, 2)) - out_np = output.numpy() - expected_out = np.expand_dims(input_1, axis=(1, 2)) - np.testing.assert_allclose(expected_out, out_np, rtol=1e-05) - - -class API_TestDygraphUnSqueezeInplace(API_TestDygraphUnSqueeze): - def executed_api(self): - self.unsqueeze = paddle.unsqueeze_ - - -class TestUnsqueezeDoubleGradCheck(unittest.TestCase): - def unsqueeze_wrapper(self, x): - return paddle.unsqueeze(x[0], [0, 2]) - - @prog_scope() - def func(self, place): - # the shape of input variable should be clearly specified, not including -1. - eps = 0.005 - dtype = np.float32 - - data = paddle.static.data('data', [2, 3, 4], dtype) - data.persistable = True - out = paddle.unsqueeze(data, [0, 2]) - data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) - - gradient_checker.double_grad_check( - [data], out, x_init=[data_arr], place=place, eps=eps - ) - gradient_checker.double_grad_check_for_dygraph( - self.unsqueeze_wrapper, [data], out, x_init=[data_arr], place=place - ) - - def test_grad(self): - paddle.enable_static() - places = [base.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(base.CUDAPlace(0)) - for p in places: - self.func(p) - - -class TestUnsqueezeTripleGradCheck(unittest.TestCase): - def unsqueeze_wrapper(self, x): - return paddle.unsqueeze(x[0], [0, 2]) - - @prog_scope() - def func(self, place): - # the shape of input variable should be clearly specified, not including -1.
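# (Presumably because gradient_checker perturbs every element numerically, it
#  needs a fully concrete static shape to size its finite-difference buffers.)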
- eps = 0.005 - dtype = np.float32 - - data = paddle.static.data('data', [2, 3, 4], dtype) - data.persistable = True - out = paddle.unsqueeze(data, [0, 2]) - data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) - - gradient_checker.triple_grad_check( - [data], out, x_init=[data_arr], place=place, eps=eps - ) - gradient_checker.triple_grad_check_for_dygraph( - self.unsqueeze_wrapper, [data], out, x_init=[data_arr], place=place - ) - - def test_grad(self): - paddle.enable_static() - places = [base.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(base.CUDAPlace(0)) - for p in places: - self.func(p) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/xpu/test_flatten_op_xpu.py b/test/xpu/test_flatten_op_xpu.py deleted file mode 100644 index 7673ec9ba3d6d..0000000000000 --- a/test/xpu/test_flatten_op_xpu.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from get_test_cover_info import ( - XPUOpTestWrapper, - create_test_class, - get_xpu_op_support_types, -) -from op_test_xpu import XPUOpTest - -import paddle - -paddle.enable_static() - - -class XPUTestFlattenOp(XPUOpTestWrapper): - def __init__(self): - self.op_name = 'flatten' - self.use_dynamic_create_class = False - - class TestFlattenOp(XPUOpTest): - def setUp(self): - self.op_type = "flatten" - self.use_xpu = True - self.place = paddle.XPUPlace(0) - self.init_test_case() - self.dtype = self.in_type - self.inputs = { - "X": np.random.random(self.in_shape).astype(self.dtype) - } - self.init_attrs() - self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} - - def test_check_output(self): - self.check_output_with_place(self.place) - - def test_check_grad(self): - self.check_grad_with_place(self.place, ["X"], "Out") - - def init_test_case(self): - self.in_shape = (3, 2, 2, 10) - self.axis = 1 - self.new_shape = (3, 40) - - def init_attrs(self): - self.attrs = {"axis": self.axis} - - class TestFlattenOp1(TestFlattenOp): - def init_test_case(self): - self.in_shape = (3, 2, 2, 10) - self.axis = 0 - self.new_shape = (1, 120) - - class TestFlattenOpWithDefaultAxis(TestFlattenOp): - def init_test_case(self): - self.in_shape = (10, 2, 2, 3) - self.new_shape = (10, 12) - - def init_attrs(self): - self.attrs = {} - - class TestFlattenOpSixDims(TestFlattenOp): - def init_test_case(self): - self.in_shape = (3, 2, 3, 2, 4, 4) - self.axis = 4 - self.new_shape = (36, 16) - - -support_types = get_xpu_op_support_types('flatten') -for stype in support_types: - create_test_class(globals(), XPUTestFlattenOp, stype) - -if __name__ == "__main__": - unittest.main() diff --git a/test/xpu/test_unsqueeze_op_xpu.py b/test/xpu/test_unsqueeze_op_xpu.py deleted file mode 100644 index 333633031bdfd..0000000000000 --- a/test/xpu/test_unsqueeze_op_xpu.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from get_test_cover_info import ( - XPUOpTestWrapper, - create_test_class, - get_xpu_op_support_types, -) -from op_test_xpu import XPUOpTest - -import paddle - -paddle.enable_static() - - -# Correct: General. -class XPUTestUnsqueezeOp(XPUOpTestWrapper): - def __init__(self): - self.op_name = "unsqueeze" - self.use_dynamic_create_class = False - - class TestUnsqueezeOp(XPUOpTest): - def setUp(self): - self.op_type = "unsqueeze" - self.__class__.op_type = "unsqueeze" - self.use_mkldnn = False - self.init_test_case() - self.inputs = { - "X": np.random.random(self.ori_shape).astype(self.dtype) - } - self.init_attrs() - self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} - - def init_dtype(self): - self.dtype = self.in_type - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - place = paddle.XPUPlace(0) - if self.dtype == np.bool_: - return - else: - self.check_grad_with_place(place, ['X'], 'Out') - - def init_test_case(self): - self.ori_shape = (3, 40) - self.axes = (1, 2) - self.new_shape = (3, 1, 1, 40) - - def init_attrs(self): - self.attrs = {"axes": self.axes} - - # Correct: Single input index. - class TestUnsqueezeOp1(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = (20, 5) - self.axes = (-1,) - self.new_shape = (20, 5, 1) - - # Correct: Mixed input axis. - class TestUnsqueezeOp2(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = (20, 5) - self.axes = (0, -1) - self.new_shape = (1, 20, 5, 1) - - # Correct: There is duplicated axis. - class TestUnsqueezeOp3(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = (10, 2, 5) - self.axes = (0, 3, 3) - self.new_shape = (1, 10, 2, 1, 1, 5) - - # Correct: Reversed axes. 
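# (These nested cases are not collected directly; get_xpu_op_support_types and
#  create_test_class at the bottom of the file stamp out one concrete TestCase
#  per dtype that the XPU kernel registers.)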
- class TestUnsqueezeOp4(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = (10, 2, 5) - self.axes = (3, 1, 1) - self.new_shape = (10, 1, 1, 2, 5, 1) - - -support_types = get_xpu_op_support_types("unsqueeze") -for stype in support_types: - create_test_class(globals(), XPUTestUnsqueezeOp, stype) - -if __name__ == "__main__": - unittest.main() diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index c22938e27d150..e7efc6cb9f1e5 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -1118,7 +1118,6 @@ 'test_prelu_mkldnn_op', 'test_box_coder_op', 'test_atan2_op', - 'test_unsqueeze_op', 'test_profiler', 'test_affine_channel_op', 'test_leaky_relu_grad_grad_functor', @@ -1592,7 +1591,6 @@ 'test_trt_conv_quant_dequant_pass', 'test_trt_convert_elementwise', 'test_trt_convert_depthwise_conv2d_transpose', - 'test_trt_convert_flatten', 'test_trt_matmul_quant_dequant', 'test_trt_convert_dropout', 'test_trt_convert_conv2d_transpose', @@ -2456,7 +2454,6 @@ 'test_yolov3_loss_op', 'test_where_index', 'test_variance_layer', - 'test_unsqueeze_op', 'test_translated_layer', 'test_tensor_shape', 'test_slice', diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index e7c05f2768a83..f570fca753e58 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -145,7 +145,6 @@ disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\ ^test_tensor_scalar_type_promotion_dynamic$|\ ^test_model$|\ ^test_py_reader_combination$|\ -^test_trt_convert_flatten$|\ ^test_py_reader_push_pop$|\ ^test_parallel_executor_feed_persistable_var$|\ ^test_parallel_executor_inference_feed_partial_data$|\ From 3002433c2646b77ff084c8f0e988c13e91d51995 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Mon, 9 Oct 2023 12:41:12 +0800 Subject: [PATCH 24/62] [CleanOps]Delete squeeze op (#57884) * del squeeze op --- paddle/fluid/operators/squeeze_op.cc | 269 --------------------- paddle/fluid/operators/squeeze_op.cu.cc | 45 ---- paddle/fluid/operators/squeeze_op.h | 93 ++++--- test/legacy_test/test_squeeze_op.py | 307 ------------------------ test/xpu/test_squeeze_op_xpu.py | 126 ---------- tools/parallel_UT_rule.py | 2 - 6 files changed, 59 insertions(+), 783 deletions(-) delete mode 100644 paddle/fluid/operators/squeeze_op.cc delete mode 100644 paddle/fluid/operators/squeeze_op.cu.cc delete mode 100755 test/legacy_test/test_squeeze_op.py delete mode 100644 test/xpu/test_squeeze_op_xpu.py diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc deleted file mode 100644 index 115901d3ee2ee..0000000000000 --- a/paddle/fluid/operators/squeeze_op.cc +++ /dev/null @@ -1,269 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/squeeze_op.h" - -#include -#include -#include -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace operators { - -framework::DDim GetOutputShape(const std::vector squeeze_dims, - const framework::DDim &in_dims, - bool is_runtime) { - size_t num_squeeze_dims = squeeze_dims.size(); - std::vector should_squeeze(in_dims.size(), false); - - // Mark dimensions need to be squeezed. - if (num_squeeze_dims == 0) { - for (int i = 0; i < in_dims.size(); ++i) { - if (in_dims[i] == 1) { - should_squeeze[i] = true; - } - } - } else { - for (size_t i = 0; i < num_squeeze_dims; ++i) { - int current = squeeze_dims[i] < 0 ? squeeze_dims[i] + in_dims.size() - : squeeze_dims[i]; - - PADDLE_ENFORCE_GE( - current, - 0, - platform::errors::InvalidArgument( - "Each axis in Attr(axes) should be in the range of [%d, %d]" - "But current axis is:%d, input tensor's shape = [%s].", - -in_dims.size(), - in_dims.size() - 1, - current, - in_dims)); - PADDLE_ENFORCE_LT( - current, - in_dims.size(), - platform::errors::InvalidArgument( - "Each axis in Attr(axes) should be in the range of [%d, %d]" - "But current axis is:%d, input tensor's shape = [%s].", - -in_dims.size(), - in_dims.size() - 1, - current, - in_dims)); - - if (!should_squeeze[current]) { - if (is_runtime) { - // At run time, dim of 1 is allowed to squeeze - if (in_dims[current] == 1) { - should_squeeze[current] = true; - } - } else { - // At compile time, dim of -1 or 1 is allowed to squeeze - if (in_dims[current] == 1 || in_dims[current] == -1) { - should_squeeze[current] = true; - } - } - } - } - } - // Make output dimensions - std::vector output_shape; - for (int i = 0; i < in_dims.size(); ++i) { - if (!should_squeeze[i]) { - output_shape.push_back(in_dims[i]); - } - } - return phi::make_ddim(output_shape); -} - -class SqueezeOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Squeeze"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Squeeze"); - - const auto &x_dims = ctx->GetInputDim("X"); - // Check input tensor dims (<6) Eigen limit. - PADDLE_ENFORCE_LE(x_dims.size(), - 6, - platform::errors::InvalidArgument( - "The dimensions of Input(X) " - "should be in the range of [1, 6] (Eigen limit)." - "But received X's dimensions = %d, X's shape=[%s].", - x_dims.size(), - x_dims)); - - const auto &axes = ctx->Attrs().Get>("axes"); - auto out_dims = GetOutputShape(axes, x_dims, false); - ctx->SetOutputDim("Out", out_dims); - if (x_dims[0] == out_dims[0]) { - // Only pass LoD when the first dimension of output and Input(X) - // are the same. 
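// Illustration (a sketch of the condition above): for X.shape = (1, 3, 1, 5),
// axes = {2} keeps dim 0 == 1, so LoD is shared; axes = {0} changes dim 0
// from 1 to 3 and LoD is not propagated.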
- ctx->ShareLoD("X", "Out"); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - auto input_data_type = - framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return phi::KernelKey(input_data_type, ctx.GetPlace()); - } -}; - -class SqueezeGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *context) const override { - context->SetOutputDim(framework::GradVarName("X"), - context->GetInputDim("X")); - context->ShareLoD("X", framework::GradVarName("X")); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); - return phi::KernelKey(input_data_type, ctx.GetPlace()); - } -}; - -class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor). The input tensor of squeeze operator."); - AddOutput("Out", "(Tensor). The output tensor of squeeze operator."); - AddAttr>("axes", - "(std::vector). List of integers," - " indicating the dimensions to squeeze.") - .SetDefault({}) - .SupportTensor(); - AddAttr("use_mkldnn", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false) - .AsExtra(); - AddAttr( - "mkldnn_data_type", - "(string, default \"float32\"). Data type of mkldnn kernel") - .SetDefault("float32") - .InEnum({"float32", "bfloat16"}) - .AsExtra(); - AddComment(R"DOC( - Squeeze Operator. - - Remove single-dimensional entries from the shape of a tensor. - Takes a parameter axes with a list of axes to squeeze. - If axes is not provided, all the single dimensions will be removed from the shape. - If an axis is selected with shape entry not equal to one, an error is raised. 
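A negative value in axes counts from the last dimension, so for a rank-4 input axes = [-4] is equivalent to axes = [0].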
- - Examples: - Case 1: - Given - X.shape = (1, 3, 1, 5) - and - axes = [0] - we get: - Out.shape = (3, 1, 5) - - Case 2: - Given - X.shape = (1, 3, 1, 5) - and - axes = [] - we get: - Out.shape = (3, 5) - )DOC"); - } -}; - -template -class SqueezeGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("squeeze_grad"); - grad_op->SetInput("X", this->Input("X")); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -template -class SqueezeDoubleGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("squeeze"); - grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); - grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -DECLARE_INPLACE_OP_INFERER(SqueezeInplaceInferer, {"X", "Out"}); -DECLARE_INPLACE_OP_INFERER(SqueezeGradInplaceInferer, - {framework::GradVarName("Out"), - framework::GradVarName("X")}); -DECLARE_NO_NEED_BUFFER_VARS_INFERER(SqueezeGradNoNeedBufferVarsInferer, "X"); -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(squeeze, - ops::SqueezeOp, - ops::SqueezeOpMaker, - ops::SqueezeGradOpMaker, - ops::SqueezeGradOpMaker); -REGISTER_OPERATOR(squeeze_grad, - ops::SqueezeGradOp, - ops::SqueezeDoubleGradOpMaker, - ops::SqueezeDoubleGradOpMaker, - ops::SqueezeGradNoNeedBufferVarsInferer); - -REGISTER_OP_CPU_KERNEL( - squeeze, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel>, - ops::SqueezeKernel>, - ops::SqueezeKernel); -REGISTER_OP_CPU_KERNEL( - squeeze_grad, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel>, - ops::SqueezeGradKernel>, - ops::SqueezeGradKernel); diff --git a/paddle/fluid/operators/squeeze_op.cu.cc b/paddle/fluid/operators/squeeze_op.cu.cc deleted file mode 100644 index a77b369c40373..0000000000000 --- a/paddle/fluid/operators/squeeze_op.cu.cc +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/squeeze_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - squeeze, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel>, - ops::SqueezeKernel>); -REGISTER_OP_CUDA_KERNEL( - squeeze_grad, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel>, - ops::SqueezeGradKernel>); diff --git a/paddle/fluid/operators/squeeze_op.h b/paddle/fluid/operators/squeeze_op.h index 0c5b5dfd4c8b0..6f0da1d42e546 100644 --- a/paddle/fluid/operators/squeeze_op.h +++ b/paddle/fluid/operators/squeeze_op.h @@ -26,42 +26,67 @@ namespace operators { framework::DDim GetOutputShape(const std::vector squeeze_dims, const framework::DDim &in_dims, - bool is_runtime); - -template -class SqueezeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *in = context.Input("X"); - auto *out = context.Output("Out"); - - auto &axes = context.Attr>("axes"); - auto x_dims = in->dims(); - auto out_dims = GetOutputShape(axes, x_dims, true); - - out->mutable_data(context.GetPlace(), in->type()); - framework::TensorCopy( - *in, - context.GetPlace(), - context.template device_context(), - out); - out->Resize(out_dims); + bool is_runtime) { + size_t num_squeeze_dims = squeeze_dims.size(); + std::vector should_squeeze(in_dims.size(), false); + + // Mark dimensions need to be squeezed. + if (num_squeeze_dims == 0) { + for (int i = 0; i < in_dims.size(); ++i) { + if (in_dims[i] == 1) { + should_squeeze[i] = true; + } + } + } else { + for (size_t i = 0; i < num_squeeze_dims; ++i) { + int current = squeeze_dims[i] < 0 ? 
squeeze_dims[i] + in_dims.size() + : squeeze_dims[i]; + + PADDLE_ENFORCE_GE( + current, + 0, + platform::errors::InvalidArgument( + "Each axis in Attr(axes) should be in the range of [%d, %d]" + "But current axis is:%d, input tensor's shape = [%s].", + -in_dims.size(), + in_dims.size() - 1, + current, + in_dims)); + PADDLE_ENFORCE_LT( + current, + in_dims.size(), + platform::errors::InvalidArgument( + "Each axis in Attr(axes) should be in the range of [%d, %d]" + "But current axis is:%d, input tensor's shape = [%s].", + -in_dims.size(), + in_dims.size() - 1, + current, + in_dims)); + + if (!should_squeeze[current]) { + if (is_runtime) { + // At run time, dim of 1 is allowed to squeeze + if (in_dims[current] == 1) { + should_squeeze[current] = true; + } + } else { + // At compile time, dim of -1 or 1 is allowed to squeeze + if (in_dims[current] == 1 || in_dims[current] == -1) { + should_squeeze[current] = true; + } + } + } + } } -}; - -template -class SqueezeGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *d_out = ctx.Input(framework::GradVarName("Out")); - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto in_dims = ctx.Input("X")->dims(); - - d_x->mutable_data(ctx.GetPlace(), d_out->type()); - framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x); - d_x->Resize(in_dims); + // Make output dimensions + std::vector output_shape; + for (int i = 0; i < in_dims.size(); ++i) { + if (!should_squeeze[i]) { + output_shape.push_back(in_dims[i]); + } } -}; + return phi::make_ddim(output_shape); +} template class Squeeze2Kernel : public framework::OpKernel { diff --git a/test/legacy_test/test_squeeze_op.py b/test/legacy_test/test_squeeze_op.py deleted file mode 100755 index 294a86db6dd04..0000000000000 --- a/test/legacy_test/test_squeeze_op.py +++ /dev/null @@ -1,307 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import gradient_checker -import numpy as np -from decorator_helper import prog_scope -from op_test import OpTest, convert_float_to_uint16 - -import paddle -from paddle import base -from paddle.base import Program, core, program_guard - -paddle.enable_static() - - -# Correct: General. 
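# (NumPy mirrors the explicit-axes behavior checked here, e.g.
#  np.squeeze(np.ones((1, 3, 1, 40)), axis=(0, 2)).shape == (3, 40).)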
-class TestSqueezeOp(OpTest): - def setUp(self): - self.op_type = "squeeze" - self.init_test_case() - self.inputs = {"X": np.random.random(self.ori_shape).astype("float64")} - self.init_attrs() - self.outputs = { - "Out": self.inputs["X"].reshape(self.new_shape), - } - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X"], "Out") - - def init_test_case(self): - self.ori_shape = (1, 3, 1, 40) - self.axes = (0, 2) - self.new_shape = (3, 40) - - def init_attrs(self): - self.attrs = {"axes": self.axes} - - -class TestSqueezeFP16Op(OpTest): - def setUp(self): - self.op_type = "squeeze" - self.init_test_case() - self.inputs = {"X": np.random.random(self.ori_shape).astype("float16")} - self.init_attrs() - self.outputs = { - "Out": self.inputs["X"].reshape(self.new_shape), - } - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X"], "Out") - - def init_test_case(self): - self.ori_shape = (1, 3, 1, 40) - self.axes = (0, 2) - self.new_shape = (3, 40) - - def init_attrs(self): - self.attrs = {"axes": self.axes} - - -class TestSqueezeBF16Op(OpTest): - def setUp(self): - self.op_type = "squeeze" - self.dtype = np.uint16 - self.init_test_case() - x = np.random.random(self.ori_shape).astype("float32") - out = x.reshape(self.new_shape) - self.inputs = {"X": convert_float_to_uint16(x)} - self.init_attrs() - self.outputs = {"Out": convert_float_to_uint16(out)} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X"], "Out") - - def init_test_case(self): - self.ori_shape = (1, 3, 1, 40) - self.axes = (0, 2) - self.new_shape = (3, 40) - - def init_attrs(self): - self.attrs = {"axes": self.axes} - - -# Correct: There is a negative axis. -class TestSqueezeOp1(TestSqueezeOp): - def init_test_case(self): - self.ori_shape = (1, 3, 1, 40) - self.axes = (0, -2) - self.new_shape = (3, 40) - - -# Correct: No axes input. -class TestSqueezeOp2(TestSqueezeOp): - def init_test_case(self): - self.ori_shape = (1, 20, 1, 5) - self.axes = () - self.new_shape = (20, 5) - - -# Correct: Just part of the axes are squeezed. -class TestSqueezeOp3(TestSqueezeOp): - def init_test_case(self): - self.ori_shape = (6, 1, 5, 1, 4, 1) - self.axes = (1, -1) - self.new_shape = (6, 5, 1, 4) - - -# Correct: A dimension not of size 1 remains unchanged. -class TestSqueezeOp4(TestSqueezeOp): - def init_test_case(self): - self.ori_shape = (6, 1, 5, 1, 4, 1) - self.axes = (1, 2) - self.new_shape = (6, 5, 1, 4, 1) - - -class TestSqueezeOpError(unittest.TestCase): - def test_errors(self): - paddle.enable_static() - with program_guard(Program(), Program()): - # The input type of squeeze must be Variable. - x1 = base.create_lod_tensor( - np.array([[-1]]), [[1]], paddle.CPUPlace() - ) - self.assertRaises(TypeError, paddle.squeeze, x1) - # The input axes of squeeze must be list. - x2 = paddle.static.data(name='x2', shape=[4], dtype="int32") - self.assertRaises(TypeError, paddle.squeeze, x2, axes=0) - # The input dtype of squeeze does not support float16.
- x3 = paddle.static.data(name='x3', shape=[4], dtype="float16") - self.assertRaises(TypeError, paddle.squeeze, x3, axes=0) - - -class API_TestSqueeze(unittest.TestCase): - def setUp(self): - self.executed_api() - - def executed_api(self): - self.squeeze = paddle.squeeze - - def test_out(self): - paddle.enable_static() - with paddle.static.program_guard( - paddle.static.Program(), paddle.static.Program() - ): - data1 = paddle.static.data( - 'data1', shape=[-1, 1, 10], dtype='float64' - ) - result_squeeze = self.squeeze(data1, axis=[1]) - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - input1 = np.random.random([5, 1, 10]).astype('float64') - (result,) = exe.run( - feed={"data1": input1}, fetch_list=[result_squeeze] - ) - expected_result = np.squeeze(input1, axis=1) - np.testing.assert_allclose(expected_result, result, rtol=1e-05) - - -class API_TestStaticSqueeze_(API_TestSqueeze): - def executed_api(self): - self.squeeze = paddle.squeeze_ - - -class API_TestDygraphSqueeze(unittest.TestCase): - def setUp(self): - self.executed_api() - - def executed_api(self): - self.squeeze = paddle.squeeze - - def test_out(self): - paddle.disable_static() - input_1 = np.random.random([5, 1, 10]).astype("int32") - input = paddle.to_tensor(input_1) - output = self.squeeze(input, axis=[1]) - out_np = output.numpy() - expected_out = np.squeeze(input_1, axis=1) - np.testing.assert_allclose(expected_out, out_np, rtol=1e-05) - - def test_out_int8(self): - paddle.disable_static() - input_1 = np.random.random([5, 1, 10]).astype("int8") - input = paddle.to_tensor(input_1) - output = self.squeeze(input, axis=[1]) - out_np = output.numpy() - expected_out = np.squeeze(input_1, axis=1) - np.testing.assert_allclose(expected_out, out_np, rtol=1e-05) - - def test_out_uint8(self): - paddle.disable_static() - input_1 = np.random.random([5, 1, 10]).astype("uint8") - input = paddle.to_tensor(input_1) - output = self.squeeze(input, axis=[1]) - out_np = output.numpy() - expected_out = np.squeeze(input_1, axis=1) - np.testing.assert_allclose(expected_out, out_np, rtol=1e-05) - - def test_axis_not_list(self): - paddle.disable_static() - input_1 = np.random.random([5, 1, 10]).astype("int32") - input = paddle.to_tensor(input_1) - output = self.squeeze(input, axis=1) - out_np = output.numpy() - expected_out = np.squeeze(input_1, axis=1) - np.testing.assert_allclose(expected_out, out_np, rtol=1e-05) - - def test_dimension_not_1(self): - paddle.disable_static() - input_1 = np.random.random([5, 1, 10]).astype("int32") - input = paddle.to_tensor(input_1) - output = self.squeeze(input, axis=(1, 0)) - out_np = output.numpy() - expected_out = np.squeeze(input_1, axis=1) - np.testing.assert_allclose(expected_out, out_np, rtol=1e-05) - - -class API_TestDygraphSqueezeInplace(API_TestDygraphSqueeze): - def executed_api(self): - self.squeeze = paddle.squeeze_ - - -class TestSqueezeDoubleGradCheck(unittest.TestCase): - def squeeze_wrapper(self, x): - return paddle.squeeze(x[0]) - - @prog_scope() - def func(self, place): - # the shape of input variable should be clearly specified, not including -1.
- eps = 0.005 - dtype = np.float32 - - data = paddle.static.data('data', [2, 3], dtype) - data.persistable = True - out = paddle.squeeze(data) - data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) - - gradient_checker.double_grad_check( - [data], out, x_init=[data_arr], place=place, eps=eps - ) - gradient_checker.double_grad_check_for_dygraph( - self.squeeze_wrapper, [data], out, x_init=[data_arr], place=place - ) - - def test_grad(self): - paddle.enable_static() - places = [base.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(base.CUDAPlace(0)) - for p in places: - self.func(p) - - -class TestSqueezeTripleGradCheck(unittest.TestCase): - def squeeze_wrapper(self, x): - return paddle.squeeze(x[0]) - - @prog_scope() - def func(self, place): - # the shape of input variable should be clearly specified, not including -1. - eps = 0.005 - dtype = np.float32 - - data = paddle.static.data('data', [2, 3], dtype) - data.persistable = True - out = paddle.squeeze(data) - data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) - - gradient_checker.triple_grad_check( - [data], out, x_init=[data_arr], place=place, eps=eps - ) - gradient_checker.triple_grad_check_for_dygraph( - self.squeeze_wrapper, [data], out, x_init=[data_arr], place=place - ) - - def test_grad(self): - paddle.enable_static() - places = [base.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(base.CUDAPlace(0)) - for p in places: - self.func(p) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/xpu/test_squeeze_op_xpu.py b/test/xpu/test_squeeze_op_xpu.py deleted file mode 100644 index c5b9efce7a770..0000000000000 --- a/test/xpu/test_squeeze_op_xpu.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from get_test_cover_info import ( - XPUOpTestWrapper, - create_test_class, - get_xpu_op_support_types, -) -from op_test_xpu import XPUOpTest - -import paddle -from paddle import base -from paddle.base import Program, program_guard - -paddle.enable_static() - - -class XPUTestSqueezeOp(XPUOpTestWrapper): - def __init__(self): - self.op_name = "squeeze" - self.use_dynamic_create_class = False - - # Correct: General.
-    class TestSqueezeOp(XPUOpTest):
-        def setUp(self):
-            self.op_type = "squeeze"
-            self.__class__.op_type = "squeeze"
-            self.use_mkldnn = False
-            self.init_dtype()
-            self.init_test_case()
-            self.inputs = {
-                "X": np.random.random(self.ori_shape).astype(self.dtype)
-            }
-            self.init_attrs()
-            self.outputs = {
-                "Out": self.inputs["X"].reshape(self.new_shape),
-            }
-
-        def init_dtype(self):
-            self.dtype = self.in_type
-
-        def test_check_output(self):
-            place = paddle.XPUPlace(0)
-            self.check_output_with_place(place)
-
-        def test_check_grad(self):
-            place = paddle.XPUPlace(0)
-            if self.dtype == np.bool_:
-                return
-            else:
-                self.check_grad_with_place(place, ['X'], 'Out')
-
-        def init_test_case(self):
-            self.ori_shape = (1, 3, 1, 40)
-            self.axes = (0, 2)
-            self.new_shape = (3, 40)
-
-        def init_attrs(self):
-            self.attrs = {"axes": self.axes}
-
-    # Correct: There is a minus axis.
-    class TestSqueezeOp1(TestSqueezeOp):
-        def init_test_case(self):
-            self.ori_shape = (1, 3, 1, 40)
-            self.axes = (0, -2)
-            self.new_shape = (3, 40)
-
-    # Correct: No axes input.
-    class TestSqueezeOp2(TestSqueezeOp):
-        def init_test_case(self):
-            self.ori_shape = (1, 20, 1, 5)
-            self.axes = ()
-            self.new_shape = (20, 5)
-
-    # Correct: Only part of the axes are squeezed.
-    class TestSqueezeOp3(TestSqueezeOp):
-        def init_test_case(self):
-            self.ori_shape = (6, 1, 5, 1, 4, 1)
-            self.axes = (1, -1)
-            self.new_shape = (6, 5, 1, 4)
-
-    # Correct: A dimension in axes that is not of size 1 remains unchanged.
-    class TestSqueezeOp4(TestSqueezeOp):
-        def init_test_case(self):
-            self.ori_shape = (6, 1, 5, 1, 4, 1)
-            self.axes = (1, 2)
-            self.new_shape = (6, 5, 1, 4, 1)
-
-
-class TestSqueezeOpError(unittest.TestCase):
-    def test_errors(self):
-        paddle.enable_static()
-        with program_guard(Program(), Program()):
-            # The input type of squeeze must be Variable.
-            x1 = base.create_lod_tensor(
-                np.array([[-1]]), [[1]], paddle.XPUPlace(0)
-            )
-            self.assertRaises(TypeError, paddle.squeeze, x1)
-            # The input axes of squeeze must be a list.
-            x2 = paddle.static.data(name='x2', shape=[4], dtype="int32")
-            self.assertRaises(TypeError, paddle.squeeze, x2, axes=0)
-            # The input dtype of squeeze does not support float16.
- x3 = paddle.static.data(name='x3', shape=[4], dtype="float16") - self.assertRaises(TypeError, paddle.squeeze, x3, axes=0) - - -support_types = get_xpu_op_support_types("squeeze") -for stype in support_types: - create_test_class(globals(), XPUTestSqueezeOp, stype) - -if __name__ == "__main__": - unittest.main() diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index e7efc6cb9f1e5..cb715c64dd48a 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -1492,7 +1492,6 @@ 'test_yolov3_loss_op', 'test_decayed_adagrad_op', 'test_split_mkldnn_op', - 'test_squeeze_op', 'test_save_inference_model', 'test_smooth_l1_loss', 'test_bilateral_slice_op', @@ -2373,7 +2372,6 @@ 'test_atan2_op', 'test_tensor_fill_', 'test_std_layer', - 'test_squeeze_op', 'test_split_op', 'test_sign_op', 'test_set_value_op', From 36ea33407392b07c5e1e5271d5dfcacc2d6e5e65 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Mon, 9 Oct 2023 13:49:26 +0800 Subject: [PATCH 25/62] [CleanOps]del unuseful op5 (#57794) * del unuseful op5 --- paddle/fluid/framework/op_compatible_info.cc | 3 - paddle/fluid/operators/filter_by_instag_op.cc | 181 ----- paddle/fluid/operators/filter_by_instag_op.cu | 639 ----------------- paddle/fluid/operators/filter_by_instag_op.h | 231 ------- paddle/fluid/operators/prroi_pool_op.cc | 214 ------ paddle/fluid/operators/prroi_pool_op.cu | 439 ------------ paddle/fluid/operators/prroi_pool_op.h | 653 ------------------ paddle/fluid/operators/random_crop_op.cc | 107 --- paddle/fluid/operators/random_crop_op.cu | 27 - paddle/fluid/operators/random_crop_op.h | 232 ------- paddle/fluid/operators/space_to_depth_op.cc | 242 ------- paddle/fluid/operators/space_to_depth_op.cu | 35 - paddle/fluid/operators/space_to_depth_op.h | 131 ---- .../fluid/operators/squared_l2_distance_op.cc | 232 ------- .../fluid/operators/squared_l2_distance_op.cu | 24 - .../fluid/operators/squared_l2_distance_op.h | 132 ---- paddle/fluid/operators/tree_conv_op.cc | 240 ------- paddle/fluid/operators/tree_conv_op.cu | 22 - paddle/fluid/operators/tree_conv_op.h | 148 ---- paddle/fluid/operators/unity_build_rule.cmake | 34 +- test/legacy_test/CMakeLists.txt | 5 +- test/legacy_test/test_filter_by_instag_op.py | 279 -------- test/legacy_test/test_random_crop_op.py | 48 -- test/legacy_test/test_space_to_depth_op.py | 152 ---- .../test_squared_l2_distance_op.py | 86 --- test/legacy_test/test_tree_conv_op.py | 167 ----- test/white_list/check_shape_white_list.py | 2 - .../compile_vs_runtime_white_list.py | 1 - test/white_list/no_grad_set_white_list.py | 1 - test/white_list/op_accuracy_white_list.py | 1 - tools/enforce/count_enforce_by_file.sh | 1 - tools/parallel_UT_rule.py | 11 - tools/static_mode_white_list.py | 5 - tools/windows/run_unittests.sh | 1 - 34 files changed, 8 insertions(+), 4718 deletions(-) delete mode 100644 paddle/fluid/operators/filter_by_instag_op.cc delete mode 100644 paddle/fluid/operators/filter_by_instag_op.cu delete mode 100644 paddle/fluid/operators/filter_by_instag_op.h delete mode 100644 paddle/fluid/operators/prroi_pool_op.cc delete mode 100644 paddle/fluid/operators/prroi_pool_op.cu delete mode 100644 paddle/fluid/operators/prroi_pool_op.h delete mode 100644 paddle/fluid/operators/random_crop_op.cc delete mode 100644 paddle/fluid/operators/random_crop_op.cu delete mode 100644 paddle/fluid/operators/random_crop_op.h delete mode 100644 paddle/fluid/operators/space_to_depth_op.cc delete mode 100644 paddle/fluid/operators/space_to_depth_op.cu delete mode 100644 
paddle/fluid/operators/space_to_depth_op.h delete mode 100644 paddle/fluid/operators/squared_l2_distance_op.cc delete mode 100644 paddle/fluid/operators/squared_l2_distance_op.cu delete mode 100644 paddle/fluid/operators/squared_l2_distance_op.h delete mode 100644 paddle/fluid/operators/tree_conv_op.cc delete mode 100644 paddle/fluid/operators/tree_conv_op.cu delete mode 100644 paddle/fluid/operators/tree_conv_op.h delete mode 100644 test/legacy_test/test_filter_by_instag_op.py delete mode 100644 test/legacy_test/test_random_crop_op.py delete mode 100644 test/legacy_test/test_space_to_depth_op.py delete mode 100644 test/legacy_test/test_squared_l2_distance_op.py delete mode 100644 test/legacy_test/test_tree_conv_op.py diff --git a/paddle/fluid/framework/op_compatible_info.cc b/paddle/fluid/framework/op_compatible_info.cc index 1b7bcb14295dd..1a9beec01bb46 100644 --- a/paddle/fluid/framework/op_compatible_info.cc +++ b/paddle/fluid/framework/op_compatible_info.cc @@ -78,8 +78,6 @@ void OpCompatibleMap::InitOpCompatibleMap() { op_compatible_map_["dpsgd"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; op_compatible_map_["eye"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; op_compatible_map_["fill_any_like"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; - op_compatible_map_["filter_by_instag"] = {"1.6.0", - OpCompatibleType::DEFIN_NOT}; op_compatible_map_["hard_swish"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; op_compatible_map_["gather_nd"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; op_compatible_map_["instance_norm"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; @@ -90,7 +88,6 @@ void OpCompatibleMap::InitOpCompatibleMap() { op_compatible_map_["multiclass_nms2"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; op_compatible_map_["one_hot_v2"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; - op_compatible_map_["prroi_pool"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; op_compatible_map_["pull_box_sparse"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; op_compatible_map_["scatter_nd_add"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; diff --git a/paddle/fluid/operators/filter_by_instag_op.cc b/paddle/fluid/operators/filter_by_instag_op.cc deleted file mode 100644 index 02735a1ee5be0..0000000000000 --- a/paddle/fluid/operators/filter_by_instag_op.cc +++ /dev/null @@ -1,181 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
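The operator deleted below filters embedded instance rows by tag. As a reference for readers of this patch, here is a minimal NumPy sketch of its forward contract, assuming LoD level 0 with one tag per instance; the function name and the simplified IndexMap layout are illustrative, not the removed API itself:

    import numpy as np

    def filter_by_instag(ins, ins_tag, filter_tag, out_val_if_empty=0):
        # ins: [batch, embed]; ins_tag: [batch], one tag per instance.
        # Keep the rows whose tag occurs in filter_tag.
        kept = np.flatnonzero(np.isin(ins_tag, filter_tag))
        if kept.size > 0:
            out = ins[kept]
            loss_weight = np.ones((kept.size, 1), dtype=np.float32)
            # IndexMap rows: (out_row_start, src_row_start, row_count);
            # with one row per instance the output starts are simply 0..n-1.
            index_map = np.stack(
                [np.arange(kept.size), kept, np.ones_like(kept)], axis=1
            )
        else:
            # Empty-match path: one dummy row filled with out_val_if_empty
            # and a zero loss weight, mirroring the kernels below.
            out = np.full((1, ins.shape[1]), out_val_if_empty, dtype=ins.dtype)
            loss_weight = np.zeros((1, 1), dtype=np.float32)
            index_map = np.array([[0, 1, 1]])
        return out, loss_weight, index_map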
- -#include "paddle/fluid/operators/filter_by_instag_op.h" - -#include - -#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" -#include "paddle/fluid/framework/var_type_inference.h" - -namespace paddle { -namespace operators { -class FilterByInstagOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Ins"), - true, - platform::errors::InvalidArgument("Input(Ins) should be not null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Ins_tag"), - true, - platform::errors::InvalidArgument( - "Input(Ins_tag) should be not null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Filter_tag"), - true, - platform::errors::InvalidArgument( - "Input(Filter_tag) should be not null.")); - - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), - true, - platform::errors::InvalidArgument("Output(Out) should be not null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("LossWeight"), - true, - platform::errors::InvalidArgument( - "Output(LossWeight) shoudl not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("IndexMap"), - true, - platform::errors::InvalidArgument( - "Output(IndexMap) should be not null.")); - - auto x1_dims = ctx->GetInputDim("Ins"); // batch_size * vec - - ctx->SetOutputDim("Out", phi::make_ddim({-1, x1_dims[1]})); - ctx->SetOutputDim("LossWeight", phi::make_ddim({-1, 1})); - ctx->SetOutputDim("IndexMap", phi::make_ddim({-1, 2})); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Ins"); - return phi::KernelKey(data_type, ctx.device_context().GetPlace()); - } -}; - -class FilterByInstagOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Ins", "(phi::DenseTensor) embeded tensor"); - AddInput("Ins_tag", "(phi::DenseTensor) ins tag list"); - AddInput("Filter_tag", "(1D Tensor) filter tag list"); - AddAttr("is_lod", "is Ins with LoD info or not, default True"); - AddAttr("out_val_if_empty", - "if the output after filter is empty, the output value") - .SetDefault(0); - AddOutput("Out", "(phi::DenseTensor) embeded tensor filtered by instag"); - AddOutput("LossWeight", "(Tensor) loss weight."); - AddOutput("IndexMap", - "(phi::DenseTensor) mapping from Out rows to X1 rows"); - AddComment(R"DOC( -Filter By Instag Op - -This operator is used to filter embeded ins. - -There are 3 inputs. First is embeded ins, Second is tags for ins, -Third is tags to filter. - -There are 3 outputs. First is filtered embeded ins, Second is Loss Weight, -Third is the IndexMap from Out line number to X1 line number. 
-)DOC"); - } -}; - -class FilterByInstagOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("IndexMap"), - true, - platform::errors::InvalidArgument( - "Input(IndexMap) should be not null")); - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), - true, - platform::errors::InvalidArgument( - "Grad Input(Out) should be not null")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Ins"), - true, - platform::errors::InvalidArgument("Input(Ins) should be not null")); - PADDLE_ENFORCE_EQ(ctx->HasInput("LossWeight"), - true, - platform::errors::InvalidArgument( - "Input(LossWeight) should be not null")); - PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Ins")), - true, - platform::errors::InvalidArgument( - "Grad Output(Ins) should be not null")); - - auto grad_out_dims = ctx->GetInputDim(framework::GradVarName("Out")); - auto x1_dims = ctx->GetInputDim("Ins"); - ctx->SetOutputDim(framework::GradVarName("Ins"), - phi::make_ddim({x1_dims[0], grad_out_dims[1]})); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); - return phi::KernelKey(data_type, ctx.device_context().GetPlace()); - } -}; - -template -class FilterByInstagGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("filter_by_instag_grad"); - op->SetInput("IndexMap", this->Output("IndexMap")); - op->SetInput("Ins", this->Input("Ins")); - op->SetAttrMap(this->Attrs()); - op->SetInput("LossWeight", this->Output("LossWeight")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("Ins"), this->InputGrad("Ins")); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(filter_by_instag, - ops::FilterByInstagOp, - ops::FilterByInstagOpMaker, - ops::FilterByInstagGradOpMaker, - ops::FilterByInstagGradOpMaker); - -REGISTER_OPERATOR(filter_by_instag_grad, ops::FilterByInstagOpGrad); - -PD_REGISTER_STRUCT_KERNEL(filter_by_instag, - CPU, - ALL_LAYOUT, - ops::FilterByInstagKernel, - float, - double, - int32_t, - int64_t) {} - -PD_REGISTER_STRUCT_KERNEL(filter_by_instag_grad, - CPU, - ALL_LAYOUT, - ops::FilterByInstagGradKernel, - float, - double, - int32_t, - int64_t) {} diff --git a/paddle/fluid/operators/filter_by_instag_op.cu b/paddle/fluid/operators/filter_by_instag_op.cu deleted file mode 100644 index 4449044acb89b..0000000000000 --- a/paddle/fluid/operators/filter_by_instag_op.cu +++ /dev/null @@ -1,639 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -// #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11000 - -#include "paddle/fluid/operators/filter_by_instag_op.h" - -#if defined(PADDLE_WITH_CUDA) -#include -#endif - -#include -#include - -#include -#include -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/phi/core/mixed_vector.h" - -#if defined(PADDLE_WITH_CUDA) -namespace cg = cooperative_groups; -#endif - -namespace paddle { -namespace operators { - -using SelectedRows = phi::SelectedRows; - -template -using Vector = phi::Vector; - -#define WARP_SIZE 32 -#define MAX_WARP_NUM 32 - -#if defined(PADDLE_WITH_CUDA) - -template -__global__ void filter_copy_fuse_kernel(const size_t N, - const int ins_per_thread, - size_t* x1_lods_data, - size_t* x2_lods_data, - const int64_t* x2_data, - const int64_t* x3_data, - int64_t filter_tag_size, - T* out_data, - int64_t* map_data, - size_t* map_lods_data, - size_t* out_lods_data, - size_t* out_idx_data, - const T* x1_data, - int x1_embed_size, - float* loss_weight_data, - float fill_value) { - // N is instance num - // one threads for ins_per_thread instances - int idx = blockIdx.x * blockDim.x + threadIdx.x; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int gid = idx / WARP_SIZE; - - // general use - int thread_num = - (N + (ins_per_thread - 1)) / ins_per_thread; // real thread num - int total_warp_num = thread_num / WARP_SIZE; // 30 - int remain_thread_num = thread_num % WARP_SIZE; // 16 - - int warp_thread_num = -1; - if (gid < total_warp_num) { - warp_thread_num = WARP_SIZE; - } else { - warp_thread_num = remain_thread_num; - } - - int group_num = total_warp_num; - if (remain_thread_num > 0) { - group_num = total_warp_num + 1; - } - - if (gid >= group_num) return; - - int ins_start = idx * ins_per_thread; - int ins_end = (idx + 1) * ins_per_thread; - - if (N < ins_end) ins_end = N; - - int flag_data[5]; - int prefix_sum_data[5]; - int prefix_sum_data2[5]; - - __shared__ int shr[MAX_WARP_NUM]; - __shared__ int shr2[MAX_WARP_NUM]; - __shared__ int shr3[MAX_WARP_NUM]; - - for (int p = ins_start; p < ins_end; p++) { - int ins_tag_start = x2_lods_data[p]; - int ins_tag_end = x2_lods_data[p + 1]; - flag_data[p - ins_start] = 0; - // filter logic - int i = ins_tag_start; - for (; i < ins_tag_end; i++) { - int64_t ins_tag = x2_data[i]; - int j = 0; - for (; j < filter_tag_size; j++) { - if (x3_data[j] == ins_tag) break; - } - // if ins_tag in filter tag - if (j < filter_tag_size) { - flag_data[p - ins_start] = 1; - break; - } - } - } - - int sum_addr = 0; - int sum_flag = 0; - int sum_out_lods = 0; - - int local_addr = 0; - int local_flag = 0; - int local_out_lods = 0; - - if (ins_start < ins_end) { - for (int p = ins_start; p < ins_end; p++) { - int previous = -1; - if (p == ins_start) { - previous = 0; - } else { - previous = prefix_sum_data[p - ins_start - 1]; - } - - prefix_sum_data[p - ins_start] = - previous + - flag_data[p - ins_start] * (x1_lods_data[p + 1] - x1_lods_data[p]); - } - - local_addr = prefix_sum_data[ins_end - 1 - ins_start]; - sum_addr = local_addr; - - for (int p = ins_start; p < ins_end; p++) { - local_flag += flag_data[p - ins_start]; - } - sum_flag = local_flag; - - for (int p = ins_start; p < ins_end; p++) { - local_out_lods += - flag_data[p - 
ins_start] * (x1_lods_data[p + 1] - x1_lods_data[p]); - } - - sum_out_lods = local_out_lods; - } - - for (int i = 1; i < warp_thread_num; i *= 2) { - int temp_addr = g.shfl_up(sum_addr, i); - int temp_flag = g.shfl_up(sum_flag, i); - int temp_out_lods = g.shfl_up(sum_out_lods, i); - - if (g.thread_rank() >= i) { - sum_addr += temp_addr; - sum_flag += temp_flag; - sum_out_lods += temp_out_lods; - } - } - - if (g.thread_rank() == warp_thread_num - 1) { - shr[gid] = sum_addr; - shr2[gid] = sum_flag; - shr3[gid] = sum_out_lods; - } - - b.sync(); - - int sum_addr2 = 0; - int sum_flag2 = 0; - int sum_out_lods2 = 0; - - // communicate between warp - if (g.thread_rank() < group_num) { - sum_addr2 = shr[g.thread_rank()]; - sum_flag2 = shr2[g.thread_rank()]; - sum_out_lods2 = shr3[g.thread_rank()]; - } - - for (int i = 1; i < group_num; i *= 2) { - int temp_addr2 = g.shfl_up(sum_addr2, i); - int temp_flag2 = g.shfl_up(sum_flag2, i); - int temp_out_lods2 = g.shfl_up(sum_out_lods2, i); - - if (g.thread_rank() >= i) { - sum_addr2 += temp_addr2; - sum_flag2 += temp_flag2; - sum_out_lods2 += temp_out_lods2; - } - } - - int sum_addr3 = g.shfl(sum_addr2, gid); - int sum_flag3 = g.shfl(sum_flag2, gid); - int sum_out_lods3 = g.shfl(sum_out_lods2, gid); - - int p_flag; - int p_addr; - int p_out_lods; - - if (ins_start < ins_end) { - p_addr = sum_addr3 - shr[gid] + sum_addr - local_addr; - p_flag = sum_flag3 - shr2[gid] + sum_flag - local_flag; - p_out_lods = sum_out_lods3 - shr3[gid] + sum_out_lods - local_out_lods; - - for (int p = ins_start; p < ins_end; p++) { - if (ins_start == p) { - prefix_sum_data2[p - ins_start] = p_addr; - } else { - prefix_sum_data2[p - ins_start] = - prefix_sum_data2[p - ins_start - 1] + - flag_data[p - ins_start - 1] * - (x1_lods_data[p] - x1_lods_data[p - 1]); - } - } - - if (gid == 0 && g.thread_rank() == group_num - 1) { - *out_idx_data = (sum_flag2 + 1); - map_lods_data[sum_flag2] = sum_flag2; - } - } - - int sum_out_lods4 = g.shfl(sum_out_lods2 + 1, group_num - 1); - - if (ins_start < ins_end) { - int out_lods_idx = p_flag + 1; - for (int p = ins_start; p < ins_end; p++) { - if (flag_data[p - ins_start] == 1) { - size_t batch_len = x1_lods_data[p + 1] - x1_lods_data[p]; - int t = out_lods_idx - 1; - int previous; - if (out_lods_idx == p_flag + 1) { - previous = p_out_lods; - } else { - previous = out_lods_data[t]; - } - map_data[t * 3] = (int64_t)previous; - map_data[t * 3 + 1] = x1_lods_data[p]; - map_lods_data[t] = t; - out_lods_data[out_lods_idx] = previous + batch_len; - map_data[t * 3 + 2] = batch_len; - out_lods_idx++; - } - } - - // fill loss_weight_data - if (sum_out_lods4 > 1) { - int out_data_num = sum_out_lods4 - 1; - int out_start = ins_start; - if (out_start < out_data_num) { - int out_end = ins_end >= out_data_num ? 
out_data_num : ins_end; - for (int p = out_start; p < out_end; p++) { - loss_weight_data[p] = fill_value; - } - } - } - - for (int p = ins_start; p < ins_end; p++) { - // copy logic - if (flag_data[p - ins_start] == 1) { - auto output_start_idx = prefix_sum_data2[p - ins_start]; - T* dst = out_data + output_start_idx * x1_embed_size; - const T* src_start = x1_data + x1_lods_data[p] * x1_embed_size; - const T* src_end = x1_data + x1_lods_data[p + 1] * x1_embed_size; - for (const T* j = src_start; j != src_end; dst++, j++) { - *dst = *j; - } - } - } - } - - b.sync(); -} - -template -__global__ void copy_grad_kernel(const size_t N, - const int ins_per_thread, - const T* out_grad_data, - T* x1_grad_data, - const int64_t* map_data, - int x1_embed_size) { - // N is instance num - // one threads for one instance - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int ins_start = idx * ins_per_thread; - int ins_end = (idx + 1) * ins_per_thread; - if (ins_start >= N) { - return; - } - if (ins_end > N) ins_end = N; - for (int p = ins_start; p < ins_end; p++) { - T* dst = x1_grad_data + map_data[p * 3 + 1] * x1_embed_size; - const T* src_start = out_grad_data + map_data[p * 3] * x1_embed_size; - const T* src_end = - out_grad_data + (map_data[p * 3] + map_data[p * 3 + 2]) * x1_embed_size; - - for (const T* j = src_start; j != src_end; dst++, j++) { - *dst = *j; - } - } -} - -#endif - -template -class FilterByInstagGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { -#if defined(PADDLE_WITH_CUDA) - - auto gpu_place = context.GetPlace(); - - gpuStream_t current_stream = context.cuda_device_context().stream(); - - int max_thread_num_per_block = 1024; - // context.cuda_device_context().GetMaxThreadsPerBlock(); - // X1 is global FC output - // Dim [batch size, embedding size] - const phi::DenseTensor* x1 = context.Input("Ins"); - bool is_lod = context.Attr("is_lod"); - - int is_x1_lod = -1; - if (is_lod) - is_x1_lod = 1; - else - is_x1_lod = 0; - - int64_t out_val_if_empty = context.Attr("out_val_if_empty"); - size_t x1_embed_size = x1->dims()[1]; - // X2 is ins tag list - // LoD [[0, Sum(ins1), Sum(ins1, ins2), ... 
]] - const phi::DenseTensor* x2 = context.Input("Ins_tag"); - // expected auto = const int64_t - const int64_t* x2_data = x2->data(); - - // X3 is local fc tag list - // LoD [[0, Sum(fc1), Sum(fc1, fc2) ...]] - const phi::DenseTensor* x3 = context.Input("Filter_tag"); - const int64_t* x3_data = x3->data(); - - Vector x2_lods; - if (x2->lod().size() != 0) { // lod_level = 1 - x2_lods = x2->lod()[0]; - } else { // lod_level = 0 - const size_t x2_lods_size = x2->dims()[0]; - const size_t instag_per_num = x2->dims()[1]; - // x2_lods.resize(x2->dims()[0] + 1); - // move to cuda - x2_lods.push_back(0); - for (size_t i = 0; i < x2_lods_size; i++) { - x2_lods.push_back(x2_lods.back() + instag_per_num); - } - } - - const size_t x2_lods_size = x2_lods.size() - 1; - phi::MixVector mixv_x2_lods(&x2_lods); - - size_t* x2_lods_data = mixv_x2_lods.CUDAMutableData(gpu_place); - - Vector x1_lods; - if (!is_x1_lod) { - x1_lods.push_back(0); - for (int i = 0; i < x1->dims()[0]; i++) { - x1_lods.push_back(i + 1); - } - } else { - // x1_lods = context.Input("Ins")->lod()[0]; - // new: lod_level=0 => lod() return {} - if (x1->lod().size() != 0) { // lod_level = 1 - x1_lods = x1->lod()[0]; - } else { // lod_level = 0 - // x1_lods.resize(x1->dims()[0] + 1); - // move to cuda - x1_lods.push_back(0); - for (int i = 0; i < x1->dims()[0]; i++) { - x1_lods.push_back(i + 1); - } - } - } - - phi::MixVector mixv_x1_lods(&x1_lods); - - size_t* x1_lods_data = mixv_x1_lods.CUDAMutableData(gpu_place); - auto* x1_data = x1->data(); - - // set output value - // for those whose ins been dropout, set 0 for whole lines. - // otherwise, copy whole line - // Dim [local fc count, batch size, embedding size] - phi::DenseTensor* out = context.Output("Out"); - phi::DenseTensor* map = context.Output("IndexMap"); - phi::DenseTensor* loss_weight = - context.Output("LossWeight"); - - int out_first = x1_lods.back(); - - out->Resize(phi::make_ddim({(int64_t)out_first, (int64_t)x1_embed_size})); - map->Resize(phi::make_ddim({(int64_t)x2_lods_size, 3})); - loss_weight->Resize(phi::make_ddim({(int64_t)x2_lods_size, 1})); - - T* out_data = out->mutable_data(gpu_place); - int64_t* map_data = map->mutable_data(gpu_place); - float* loss_weight_data = loss_weight->mutable_data(gpu_place); - - int block_size = max_thread_num_per_block; - int ins_per_thread = (x2_lods_size + block_size - 1) / block_size; - dim3 block_dim(block_size); - dim3 grid_dim(1); - - Vector out_lods(x2_lods_size + 1, 0); - Vector map_lods(x2_lods_size + 1, 0); - - phi::MixVector mixv_out_lods(&out_lods); - phi::MixVector mixv_map_lods(&map_lods); - - // thrust::device_vector out_idx(1); - Vector out_idx(1, 0); - phi::MixVector mixv_out_idx(&out_idx); - - size_t* out_idx_data = mixv_out_idx.CUDAMutableData(gpu_place); - size_t* out_lods_data = mixv_out_lods.CUDAMutableData(gpu_place); - size_t* map_lods_data = mixv_map_lods.CUDAMutableData(gpu_place); - - float fill_value = 1.0; - - filter_copy_fuse_kernel<<>>( - x2_lods_size, - ins_per_thread, - x1_lods_data, - x2_lods_data, - x2_data, - x3_data, - x3->numel(), - out_data, - map_data, - map_lods_data, - out_lods_data, - out_idx_data, - x1_data, - x1_embed_size, - loss_weight_data, - fill_value); - - platform::GpuStreamSync(current_stream); - - mixv_out_lods.resize(mixv_out_idx[0]); - - if (mixv_out_lods.size() - 1 > 0) { - out->Resize(phi::make_ddim( - {(int64_t)mixv_out_lods.back(), (int64_t)x1_embed_size})); - - map->Resize(phi::make_ddim({(int64_t)mixv_out_lods.size() - 1, 3})); - loss_weight->Resize( - 
phi::make_ddim({(int64_t)mixv_out_lods.size() - 1, 1})); - - } else { - out->Resize(phi::make_ddim({1, (int64_t)x1_embed_size})); - map->Resize(phi::make_ddim({1, 3})); - loss_weight->Resize(phi::make_ddim({1, 1})); - } - - if (mixv_out_lods.size() - 1 > 0) { - map_lods.resize(mixv_out_lods.size()); - - mixv_map_lods.CopyToCPU(); - - std::vector> map_lod_info; - map_lod_info.emplace_back(map_lods); - - map->set_lod(map_lod_info); - loss_weight->set_lod(map_lod_info); - - mixv_out_lods.CopyToCPU(); - std::vector> out_lod_info; - out_lod_info.emplace_back(out_lods); - out->set_lod(out_lod_info); - - } else { - Vector map_lods(2, 0); - phi::MixVector mixv_map_lods(&map_lods); - thrust::device_ptr map_data_ptr(map_data); - - map_data_ptr[0] = 0; - map_data_ptr[1] = 1; - map_data_ptr[2] = 1; - - mixv_map_lods[0] = 0; - mixv_map_lods[1] = 1; - mixv_out_lods.push_back(1); - - mixv_map_lods.CopyToCPU(); - mixv_out_lods.CopyToCPU(); - - std::vector> map_lod_info; - map_lod_info.emplace_back(map_lods); - map->set_lod(map_lod_info); - - loss_weight->set_lod(map_lod_info); - - std::vector> out_lod_info; - out_lod_info.emplace_back(out_lods); - out->set_lod(out_lod_info); - - thrust::device_ptr out_data_ptr(out_data); - - // gpu kernel - if (std::is_same::value) { - thrust::fill(out_data_ptr, - out_data_ptr + out->numel(), - static_cast(out_val_if_empty)); - } else if (std::is_same::value) { - thrust::fill(out_data_ptr, - out_data_ptr + out->numel(), - static_cast(out_val_if_empty)); - } else if (std::is_same::value) { - thrust::fill(out_data_ptr, - out_data_ptr + out->numel(), - static_cast(out_val_if_empty)); - } else { - thrust::fill(out_data_ptr, - out_data_ptr + out->numel(), - static_cast(out_val_if_empty)); - } - - thrust::device_ptr loss_weight_data_ptr(loss_weight_data); - loss_weight_data_ptr[0] = 0; - } - -#endif - } -}; - -template -class FilterByInstagGradGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { -#if defined(PADDLE_WITH_CUDA) - - auto gpu_place = context.GetPlace(); - gpuStream_t current_stream = context.cuda_device_context().stream(); - auto max_thread_num_per_block = 1024; - auto* output_grad = - context.Input(framework::GradVarName("Out")); - auto* x1_grad = - context.Output(framework::GradVarName("Ins")); - auto* loss_weight = context.Input("LossWeight"); - auto* mmap = context.Input("IndexMap"); - auto* x1 = context.Input("Ins"); - - x1_grad->set_lod(context.Input("Ins")->lod()); - x1_grad->Resize(x1->dims()); - - auto* mmap_data = mmap->data(); - // expected auto = T - auto* output_grad_data = output_grad->data(); - auto* loss_weight_data = loss_weight->data(); - - // expected auto = T - auto* x1_grad_data = x1_grad->mutable_data(gpu_place); - thrust::device_ptr x1_grad_data_ptr(x1_grad_data); - thrust::device_ptr loss_weight_data_ptr(loss_weight_data); - - thrust::fill( - x1_grad_data_ptr, x1_grad_data_ptr + x1->dims()[0] * x1->dims()[1], 0); - - if (loss_weight->numel() != 1 || loss_weight_data_ptr[0] != 0) { - auto output_dims = output_grad->dims(); - int x1_embed_size = output_dims[1]; - - // one thread for multi-instances - int block_size = max_thread_num_per_block; - - size_t N = mmap->dims()[0]; - dim3 block_dim(block_size); - - dim3 grid_dim((N + block_size - 1) / block_size); - - const int ins_per_thread = 1; - - copy_grad_kernel<<>>( - N, - ins_per_thread, - output_grad_data, - x1_grad_data, - mmap_data, - x1_embed_size); - - cudaStreamSynchronize(current_stream); - } - -#endif - } -}; - 
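The backward pass above is a pure row scatter: for every IndexMap triple (out_start, src_start, count), the matching rows of the output gradient are copied back into the input gradient, which stays zero elsewhere (and stays all-zero when LossWeight marks the empty-match case). The same rule in NumPy, under the assumptions of the earlier sketch:

    import numpy as np

    def filter_by_instag_grad(out_grad, index_map, loss_weight, ins_shape):
        x1_grad = np.zeros(ins_shape, dtype=out_grad.dtype)
        # loss_weight == [[0]] is the sentinel for "nothing matched".
        if loss_weight.size != 1 or loss_weight.flat[0] != 0:
            for out_start, src_start, count in index_map:
                x1_grad[src_start:src_start + count] = out_grad[
                    out_start:out_start + count
                ]
        return x1_grad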
-} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL(filter_by_instag, - GPU, - ALL_LAYOUT, - ops::FilterByInstagGPUKernel, - float, - double, - int32_t, - int64_t) {} - -PD_REGISTER_STRUCT_KERNEL(filter_by_instag_grad, - GPU, - ALL_LAYOUT, - ops::FilterByInstagGradGPUKernel, - float, - double, - int32_t, - int64_t) {} diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h deleted file mode 100644 index 04dc713a4dcc9..0000000000000 --- a/paddle/fluid/operators/filter_by_instag_op.h +++ /dev/null @@ -1,231 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/phi/core/mixed_vector.h" - -namespace paddle { -namespace operators { -using SelectedRows = phi::SelectedRows; - -template -using Vector = phi::Vector; - -template -class FilterByInstagKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // X1 is global FC output - // Dim [batch size, embedding size] - auto* x1 = context.Input("Ins"); - bool is_x1_lod = context.Attr("is_lod"); - int64_t out_val_if_empty = context.Attr("out_val_if_empty"); - // X2 is ins tag list - // LoD [[0, Sum(ins1), Sum(ins1, ins2), ... ]] - auto* x2 = context.Input("Ins_tag"); - // X3 is local fc tag list - // LoD [[0, Sum(fc1), Sum(fc1, fc2) ...]] - auto* x3 = context.Input("Filter_tag"); - - std::unordered_set filter_tag; - auto* x3_data = x3->data(); - size_t len = x3->dims()[0]; - for (size_t i = 0; i < len; i++) { - filter_tag.insert(x3_data[i]); - } - - // expected auto = const int64_t - auto* x2_data = x2->data(); - // e.g get [0, 1, 2, 3, ...] 
- // size_t x2_lods_size = x2->dims()[0]; - // size_t instag_num_per_ins = x2->dims()[1]; - - Vector x2_lods(1, 0); - if (x2->lod().size() != 0) { // lod_level = 1 - x2_lods = x2->lod()[0]; - } else { // lod_level = 0 - const size_t x2_lods_size = x2->dims()[0]; - const size_t instag_num_per_ins = x2->dims()[1]; - for (size_t i = 0; i < x2_lods_size; i++) { - x2_lods.push_back(x2_lods.back() + instag_num_per_ins); - } - } - - Vector x1_lods(1, 0); - if (!is_x1_lod) { - for (int i = 0; i < x1->dims()[0]; i++) { - x1_lods.push_back(i + 1); - } - } else { - // new: lod_level=0 => lod() return {} - if (x1->lod().size() != 0) { - x1_lods = x1->lod()[0]; - } else { - for (int i = 0; i < x1->dims()[0]; i++) { - x1_lods.push_back(i + 1); - } - } - } - std::unordered_map mmap_aux; - Vector out_lods(1, 0); - for (size_t i = 0; i < x2_lods.size() - 1; i++) { - for (size_t j = x2_lods[i]; j < x2_lods[i + 1]; j++) { - if (filter_tag.find(x2_data[j]) != filter_tag.end()) { - size_t batch_len = x1_lods[i + 1] - x1_lods[i]; - mmap_aux[out_lods.back()] = x1_lods[i]; - out_lods.push_back(out_lods.back() + batch_len); - break; - } - } - } - // set output value - // for those whose ins been dropout, set 0 for whole lines. - // otherwise, copy whole line - // Dim [local fc count, batch size, embedding size] - phi::DenseTensor* out = context.Output("Out"); - phi::DenseTensor* map = context.Output("IndexMap"); - phi::DenseTensor* loss_weight = - context.Output("LossWeight"); - // expected auto = const T - auto* x1_data = x1->data(); - // expected auto = T - size_t x1_embed_size = x1->dims()[1]; - if (out_lods.size() - 1 > 0) { - out->Resize( - phi::make_ddim({(int64_t)out_lods.back(), (int64_t)x1_embed_size})); - map->Resize(phi::make_ddim({(int64_t)out_lods.size() - 1, 3})); - loss_weight->Resize(phi::make_ddim({(int64_t)out_lods.size() - 1, 1})); - } else { - out->Resize(phi::make_ddim({1, (int64_t)x1_embed_size})); - map->Resize(phi::make_ddim({1, 3})); - loss_weight->Resize(phi::make_ddim({1, 1})); - } - auto* out_data = out->mutable_data(context.GetPlace()); - auto* map_data = map->mutable_data(context.GetPlace()); - auto* loss_weight_data = - loss_weight->mutable_data(context.GetPlace()); - if (out_lods.size() - 1 > 0) { - Vector map_lods; - for (size_t i = 0; i < out_lods.size() - 1; i++) { - map_data[i * 3] = (int64_t)out_lods[i]; - map_data[i * 3 + 1] = mmap_aux[map_data[i * 3]]; - map_data[i * 3 + 2] = out_lods[i + 1] - out_lods[i]; - map_lods.push_back(i); - } - map_lods.push_back(out_lods.size() - 1); - std::vector> map_lod_info; - map_lod_info.push_back(map_lods); - - map->set_lod(map_lod_info); - loss_weight->set_lod(map_lod_info); - std::vector> out_lod_info; - out_lod_info.push_back(out_lods); - out->set_lod(out_lod_info); - memset(out_data, 0, out->numel() * sizeof(T)); - for (int i = 0; i < loss_weight->numel(); i++) { - loss_weight_data[i] = 1; - } - - for (size_t i = 0; i < out_lods.size() - 1; i++) { - size_t pos = out_lods[i]; - for (int k = map_data[i * 3 + 1]; - k < map_data[i * 3 + 1] + map_data[i * 3 + 2]; - k++) { - memcpy(out_data + pos * x1_embed_size, - x1_data + k * x1_embed_size, - x1_embed_size * sizeof(T)); - ++pos; - } - } - } else { - Vector map_lods; - map_data[0] = 0; - map_data[1] = 1; - map_data[2] = 1; - map_lods.push_back(0); - map_lods.push_back(1); - out_lods.push_back(1); - std::vector> map_lod_info; - map_lod_info.push_back(map_lods); - map->set_lod(map_lod_info); - loss_weight->set_lod(map_lod_info); - std::vector> out_lod_info; - out_lod_info.push_back(out_lods); - 
out->set_lod(out_lod_info); - for (int64_t oi = 0; oi < out->numel(); ++oi) { - if (std::is_same::value) { - out_data[oi] = (int32_t)out_val_if_empty; - } else if (std::is_same::value) { - out_data[oi] = (int64_t)out_val_if_empty; - } else if (std::is_same::value) { - out_data[oi] = static_cast(out_val_if_empty); - } else { - out_data[oi] = static_cast(out_val_if_empty); - } - } - loss_weight_data[0] = 0; - } - } -}; - -template -class FilterByInstagGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* output_grad = - context.Input(framework::GradVarName("Out")); - auto* x1_grad = - context.Output(framework::GradVarName("Ins")); - auto* loss_weight = context.Input("LossWeight"); - auto* mmap = context.Input("IndexMap"); - auto* x1 = context.Input("Ins"); - x1_grad->set_lod(context.Input("Ins")->lod()); - x1_grad->Resize(x1->dims()); - auto mmap_data = mmap->data(); - // expected auto = T - auto* output_grad_data = output_grad->data(); - - auto* loss_weight_data = loss_weight->data(); - // expected auto = T - auto* x1_grad_data = x1_grad->mutable_data(context.GetPlace()); - memset(x1_grad_data, 0, x1->dims()[0] * x1->dims()[1] * sizeof(T)); - if (loss_weight->numel() != 1 || loss_weight_data[0] != 0) { - auto output_dims = output_grad->dims(); - for (int i = 0; i < mmap->dims()[0]; i++) { - int src_ln = mmap_data[i * 3], dst_ln = mmap_data[i * 3 + 1]; - int line_cnt = mmap_data[i * 3 + 2]; - for (int l = 0; l < line_cnt; l++) { - for (int j = 0; j < output_dims[1]; j++) { - x1_grad_data[(dst_ln + l) * output_dims[1] + j] = - output_grad_data[(src_ln + l) * output_dims[1] + j]; - } - } - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/prroi_pool_op.cc b/paddle/fluid/operators/prroi_pool_op.cc deleted file mode 100644 index 0f0dbf3c6888a..0000000000000 --- a/paddle/fluid/operators/prroi_pool_op.cc +++ /dev/null @@ -1,214 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/prroi_pool_op.h" - -#include - -namespace paddle { -namespace operators { - -class PRROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(Tensor), " - "the input of PRROIPoolOp. " - "The format of input tensor is NCHW. Where N is the batch size, " - "C is the number of input channels, " - "H is the height of the input feature map, and " - "W is the width."); - AddInput("ROIs", - "(phi::DenseTensor), " - "ROIs (Regions of Interest) to pool over. " - "should be a 2-D phi::DenseTensor of shape (num_rois, 4) " - "given as [(x1, y1, x2, y2), ...]. " - "where (x1, y1) is the top left coordinates, and " - "(x2, y2) is the bottom right coordinates. 
" - "The roi batch index can be calculated from LoD."); - AddInput("BatchRoINums", - "(Tensor), " - "1-D tensor with shape [N], the number of" - " rois for each image in batch, where N is the batch size") - .AsDispensable(); - AddOutput("Out", - "(Tensor), " - "the output of PRROIPoolOp is a 4-D Tensor with shape " - "(num_rois, output_channels, pooled_h, pooled_w)."); - AddAttr("spatial_scale", - "(float, default 1.0), " - "Multiplicative spatial scale factor " - "to translate ROI coords from their input scale " - "to the scale used when pooling.") - .SetDefault(1.0); - AddAttr("pooled_height", - "(int, default 1), " - "the pooled output height.") - .SetDefault(1); - AddAttr("pooled_width", - "(int, default 1), " - "the pooled output width.") - .SetDefault(1); - AddComment(R"Doc( -**PRROIPool Operator** - -Precise region of interest pooling (also known as PRROIPooling) is to perform - bilinear interpolation average pooling method for RoI Pooling. - -Please refer to https://arxiv.org/abs/1807.11590 for more details. - - )Doc"); - } -}; - -class PRROIPoolOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "prroi_pool"); - OP_INOUT_CHECK(ctx->HasInput("ROIs"), "Input", "ROIs", "prroi_pool"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Input", "Out", "prroi_pool"); - - auto input_dims = ctx->GetInputDim("X"); - auto rois_dims = ctx->GetInputDim("ROIs"); - - PADDLE_ENFORCE_EQ(input_dims.size(), - 4, - platform::errors::InvalidArgument( - "The format of input tensor is NCHW")); - PADDLE_ENFORCE_EQ( - rois_dims.size(), - 2, - platform::errors::InvalidArgument( - "ROIs should be a 2-D phi::DenseTensor of shape (num_rois, 4) " - "given as [(x1, y1, x2, y2), ...]")); - PADDLE_ENFORCE_EQ( - rois_dims[1], - 4, - platform::errors::InvalidArgument( - "ROIs should be a 2-D phi::DenseTensor of shape (num_rois, 4) " - "given as [(x1, y1, x2, y2), ...]")); - int pooled_height = ctx->Attrs().Get("pooled_height"); - int pooled_width = ctx->Attrs().Get("pooled_width"); - float spatial_scale = ctx->Attrs().Get("spatial_scale"); - - PADDLE_ENFORCE_GT(pooled_height, - 0, - platform::errors::InvalidArgument( - "The pooled output height must be greater than 0")); - PADDLE_ENFORCE_GT(pooled_width, - 0, - platform::errors::InvalidArgument( - "The pooled output width must be greater than 0")); - PADDLE_ENFORCE_GT(spatial_scale, - 0.0f, - platform::errors::InvalidArgument( - "The spatial scale must greater than 0.")); - - auto out_dims = input_dims; - out_dims[0] = rois_dims[0]; - out_dims[1] = input_dims[1]; - out_dims[2] = pooled_height; - out_dims[3] = pooled_width; - - if (ctx->HasInput("BatchRoINums")) { - auto rois_batch_index = ctx->GetInputDim("BatchRoINums"); - PADDLE_ENFORCE_EQ(rois_batch_index[0], - input_dims[0], - platform::errors::InvalidArgument( - "The length of BatchRoINums should equal to " - "first dim of inputs(X)")); - } - ctx->SetOutputDim("Out", out_dims); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class PRROIPoolGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - 
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "prroi_pool"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), - "Output", - framework::GradVarName("X"), - "prroi_pool"); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - ctx->SetOutputDim(framework::GradVarName("ROIs"), ctx->GetInputDim("ROIs")); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -template -class PRROIPoolGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("prroi_pool_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("Out", this->Output("Out")); - op->SetInput("ROIs", this->Input("ROIs")); - op->SetInput("BatchRoINums", this->Input("BatchRoINums")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetOutput(framework::GradVarName("ROIs"), this->InputGrad("ROIs")); - op->SetAttrMap(this->Attrs()); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(prroi_pool, - ops::PRROIPoolOp, - ops::PRROIPoolOpMaker, - ops::PRROIPoolGradMaker, - ops::PRROIPoolGradMaker); -REGISTER_OPERATOR(prroi_pool_grad, ops::PRROIPoolGradOp); - -PD_REGISTER_STRUCT_KERNEL(prroi_pool, - CPU, - ALL_LAYOUT, - ops::CPUPRROIPoolOpKernel, - float, - double, - int, - int64_t) {} -PD_REGISTER_STRUCT_KERNEL(prroi_pool_grad, - CPU, - ALL_LAYOUT, - ops::CPUPRROIPoolGradOpKernel, - float, - double, - int, - int64_t) {} diff --git a/paddle/fluid/operators/prroi_pool_op.cu b/paddle/fluid/operators/prroi_pool_op.cu deleted file mode 100644 index 5d1243964279b..0000000000000 --- a/paddle/fluid/operators/prroi_pool_op.cu +++ /dev/null @@ -1,439 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/prroi_pool_op.h" - -namespace paddle { -namespace operators { - -static constexpr int kNumCUDAThreads = 512; -static constexpr int kNumMaximumNumBlocks = 4096; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaximumNumBlocks); -} - -template -__global__ void GPUPRROIPoolForward(const int nthreads, - const T* input_data, - const T* input_rois, - const float spatial_scale, - const int input_channels, - const int height, - const int width, - const int output_channels, - const int pooled_height, - const int pooled_width, - const int* rois_batch_id_data, - T* output_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (size_t i = index; i < nthreads; i += offset) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = static_cast(offset_input_rois[0]) * spatial_scale; - T roi_start_h = static_cast(offset_input_rois[1]) * spatial_scale; - T roi_end_w = static_cast(offset_input_rois[2]) * spatial_scale; - T roi_end_h = static_cast(offset_input_rois[3]) * spatial_scale; - - T roi_width = max(roi_end_w - roi_start_w, static_cast(0.0)); - T roi_height = max(roi_end_h - roi_start_h, static_cast(0.0)); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - T win_start_w = roi_start_w + bin_size_w * pw; - T win_start_h = roi_start_h + bin_size_h * ph; - T win_end_w = win_start_w + bin_size_w; - T win_end_h = win_start_h + bin_size_h; - - T win_size = max(static_cast(0.0), bin_size_w * bin_size_h); - int input_channel = c; - const T* offset_input_data = - input_data + - (roi_batch_id * input_channels + input_channel) * height * width; - - if (win_size > static_cast(0.0)) { - int s_w = floor(win_start_w); - int e_w = ceil(win_end_w); - int s_h = floor(win_start_h); - int e_h = ceil(win_end_h); - T sum_out = 0; - - for (int w_iter = s_w; w_iter < e_w; ++w_iter) { - for (int h_iter = s_h; h_iter < e_h; ++h_iter) { - sum_out += PrRoIPoolingMatCalculation( - offset_input_data, - h_iter, - w_iter, - h_iter + 1, - w_iter + 1, - max(win_start_h, static_cast(h_iter)), - max(win_start_w, static_cast(w_iter)), - min(win_end_h, static_cast(h_iter) + static_cast(1.0)), - min(win_end_w, static_cast(w_iter) + static_cast(1.0)), - height, - width); - } - } - output_data[i] = sum_out / win_size; - } else { - output_data[i] = 0.; - } - } -} - -template -__global__ void GPUPRROIPoolBackward(const int nthreads, - const T* in_data, - const T* input_rois, - const T* output_grad_data, - const float spatial_scale, - const int input_channels, - const int height, - const int width, - const int output_channels, - const int pooled_height, - const int pooled_width, - const int* rois_batch_id_data, - T* input_grad_data, - const T* out_data, - T* input_roi_grad_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (int i = index; i < nthreads; i += offset) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / 
pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - int input_channel = c; - int input_offset = - (roi_batch_id * input_channels + input_channel) * height * width; - T* offset_input_grad_data = input_grad_data + input_offset; - const T* offset_output_grad_data = output_grad_data + i; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = static_cast(offset_input_rois[0]) * spatial_scale; - T roi_start_h = static_cast(offset_input_rois[1]) * spatial_scale; - T roi_end_w = static_cast(offset_input_rois[2]) * spatial_scale; - T roi_end_h = static_cast(offset_input_rois[3]) * spatial_scale; - T* offset_input_roi_grad_data = input_roi_grad_data + n * 4; - - T roi_width = max(roi_end_w - roi_start_w, static_cast(0.0)); - T roi_height = max(roi_end_h - roi_start_h, static_cast(0.0)); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - T win_start_w = roi_start_w + bin_size_w * pw; - T win_start_h = roi_start_h + bin_size_h * ph; - T win_end_w = win_start_w + bin_size_w; - T win_end_h = win_start_h + bin_size_h; - - T win_size = max(static_cast(0.0), bin_size_w * bin_size_h); - int s_w = floor(win_start_w); - int e_w = ceil(win_end_w); - int s_h = floor(win_start_h); - int e_h = ceil(win_end_h); - - T sum_out = win_size == static_cast(0.) - ? static_cast(0.) - : *offset_output_grad_data / win_size; - - for (int w_iter = s_w; w_iter < e_w; ++w_iter) { - for (int h_iter = s_h; h_iter < e_h; ++h_iter) { - PrRoIPoolingMatDistributeDiff( - offset_input_grad_data, - sum_out, - h_iter, - w_iter, - h_iter + 1, - w_iter + 1, - max(win_start_h, static_cast(h_iter)), - max(win_start_w, static_cast(w_iter)), - min(win_end_h, static_cast(h_iter) + static_cast(1.0)), - min(win_end_w, static_cast(w_iter) + static_cast(1.0)), - height, - width); - } - } - - const T* offset_out_data = out_data + i; - const T* offset_in_data = in_data + input_offset; - PrRoIPoolingCoorBackward(s_w, - e_w, - s_h, - e_h, - width, - height, - win_start_w, - win_start_h, - win_end_w, - win_end_h, - pw, - ph, - pooled_width, - pooled_height, - win_size, - spatial_scale, - offset_in_data, - offset_out_data, - offset_input_roi_grad_data, - offset_output_grad_data); - } -} - -template -class GPUPRROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int input_channels = in_dims[1]; - auto output_channels = input_channels; - int height = in_dims[2]; - int width = in_dims[3]; - - int rois_num = rois->dims()[0]; - if (rois_num == 0) return; - - // set rois batch id - phi::DenseTensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(platform::CPUPlace()); - - if (ctx.HasInput("BatchRoINums") || rois->lod().empty()) { - auto* batchroinum = ctx.Input("BatchRoINums"); - phi::DenseTensor batch_index_cpu; - framework::TensorCopySync( - *batchroinum, 
platform::CPUPlace(), &batch_index_cpu); - - int rois_batch_size = batchroinum->dims()[0]; - auto* batch_index = batch_index_cpu.data(); - size_t c = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int64_t k = 0; k < batch_index[n]; ++k) { - rois_batch_id_data[c] = n; - c = c + 1; - } - } - - } else { - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, - batch_size, - platform::errors::InvalidArgument( - "The rois_batch_size and input(X) batch_size must be the same.")); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num, - rois_num_with_lod, - platform::errors::InvalidArgument( - "The rois_num from input and lod must be the same.")); - - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - - int output_size = out->numel(); - int blocks = NumBlocks(output_size); - int threads = kNumCUDAThreads; - - auto cplace = platform::CPUPlace(); - auto& dev_ctx = ctx.cuda_device_context(); - int bytes = rois_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc( - dev_ctx.GetPlace(), - bytes, - phi::Stream(reinterpret_cast(dev_ctx.stream()))); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - const auto gplace = ctx.GetPlace(); - memory::Copy(gplace, - roi_id_data, - cplace, - rois_batch_id_data, - bytes, - dev_ctx.stream()); - - // call cuda kernel function - GPUPRROIPoolForward<<>>( - output_size, - in->data(), - rois->data(), - spatial_scale, - input_channels, - height, - width, - output_channels, - pooled_height, - pooled_width, - roi_id_data, - out->mutable_data(ctx.GetPlace())); - } -}; - -template -class GPUPRROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Input("Out"); - - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); - auto* input_roi_grad = - ctx.Output(framework::GradVarName("ROIs")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - int rois_num = rois->dims()[0]; - int input_channels = in->dims()[1]; - auto output_channels = input_channels; - int height = in->dims()[2]; - int width = in->dims()[3]; - - if (input_grad || input_roi_grad) { - // set roi batch id - phi::DenseTensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(platform::CPUPlace()); - - if (ctx.HasInput("BatchRoINums") || rois->lod().empty()) { - auto* batchroinum = ctx.Input("BatchRoINums"); - phi::DenseTensor batch_index_cpu; - framework::TensorCopySync( - *batchroinum, platform::CPUPlace(), &batch_index_cpu); - - int rois_batch_size = batchroinum->dims()[0]; - auto* batch_index = batch_index_cpu.data(); - size_t c = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int64_t k = 0; k < batch_index[n]; ++k) { - rois_batch_id_data[c] = n; - c = c + 1; - } - } - } else { - PADDLE_ENFORCE_EQ(rois->lod().empty(), - false, - platform::errors::InvalidArgument( - "the lod of Input ROIs should not be empty when " - "BatchRoINums is None!")); - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = 
rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - - auto cplace = platform::CPUPlace(); - auto& dev_ctx = ctx.cuda_device_context(); - int bytes = rois_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc( - dev_ctx.GetPlace(), - bytes, - phi::Stream(reinterpret_cast(dev_ctx.stream()))); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - const auto gplace = ctx.GetPlace(); - memory::Copy(gplace, - roi_id_data, - cplace, - rois_batch_id_data, - bytes, - dev_ctx.stream()); - - input_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(ctx.cuda_device_context(), input_grad, static_cast(0)); - input_roi_grad->mutable_data(ctx.GetPlace()); - set_zero(ctx.cuda_device_context(), input_roi_grad, static_cast(0)); - - int output_grad_size = output_grad->numel(); - int blocks = NumBlocks(output_grad_size); - int threads = kNumCUDAThreads; - - if (output_grad_size > 0) { - GPUPRROIPoolBackward<<>>( - output_grad_size, - in->data(), - rois->data(), - output_grad->data(), - spatial_scale, - input_channels, - height, - width, - output_channels, - pooled_height, - pooled_width, - roi_id_data, - input_grad->mutable_data(ctx.GetPlace()), - out->data(), - input_roi_grad->mutable_data(ctx.GetPlace())); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL( - prroi_pool, GPU, ALL_LAYOUT, ops::GPUPRROIPoolOpKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL(prroi_pool_grad, - GPU, - ALL_LAYOUT, - ops::GPUPRROIPoolGradOpKernel, - float, - double) {} diff --git a/paddle/fluid/operators/prroi_pool_op.h b/paddle/fluid/operators/prroi_pool_op.h deleted file mode 100644 index e2417a071ce88..0000000000000 --- a/paddle/fluid/operators/prroi_pool_op.h +++ /dev/null @@ -1,653 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/phi/backends/gpu/gpu_primitives.h" -#endif - -namespace paddle { -namespace operators { - -template -inline HOSTDEVICE T PrRoIPoolingGetData(const T* data, - const int h, - const int w, - const int height, - const int width) { - bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width); - T retVal = overflow ? 
0.0f : data[h * width + w]; - return retVal; -} - -template -inline HOSTDEVICE T PrRoIPoolingMatCalculation(const T* this_data, - const int s_h, - const int s_w, - const int e_h, - const int e_w, - const T y0, - const T x0, - const T y1, - const T x1, - const int h0, - const int w0) { - T alpha, beta, lim_alpha, lim_beta, tmp; - T sum_out = 0; - - alpha = x0 - static_cast(s_w); - beta = y0 - static_cast(s_h); - lim_alpha = x1 - static_cast(s_w); - lim_beta = y1 - static_cast(s_h); - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - sum_out += PrRoIPoolingGetData(this_data, s_h, s_w, h0, w0) * tmp; - - alpha = static_cast(e_w) - x1; - lim_alpha = static_cast(e_w) - x0; - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - sum_out += PrRoIPoolingGetData(this_data, s_h, e_w, h0, w0) * tmp; - - alpha = x0 - static_cast(s_w); - beta = static_cast(e_h) - y1; - lim_alpha = x1 - static_cast(s_w); - lim_beta = static_cast(e_h) - y0; - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - sum_out += PrRoIPoolingGetData(this_data, e_h, s_w, h0, w0) * tmp; - - alpha = static_cast(e_w) - x1; - lim_alpha = static_cast(e_w) - x0; - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - sum_out += PrRoIPoolingGetData(this_data, e_h, e_w, h0, w0) * tmp; - - return sum_out; -} - -#if defined(__NVCC__) || defined(__HIPCC__) -template -DEVICE void PrRoIPoolingDistributeDiff(T* diff, - const T top_diff, - const int h, - const int w, - const int height, - const int width, - const T coeff) { - bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width); - if (!overflow) { - phi::CudaAtomicAdd(diff + h * width + w, top_diff * coeff); - } -} -#else -template -inline HOSTDEVICE void PrRoIPoolingDistributeDiff(T* diff, - const T top_diff, - const int h, - const int w, - const int height, - const int width, - const T coeff) { - bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width); - if (!overflow) { - *(diff + h * width + w) += top_diff * coeff; - } -} -#endif - -template -HOSTDEVICE void PrRoIPoolingMatDistributeDiff(T* diff, - const T top_diff, - const int s_h, - const int s_w, - const int e_h, - const int e_w, - const T y0, - const T x0, - const T y1, - const T x1, - const int h0, - const int w0) { - T alpha, beta, lim_alpha, lim_beta, tmp; - - alpha = x0 - static_cast(s_w); - beta = y0 - static_cast(s_h); - lim_alpha = x1 - static_cast(s_w); - lim_beta = y1 - static_cast(s_h); - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - PrRoIPoolingDistributeDiff(diff, top_diff, s_h, s_w, h0, w0, tmp); - - alpha = static_cast(e_w) - x1; - lim_alpha = static_cast(e_w) - x0; - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - PrRoIPoolingDistributeDiff(diff, top_diff, s_h, e_w, h0, w0, tmp); - - alpha = x0 - static_cast(s_w); - beta = static_cast(e_h) - y1; - lim_alpha = x1 - static_cast(s_w); - lim_beta = static_cast(e_h) - y0; - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - 
alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - PrRoIPoolingDistributeDiff(diff, top_diff, e_h, s_w, h0, w0, tmp); - - alpha = static_cast(e_w) - x1; - lim_alpha = static_cast(e_w) - x0; - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - PrRoIPoolingDistributeDiff(diff, top_diff, e_h, e_w, h0, w0, tmp); -} - -#if defined(__NVCC__) || defined(__HIPCC__) -template -DEVICE void AccumulateRois(T* offset, T data) { - phi::CudaAtomicAdd(offset, data); -} -#else -template -inline HOSTDEVICE void AccumulateRois(T* offset, T data) { - *offset += data; -} -#endif - -#if defined(__NVCC__) || defined(__HIPCC__) -template -DEVICE T MaxFunctor(const T x, const T y) { - return max(x, y); -} -template -DEVICE T MinFunctor(const T x, const T y) { - return min(x, y); -} -#else -template -inline HOSTDEVICE T MaxFunctor(const T x, const T y) { - return std::max(x, y); -} -template -inline HOSTDEVICE T MinFunctor(const T x, const T y) { - return std::min(x, y); -} -#endif - -template -inline HOSTDEVICE static T PrRoIPoolingGetCoeff(T dh, T dw) { - dw = dw > 0 ? dw : -dw; - dh = dh > 0 ? dh : -dh; - return (1.0f - dh) * (1.0f - dw); -} - -template -inline HOSTDEVICE static T PrRoIPoolingInterpolation( - const T* data, const H h, const W w, const int height, const int width) { - T retVal = 0.0f; - int h1 = floorf(h); - int w1 = floorf(w); - retVal += - PrRoIPoolingGetData(data, h1, w1, height, width) * - PrRoIPoolingGetCoeff(h - static_cast(h1), w - static_cast(w1)); - h1 = floorf(h) + 1; - w1 = floorf(w); - retVal += - PrRoIPoolingGetData(data, h1, w1, height, width) * - PrRoIPoolingGetCoeff(h - static_cast(h1), w - static_cast(w1)); - h1 = floorf(h); - w1 = floorf(w) + 1; - retVal += - PrRoIPoolingGetData(data, h1, w1, height, width) * - PrRoIPoolingGetCoeff(h - static_cast(h1), w - static_cast(w1)); - h1 = floorf(h) + 1; - w1 = floorf(w) + 1; - retVal += - PrRoIPoolingGetData(data, h1, w1, height, width) * - PrRoIPoolingGetCoeff(h - static_cast(h1), w - static_cast(w1)); - return retVal; -} - -template -inline HOSTDEVICE T PrRoIPoolingSingleCoorIntegral(T s, T t, T c1, T c2) { - return 0.5f * (t * t - s * s) * c2 + - (t - 0.5f * t * t - s + 0.5f * s * s) * c1; -} - -template -inline HOSTDEVICE void PrRoIPoolingCoorBackward(int s_w, - int e_w, - int s_h, - int e_h, - int width, - int height, - T win_start_w, - T win_start_h, - T win_end_w, - T win_end_h, - int pw, - int ph, - const int pooled_width, - const int pooled_height, - T win_size, - const float spatial_scale, - const T* this_bottom_data, - const T* this_top_data, - T* this_data_grad, - const T* this_out_grad) { - T g_x1_y = 0.f; - T g_x2_y = 0.f; - T g_x_y1 = 0.f; - T g_x_y2 = 0.f; - - for (int h_iter = s_h; h_iter < e_h; ++h_iter) { - g_x1_y += PrRoIPoolingSingleCoorIntegral( - MaxFunctor(win_start_h, static_cast(h_iter)) - h_iter, - MinFunctor(win_end_h, static_cast(h_iter + 1)) - h_iter, - PrRoIPoolingInterpolation( - this_bottom_data, h_iter, win_start_w, height, width), - PrRoIPoolingInterpolation( - this_bottom_data, h_iter + 1, win_start_w, height, width)); - - g_x2_y += PrRoIPoolingSingleCoorIntegral( - MaxFunctor(win_start_h, static_cast(h_iter)) - h_iter, - MinFunctor(win_end_h, static_cast(h_iter + 1)) - h_iter, - PrRoIPoolingInterpolation( - this_bottom_data, h_iter, win_end_w, height, width), - PrRoIPoolingInterpolation( - this_bottom_data, h_iter + 1, win_end_w, height,
width)); - } - - for (int w_iter = s_w; w_iter < e_w; ++w_iter) { - g_x_y1 += PrRoIPoolingSingleCoorIntegral( - MaxFunctor(win_start_w, static_cast(w_iter)) - w_iter, - MinFunctor(win_end_w, static_cast(w_iter + 1)) - w_iter, - PrRoIPoolingInterpolation( - this_bottom_data, win_start_h, w_iter, height, width), - PrRoIPoolingInterpolation( - this_bottom_data, win_start_h, w_iter + 1, height, width)); - - g_x_y2 += PrRoIPoolingSingleCoorIntegral( - MaxFunctor(win_start_w, static_cast(w_iter)) - w_iter, - MinFunctor(win_end_w, static_cast(w_iter + 1)) - w_iter, - PrRoIPoolingInterpolation( - this_bottom_data, win_end_h, w_iter, height, width), - PrRoIPoolingInterpolation( - this_bottom_data, win_end_h, w_iter + 1, height, width)); - } - - float partial_x1 = -g_x1_y + (win_end_h - win_start_h) * (*this_top_data); - float partial_y1 = -g_x_y1 + (win_end_w - win_start_w) * (*this_top_data); - float partial_x2 = g_x2_y - (win_end_h - win_start_h) * (*this_top_data); - float partial_y2 = g_x_y2 - (win_end_w - win_start_w) * (*this_top_data); - - partial_x1 = partial_x1 / win_size * spatial_scale; - partial_x2 = partial_x2 / win_size * spatial_scale; - partial_y1 = partial_y1 / win_size * spatial_scale; - partial_y2 = partial_y2 / win_size * spatial_scale; - - AccumulateRois( - this_data_grad + 0, - (partial_x1 * (1.0 - static_cast(pw) / pooled_width) + - partial_x2 * (1.0 - static_cast(pw + 1) / pooled_width)) * - (*this_out_grad)); - AccumulateRois( - this_data_grad + 1, - (partial_y1 * (1.0 - static_cast(ph) / pooled_height) + - partial_y2 * (1.0 - static_cast(ph + 1) / pooled_height)) * - (*this_out_grad)); - AccumulateRois(this_data_grad + 2, - (partial_x2 * static_cast(pw + 1) / pooled_width + - partial_x1 * static_cast(pw) / pooled_width) * - (*this_out_grad)); - AccumulateRois(this_data_grad + 3, - (partial_y2 * static_cast(ph + 1) / pooled_height + - partial_y1 * static_cast(ph) / pooled_height) * - (*this_out_grad)); -} - -template -class CPUPRROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int input_channels = in_dims[1]; - auto output_channels = input_channels; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - if (rois_num == 0) return; - - auto in_stride = phi::stride(in_dims); - auto out_stride = phi::stride(out->dims()); - - const T* input_data = in->data(); - - phi::DenseTensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(ctx.GetPlace()); - if (ctx.HasInput("BatchRoINums") || rois->lod().empty()) { - auto* batchroinum = ctx.Input("BatchRoINums"); - auto* batch_index = batchroinum->data(); - int rois_batch_size = batchroinum->dims()[0]; - size_t c = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int64_t k = 0; k < batch_index[n]; ++k) { - rois_batch_id_data[c] = n; - c = c + 1; - } - } - } else { - PADDLE_ENFORCE_EQ(rois->lod().empty(), - false, - platform::errors::InvalidArgument( - "The lod of Input ROIs should not be empty when " - "BatchRoINums is None!")); - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ(rois_batch_size, 
- batch_size, - platform::errors::InvalidArgument( - "The rois_batch_size and input(X)'s " - "batch_size should be the same but received " - "rois_batch_size: %d and batch_size: %d", - rois_batch_size, - batch_size)); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num_with_lod, - rois_num, - platform::errors::InvalidArgument("The rois_num from input should be " - "equal to the rois_num from lod, " - "but received rois_num from input: " - "%d and the rois_num from lod: %d.", - rois_num_with_lod, - rois_num)); - - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - - T* output_data = out->mutable_data(ctx.GetPlace()); - const T* input_rois = rois->data(); - // calculate PrRoI pooling; parallel processing can be implemented per ROI - for (int n = 0; n < rois_num; ++n) { - // set roi batch id - int roi_batch_id = rois_batch_id_data[n]; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = static_cast(offset_input_rois[0]) * spatial_scale; - T roi_start_h = static_cast(offset_input_rois[1]) * spatial_scale; - T roi_end_w = static_cast(offset_input_rois[2]) * spatial_scale; - T roi_end_h = static_cast(offset_input_rois[3]) * spatial_scale; - - T roi_width = std::max(roi_end_w - roi_start_w, static_cast(0.0)); - T roi_height = std::max(roi_end_h - roi_start_h, static_cast(0.0)); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - T win_size = std::max(static_cast(0.0), bin_size_w * bin_size_h); - - // calculate each pixel of the output feature map.
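// Note on the math (an annotation, not part of the original file): each
// output value below is the exact integral of the bilinearly interpolated
// input feature map over the continuous bin window, normalized by the
// window area:
//   out(n, c, ph, pw) = (1 / win_size) * integral over bin(ph, pw) of f_c(x, y) dx dy
// PrRoIPoolingMatCalculation integrates the bilinear form in closed form
// over each unit cell, and an empty window (win_size == 0) yields 0.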
- int out_roi_offset = n * out_stride[0]; - for (int c = 0; c < output_channels; ++c) { - // per category - int out_plane_offset = out_roi_offset + c * out_stride[1]; - for (int ph = 0; ph < pooled_height; ++ph) { - int out_row_offset = out_plane_offset + ph * out_stride[2]; - for (int pw = 0; pw < pooled_width; ++pw) { - // calculate w and h at input feature map - T win_start_h = static_cast(ph) * bin_size_h + roi_start_h; - T win_start_w = static_cast(pw) * bin_size_w + roi_start_w; - T win_end_h = win_start_h + bin_size_h; - T win_end_w = win_start_w + bin_size_w; - // Add roi offsets and clip to input boundaries - int s_w = std::floor(win_start_w); - int e_w = std::ceil(win_end_w); - int s_h = std::floor(win_start_h); - int e_h = std::ceil(win_end_h); - - int output_index = out_row_offset + pw; - int input_channel = c; - int input_plane_offset = - roi_batch_id * in_stride[0] + input_channel * in_stride[1]; - const T* offset_input_data = input_data + input_plane_offset; - T sum_out = 0.; - - if (win_size > static_cast(0.0)) { - for (int w_iter = s_w; w_iter < e_w; ++w_iter) { - for (int h_iter = s_h; h_iter < e_h; ++h_iter) { - sum_out += PrRoIPoolingMatCalculation( - offset_input_data, - h_iter, - w_iter, - h_iter + 1, - w_iter + 1, - std::max(win_start_h, static_cast(h_iter)), - std::max(win_start_w, static_cast(w_iter)), - std::min(win_end_h, - static_cast(h_iter) + static_cast(1.0)), - std::min(win_end_w, - static_cast(w_iter) + static_cast(1.0)), - height, - width); - } - } - - output_data[output_index] = sum_out / win_size; - } else { - output_data[output_index] = 0.; - } - } - } - } - } - } -}; - -template -class CPUPRROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* out = ctx.Input("Out"); - auto* rois = ctx.Input("ROIs"); - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); - auto* input_roi_grad = - ctx.Output(framework::GradVarName("ROIs")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - if (input_grad || input_roi_grad) { - auto in_dims = in->dims(); - auto* in_data = in->data(); - auto* out_data = out->data(); - - int input_channels = in_dims[1]; - auto output_channels = input_channels; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - // set roi batch id - phi::DenseTensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(ctx.GetPlace()); - if (ctx.HasInput("BatchRoINums") || rois->lod().empty()) { - auto* batchroinum = ctx.Input("BatchRoINums"); - auto* batch_index = batchroinum->data(); - int rois_batch_size = batchroinum->dims()[0]; - size_t c = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int64_t k = 0; k < batch_index[n]; ++k) { - rois_batch_id_data[c] = n; - c = c + 1; - } - } - } else { - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - - const T* input_rois = rois->data(); - const T* output_grad_data = output_grad->data(); - - input_grad->mutable_data(ctx.GetPlace()); - 
input_roi_grad->mutable_data(ctx.GetPlace()); - // set gradient of X to be 0. before backpropagate. - phi::funcs::SetConstant set_zero; - set_zero(ctx.template device_context(), - input_grad, - static_cast(0)); - set_zero(ctx.template device_context(), - input_roi_grad, - static_cast(0)); - - T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - T* input_roi_grad_data = input_roi_grad->mutable_data(ctx.GetPlace()); - - // backpropagate gradient per output pixel - int output_grad_size = output_grad->numel(); - for (int i = 0; i < output_grad_size; ++i) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - int input_channel = c; - int input_offset = - (roi_batch_id * input_channels + input_channel) * height * width; - T* offset_input_grad_data = input_grad_data + input_offset; - const T* offset_output_grad_data = output_grad_data + i; - const T* offset_out_data = out_data + i; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = static_cast(offset_input_rois[0]) * spatial_scale; - T roi_start_h = static_cast(offset_input_rois[1]) * spatial_scale; - T roi_end_w = static_cast(offset_input_rois[2]) * spatial_scale; - T roi_end_h = static_cast(offset_input_rois[3]) * spatial_scale; - T* offset_input_roi_grad_data = input_roi_grad_data + n * 4; - - T roi_width = std::max(roi_end_w - roi_start_w, static_cast(0.0)); - T roi_height = std::max(roi_end_h - roi_start_h, static_cast(0.0)); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - T win_start_w = roi_start_w + bin_size_w * pw; - T win_start_h = roi_start_h + bin_size_h * ph; - T win_end_w = win_start_w + bin_size_w; - T win_end_h = win_start_h + bin_size_h; - - T win_size = std::max(static_cast(0.0), bin_size_w * bin_size_h); - - T sum_out = win_size == static_cast(0.) - ? static_cast(0.) 
- : *offset_output_grad_data / win_size; - - int s_w = std::floor(win_start_w); - int e_w = std::ceil(win_end_w); - int s_h = std::floor(win_start_h); - int e_h = std::ceil(win_end_h); - - for (int w_iter = s_w; w_iter < e_w; ++w_iter) { - for (int h_iter = s_h; h_iter < e_h; ++h_iter) { - PrRoIPoolingMatDistributeDiff( - offset_input_grad_data, - sum_out, - h_iter, - w_iter, - h_iter + 1, - w_iter + 1, - std::max(win_start_h, static_cast(h_iter)), - std::max(win_start_w, static_cast(w_iter)), - std::min(win_end_h, - static_cast(h_iter) + static_cast(1.0)), - std::min(win_end_w, - static_cast(w_iter) + static_cast(1.0)), - height, - width); - } - } - - const T* offset_in_data = in_data + input_offset; - PrRoIPoolingCoorBackward(s_w, - e_w, - s_h, - e_h, - width, - height, - win_start_w, - win_start_h, - win_end_w, - win_end_h, - pw, - ph, - pooled_width, - pooled_height, - win_size, - spatial_scale, - offset_in_data, - offset_out_data, - offset_input_roi_grad_data, - offset_output_grad_data); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/random_crop_op.cc b/paddle/fluid/operators/random_crop_op.cc deleted file mode 100644 index 62e805e323f84..0000000000000 --- a/paddle/fluid/operators/random_crop_op.cc +++ /dev/null @@ -1,107 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-#include "paddle/fluid/operators/random_crop_op.h" - -namespace paddle { -namespace operators { - -class RandomCropOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - auto shape = ctx->Attrs().Get>("shape"); - auto x_dim = ctx->GetInputDim("X"); - PADDLE_ENFORCE_GT( - x_dim.size(), - static_cast(shape.size()), - platform::errors::InvalidArgument( - "The dimensions of Input(X) must be greater than the length of " - "Attr(shape)," - "But received dimensions of Input(X) is [%d], receivecd length" - "of Attr(shape) is [%d].", - x_dim.size(), - static_cast(shape.size()))); - auto out_dim = phi::vectorize(x_dim); - for (size_t i = 1; i <= shape.size(); ++i) { - size_t x_i = x_dim.size() - i; - size_t shape_i = shape.size() - i; - if (ctx->IsRuntime() || - (x_dim[static_cast(x_i)] > 0 && shape[shape_i] > 0)) { - PADDLE_ENFORCE_GE( - x_dim[x_i], - shape[shape_i], - platform::errors::InvalidArgument( - "The dimensions of Input(X) must be larger than Attr(shape)," - "But received dimensions of Input(X) is [%d], received" - "size of Attr(shape) is [%d].", - x_dim[x_i], - shape[shape_i])); - } - out_dim[x_i] = shape[shape_i]; - } - ctx->SetOutputDim("Out", phi::make_ddim(out_dim)); - } - - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "A batch of instances to random crop."); - AddInput("Seed", "The random seed."); - AddOutput("Out", "The cropped instance batch."); - AddOutput("SeedOut", "The random seed after random cropping.") - .AsIntermediate(); - AddAttr>("shape", "The shape of a cropped instance."); - AddAttr("startup_seed", - "If the input 'Seed' is not initialized, the 'startup_seed' " - "will be used to replace it. Even so, the seed after random " - "crop will also be outputed to the 'SeedOut'.") - .SetDefault(0); - AddComment(R"DOC( - This operator takes a batch of instance, and do random cropping on each instance. - It means that cropping positions differs on each instance, which is determined - by an uniform random generator. All cropped instances have the same shape, which - is determined by the operator's attribute 'shape'. - )DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - random_crop, - ops::RandomCropOp, - ops::RandomCropOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -PD_REGISTER_STRUCT_KERNEL(random_crop, - CPU, - ALL_LAYOUT, - ops::RandomCropKernel, - float, - int, - double, - uint8_t, - int16_t) {} diff --git a/paddle/fluid/operators/random_crop_op.cu b/paddle/fluid/operators/random_crop_op.cu deleted file mode 100644 index 33182dff93fa4..0000000000000 --- a/paddle/fluid/operators/random_crop_op.cu +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/random_crop_op.h" - -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL(random_crop, - GPU, - ALL_LAYOUT, - ops::RandomCropKernel, - float, - int, - double, - uint8_t, - int16_t) {} diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h deleted file mode 100644 index fc625826b9a91..0000000000000 --- a/paddle/fluid/operators/random_crop_op.h +++ /dev/null @@ -1,232 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/for_range.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include -#endif - -namespace paddle { -namespace operators { - -template -struct Random; - -template <> -struct Random { - using Engine = std::minstd_rand; - - template - using UniformIntDist = std::uniform_int_distribution; -}; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template <> -struct Random { - using Engine = thrust::minstd_rand; - - template - using UniformIntDist = thrust::uniform_int_distribution; -}; -#endif - -template -HOSTDEVICE inline void StridedMemcpy(const T* x, - const size_t* x_dims, - T* out, - const size_t* out_dims, - int i, - int rank, - size_t prod_x_remain, - size_t prod_out_remain, - const size_t* offsets) { - size_t x_dim_i = x_dims[i]; - size_t out_dim_i = out_dims[i]; - size_t x_stride = prod_x_remain / x_dim_i; - size_t out_stride = prod_out_remain / out_dim_i; - size_t offset_i = offsets[i]; - - if (i == rank - 1) { - x += offset_i; - for (size_t j = 0; j < out_dim_i; ++j) { - *out++ = *x++; - } - } else { - x += offset_i * x_stride; - for (size_t j = 0; j < out_dim_i; ++j) { - StridedMemcpy( - x, x_dims, out, out_dims, i + 1, rank, x_stride, out_stride, offsets); - x += x_stride; - out += out_stride; - } - } -} - -template -struct RandomCropFunctor { - const T* x_; - T* out_; - size_t x_dims_[9]; - size_t out_dims_[9]; - int num_batchsize_dims_; - int rank_; - int64_t seed_; - - size_t prod_batchsize_dims_; - size_t prod_x_ins_dims_; - size_t prod_out_ins_dims_; - - RandomCropFunctor(const T* x, - T* out, - const framework::DDim& x_dims, - const framework::DDim& out_dims, - int num_batchsize_dims, - int64_t seed) - : x_(x), - out_(out), - num_batchsize_dims_(num_batchsize_dims), - rank_(x_dims.size()), - seed_(seed) { - PADDLE_ENFORCE_EQ( - x_dims.size(), - out_dims.size(), - platform::errors::InvalidArgument( - 
"The dimensions of Input(X) must equal to be the dimensions" - "of Output(Out), but received dimensions of Input(X) is [%d]," - "received dimensions of Output(Out) is [%d].", - x_dims.size(), - out_dims.size())); - PADDLE_ENFORCE_GT( - rank_, - num_batchsize_dims_, - platform::errors::InvalidArgument( - "The dimensions of Input(X) must be greater than the diff" - "value of Input(X)'s dimensions minus Atrr(shape)'s dimensions," - "But received Input(X)'s dimensions is [%d], received value of" - "Input(X)'s dimensions minus Attr(shape)'s dimensions is [%d].", - rank_, - num_batchsize_dims_)); - prod_batchsize_dims_ = 1; - prod_x_ins_dims_ = 1; - prod_out_ins_dims_ = 1; - for (size_t i = 0; i < static_cast(rank_); ++i) { - size_t x_dim_i = x_dims[i]; - size_t out_dim_i = out_dims[i]; - x_dims_[i] = x_dim_i; - out_dims_[i] = out_dim_i; - if (i < static_cast(num_batchsize_dims_)) { - PADDLE_ENFORCE_EQ( - x_dim_i, - out_dim_i, - platform::errors::InvalidArgument( - "The first [%d] dimension value of Input(X) and Output(Out)" - "must be equal, but received the [%d] dimension value of" - "Input(X) and Output(Out) respectively are [%d] and [%d].", - num_batchsize_dims_, - i, - x_dim_i, - out_dim_i)); - prod_batchsize_dims_ *= x_dim_i; - } else { - prod_x_ins_dims_ *= x_dim_i; - prod_out_ins_dims_ *= out_dim_i; - } - } - } - - HOSTDEVICE void operator()(size_t ins_idx) { - typename Random::Engine engine(seed_); - engine.discard(ins_idx * (rank_ - num_batchsize_dims_)); - size_t offsets[9] = {}; - for (int i = num_batchsize_dims_; i < rank_; ++i) { - typename Random::template UniformIntDist dist( - 0, x_dims_[i] - out_dims_[i]); - offsets[i - num_batchsize_dims_] = dist(engine); - } - - const T* x = x_ + ins_idx * prod_x_ins_dims_; - T* out = out_ + ins_idx * prod_out_ins_dims_; - - StridedMemcpy(x, - x_dims_ + num_batchsize_dims_, - out, - out_dims_ + num_batchsize_dims_, - 0, - rank_ - num_batchsize_dims_, - prod_x_ins_dims_, - prod_out_ins_dims_, - offsets); - } -}; - -template -class RandomCropKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& ctx) const { - int64_t seed = 0; - auto& seed_tensor = GET_DATA_SAFELY( - ctx.Input("Seed"), "Input", "Seed", "RandomCrop"); - if (seed_tensor.IsInitialized()) { - if (platform::is_cpu_place(seed_tensor.place())) { - seed = *seed_tensor.template data(); - } else { - LOG(WARNING) << "It is slow to place seed in GPU memory. 
Please verify " - "your program"; - phi::DenseTensor cpu_seed; - framework::TensorCopySync(seed_tensor, platform::CPUPlace(), &cpu_seed); - seed = *cpu_seed.data(); - } - } else { - VLOG(5) << "WARNING: The input 'Seed' is not initialized, use attribute " - "'startup_seed' instead."; - seed = ctx.Attr("startup_seed"); - } - auto shape = ctx.Attr>("shape"); - auto& x = GET_DATA_SAFELY( - ctx.Input("X"), "Input", "X", "RandomCrop"); - auto& out = GET_DATA_SAFELY( - ctx.Output("Out"), "Output", "Out", "RandomCrop"); - - int num_batchsize_dims = x.dims().size() - shape.size(); - RandomCropFunctor functor( - x.template data(), - out.template mutable_data(ctx.GetPlace()), - x.dims(), - out.dims(), - num_batchsize_dims, - seed); - platform::ForRange for_range( - ctx.template device_context(), - functor.prod_batchsize_dims_); - - for_range(functor); - - Random::Engine engine(seed); - engine.discard(functor.prod_batchsize_dims_ * - (functor.rank_ - functor.num_batchsize_dims_)); - *ctx.Output("SeedOut")->mutable_data( - phi::make_ddim({1}), platform::CPUPlace()) = engine(); - } -}; - -// TODO(fengjiayi): Backward of random crop op - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/space_to_depth_op.cc b/paddle/fluid/operators/space_to_depth_op.cc deleted file mode 100644 index da0c2e4a3cbb2..0000000000000 --- a/paddle/fluid/operators/space_to_depth_op.cc +++ /dev/null @@ -1,242 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/space_to_depth_op.h" - -#include -#include -#include - -#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" - -namespace paddle { -namespace operators { - -class SpaceToDepthOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - platform::errors::InvalidArgument( - "Input(X) of SpaceToDepthOp should not be null.")); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - platform::errors::InvalidArgument( - "Output(Out) of SpaceToDepthOp should not be null.")); - - auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - x_dims.size(), - 4, - platform::errors::InvalidArgument("input should be a 4D tensor")); - auto blocksize = ctx->Attrs().Get("blocksize"); - - PADDLE_ENFORCE_GT(blocksize, - 1, - platform::errors::InvalidArgument( - "The blocksize should be greater than 1")); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_GT(x_dims[1], - 0, - platform::errors::InvalidArgument( - "input channel should be greater than 0")); - PADDLE_ENFORCE_GT(x_dims[2], - 0, - platform::errors::InvalidArgument( - "input Height should be greater than 0")); - PADDLE_ENFORCE_GT(x_dims[3], - 0, - platform::errors::InvalidArgument( - "input Width should be greater than 0")); - - PADDLE_ENFORCE_EQ( - x_dims[1] % (blocksize * blocksize), - 0, - platform::errors::InvalidArgument( - "input channel should be divisible by the square of " - "SpaceToDepthOp blocksize")); - PADDLE_ENFORCE_EQ(x_dims[2] % (blocksize), - 0, - platform::errors::InvalidArgument( - "input Height should be divisible by " - "SpaceToDepthOp blocksize")); - PADDLE_ENFORCE_EQ(x_dims[3] % (blocksize), - 0, - platform::errors::InvalidArgument( - "input Width should be divisible by " - "SpaceToDepthOp blocksize")); - } else { - if (x_dims[1] != -1) { - PADDLE_ENFORCE_GT(x_dims[1], - 0, - platform::errors::InvalidArgument( - "input channel should be greater than 0")); - PADDLE_ENFORCE_EQ( - x_dims[1] % (blocksize * blocksize), - 0, - platform::errors::InvalidArgument( - "input channel should be divisible by the square of " - "SpaceToDepthOp blocksize")); - } - if (x_dims[2] != -1) { - PADDLE_ENFORCE_GT(x_dims[2], - 0, - platform::errors::InvalidArgument( - "input Height should be greater than 0")); - PADDLE_ENFORCE_EQ( - x_dims[2] % (blocksize), - 0, - platform::errors::InvalidArgument( - "input Height should be divisible by " - "SpaceToDepthOp blocksize")); - } - - if (x_dims[3] != -1) { - PADDLE_ENFORCE_GT(x_dims[3], - 0, - platform::errors::InvalidArgument( - "input Width should be greater than 0")); - - PADDLE_ENFORCE_EQ( - x_dims[3] % (blocksize), - 0, - platform::errors::InvalidArgument( - "input Width should be divisible by " - "SpaceToDepthOp blocksize")); - } - } - - VLOG(3) << "SpaceToDepthOp operator x.shape=" << x_dims - << ", Attribute blocksize=" << blocksize << std::endl; - - std::vector output_shape(4, 0); // [B,C,H,W] - output_shape[0] = x_dims[0]; - output_shape[1] = x_dims[1] * blocksize * blocksize; - output_shape[2] = x_dims[2] / blocksize; - output_shape[3] = x_dims[3] / blocksize; - - auto out_dims = phi::make_ddim(output_shape); - - ctx->SetOutputDim("Out", out_dims); - - if (x_dims[0] == out_dims[0]) { - // Only pass LoD when the first dimension of output and Input(X) - // are the same.
- ctx->ShareLoD("X", /*->*/ "Out"); - } - } -}; - -class SpaceToDepthOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(Tensor). The input should be a 4D tensor B * C * H * W of " - "SpaceToDepthOp " - "operator."); - AddOutput("Out", - "(Tensor), The output should be a 4D tensor B * C2 * H2 * W2 of " - "SpaceToDepthOp operator."); - AddAttr( - "blocksize", - "(int64_t, default 2) blocksize used to change Space To Depth.") - .SetDefault(2) - .GreaterThan(1); - AddComment(R"DOC( - reorg operator used in Yolo v2. - The output shape is: C2 = C1 * blocksize * blocksize, H2 = H1 / blocksize, W2 = W1 / blocksize. - - Reshape Input(X) into the shape according to Attr(blocksize). The - data in Input(X) are unchanged. - - Examples: - - 1. Given a 4-D tensor Input(X) with a shape [128, 2048, 26, 26], and the blocksize is 2, the reorg operator will transform Input(X) - into a 4-D tensor with shape [128, 8192, 13, 13], leaving Input(X)'s data unchanged. - - )DOC"); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(SpaceToDepthGradOpNoBufferVarsInferer, "X"); - -template -class SpaceToDepthGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("space_to_depth_grad"); - - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetInput("X", this->Input("X")); - - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - - op->SetAttrMap(this->Attrs()); - } -}; - -class SpaceToDepthGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE( - ctx->HasInput("X"), - platform::errors::InvalidArgument("Input(X) shouldn't be null.")); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - platform::errors::InvalidArgument( - "Input(Out@GRAD) shouldn't be null.")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.GetPlace()); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(space_to_depth, - ops::SpaceToDepthOp, - ops::SpaceToDepthOpMaker, - ops::SpaceToDepthGradOpMaker, - ops::SpaceToDepthGradOpMaker); -REGISTER_OPERATOR(space_to_depth_grad, - ops::SpaceToDepthGradOp, - ops::SpaceToDepthGradOpNoBufferVarsInferer); -PD_REGISTER_STRUCT_KERNEL(space_to_depth, - CPU, - ALL_LAYOUT, - ops::SpaceToDepthKernel, - int, - int64_t, - float, - double) {} -PD_REGISTER_STRUCT_KERNEL(space_to_depth_grad, - CPU, - ALL_LAYOUT, - ops::SpaceToDepthGradKernel, - int, - int64_t, - float, - double) {} diff --git a/paddle/fluid/operators/space_to_depth_op.cu deleted file mode 100644 index 7f62509ee7d2e..0000000000000 --- a/paddle/fluid/operators/space_to_depth_op.cu +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/space_to_depth_op.h" - -namespace plat = paddle::platform; -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL(space_to_depth, - GPU, - ALL_LAYOUT, - ops::SpaceToDepthKernel, - int, - int64_t, - float, - double) {} -PD_REGISTER_STRUCT_KERNEL(space_to_depth_grad, - GPU, - ALL_LAYOUT, - ops::SpaceToDepthGradKernel, - int, - int64_t, - float, - double) {} diff --git a/paddle/fluid/operators/space_to_depth_op.h b/paddle/fluid/operators/space_to_depth_op.h deleted file mode 100644 index 18ff67c6132be..0000000000000 --- a/paddle/fluid/operators/space_to_depth_op.h +++ /dev/null @@ -1,131 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#ifndef PADDLE_FLUID_OPERATORS_SPACE_TO_DEPTH_OP_H_ -#define PADDLE_FLUID_OPERATORS_SPACE_TO_DEPTH_OP_H_ -#endif // PADDLE_FLUID_OPERATORS_SPACE_TO_DEPTH_OP_H_ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { - -template -class space_to_depth_compute { - public: - HOSTDEVICE space_to_depth_compute(const T *x, - int64_t w, - int64_t h, - int64_t c, - int64_t batch, - int64_t blocksize, - int64_t forward, - T *out) - : x_(x), - w_(w), - h_(h), - c_(c), - batch_(batch), - blocksize_(blocksize), - forward_(forward), - out_(out) {} - - HOSTDEVICE void operator()(int64_t in_index) { - int64_t out_c = c_ / (blocksize_ * blocksize_); - // calculate each dim position with index of tensor - int64_t b = in_index / (c_ * h_ * w_); - int64_t k = (in_index % (c_ * h_ * w_)) / (h_ * w_); - int64_t j = ((in_index % (c_ * h_ * w_)) % (h_ * w_)) / w_; - int64_t i = ((in_index % (c_ * h_ * w_)) % (h_ * w_)) % w_; - - int64_t c2 = k % out_c; - int64_t offset = k / out_c; - int64_t w2 = i * blocksize_ + offset % blocksize_; - int64_t h2 = j * blocksize_ + offset / blocksize_; - int64_t out_index = - w2 + w_ * blocksize_ * (h2 + h_ * blocksize_ * (c2 + out_c * b)); - if (forward_) - out_[out_index] = x_[in_index]; - else - out_[in_index] = x_[out_index]; - } - - private: - const T *x_; - int64_t w_, h_, c_, batch_, blocksize_, forward_; - T *out_; -}; - -template -class SpaceToDepthKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *out = context.Output("Out"); - auto *x = context.Input("X"); - auto blocksize = context.Attr("blocksize"); - auto in_dims = x->dims(); - out->mutable_data(context.GetPlace(), x->type()); - - auto out_dims = out->dims(); - auto B = in_dims[0]; - auto C = in_dims[1]; - 
auto H = in_dims[2]; - auto W = in_dims[3]; - platform::ForRange for_range( - context.template device_context(), - static_cast(x->numel())); - - auto *x_data = x->data(); - auto *out_data = out->data(); - paddle::operators::space_to_depth_compute computer( - x_data, W, H, C, B, blocksize, 1, out_data); - for_range(computer); - - out->Resize(out_dims); - } -}; - -template -class SpaceToDepthGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *d_out = - context.Input(framework::GradVarName("Out")); - auto *d_x = context.Output(framework::GradVarName("X")); - auto blocksize = context.Attr("blocksize"); - auto in_dims = d_x->dims(); - d_x->mutable_data(context.GetPlace(), d_out->type()); - - auto B = in_dims[0]; - auto C = in_dims[1]; - auto H = in_dims[2]; - auto W = in_dims[3]; - - platform::ForRange for_range( - context.template device_context(), - static_cast(d_x->numel())); - - auto *dx_data = d_x->data(); - auto *dout_data = d_out->data(); - - paddle::operators::space_to_depth_compute computer( - dout_data, W, H, C, B, blocksize, 0, dx_data); - for_range(computer); - - d_x->Resize(in_dims); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/squared_l2_distance_op.cc deleted file mode 100644 index 0f2f727dd9135..0000000000000 --- a/paddle/fluid/operators/squared_l2_distance_op.cc +++ /dev/null @@ -1,232 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/squared_l2_distance_op.h" - -#include - -#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" - -namespace paddle { -namespace operators { - -class SquaredL2DistanceOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SquaredL2DistanceOp"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "SquaredL2DistanceOp"); - OP_INOUT_CHECK(ctx->HasOutput("sub_result"), - "Output", - "sub_result", - "SquaredL2DistanceOp"); - OP_INOUT_CHECK( - ctx->HasOutput("Out"), "Output", "Out", "SquaredL2DistanceOp"); - - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - - PADDLE_ENFORCE_EQ(phi::arity(x_dims), - phi::arity(y_dims), - platform::errors::InvalidArgument( - "Input(X) and Input(Y) of SquaredL2DistanceOp should " - "have the same dimensions. " - "But received X's shape = [%s] and Y's shape = [%s], " - "the dimensions are %d and %d respectively", - x_dims, - y_dims, - phi::arity(x_dims), - phi::arity(y_dims))); - - int rank = phi::arity(x_dims); - PADDLE_ENFORCE_GE( - rank, - 2, - platform::errors::InvalidArgument( - "Input dimensions of SquaredL2DistanceOp should be at least 2."
- "But received shape = [%s] and dimension is %d.", - x_dims, - rank)); - bool check = true; - if ((!ctx->IsRuntime()) && - (phi::product(x_dims) <= 0 || phi::product(y_dims) <= 0)) { - check = false; - } - if (check) { - PADDLE_ENFORCE_EQ( - product(x_dims) / x_dims[0], - product(y_dims) / y_dims[0], - platform::errors::InvalidArgument( - "Input(X) and Input(Y) of SquaredL2DistanceOp should " - "have same dimensions." - "But received X's shape = [%s] and Y's shape = [%s]" - ", the products are %d and %d respectively", - x_dims, - y_dims, - product(x_dims) / x_dims[0], - product(y_dims) / y_dims[0])); - } - check = true; - if ((!ctx->IsRuntime()) && (y_dims[0] <= 0 || x_dims[0] <= 0)) { - check = false; - } - if (check) { - PADDLE_ENFORCE_EQ( - y_dims[0] == 1 || y_dims[0] == x_dims[0], - true, - platform::errors::InvalidArgument( - "First dimension of Input(Y) of SquaredL2DistanceOp " - "must be equal to 1 or to first dimension of Input(X)." - "But received X's shape = [%s] and Y's shape = [%s]," - "the first dimensions are %d and %d respectively", - x_dims, - y_dims, - x_dims[0], - y_dims[0])); - } - ctx->SetOutputDim("sub_result", {x_dims[0], product(x_dims) / x_dims[0]}); - ctx->SetOutputDim("Out", {x_dims[0], 1}); - ctx->ShareLoD("X", /*->*/ "Out"); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(SquaredL2DistanceGradOpNoBufferVarsInferer, - "X", - "Y"); - -template -class SquaredL2DistanceGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("squared_l2_distance_grad"); - - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetInput("sub_result", this->Output("sub_result")); - op->SetInput("X", this->Input("X")); - op->SetInput("Y", this->Input("Y")); - - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y")); - - op->SetAttrMap(this->Attrs()); - } -}; - -class SquaredL2DistanceOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor) Input of SquaredL2DistanceOp."); - AddInput("Y", "(Tensor) Target of SquaredL2DistanceOp."); - AddOutput("sub_result", - "(Tensor) Buffering subtraction result which " - "will be reused in backward.") - .AsIntermediate(); - AddOutput("Out", "(Tensor) Squared l2 distance between input and target."); - AddComment(R"DOC( -SquaredL2Distance operator - -This operator will cacluate the squared L2 distance for the input and -the target. Number of distance value will be equal to the first dimension -of input. First dimension of the target could be equal to the input or to 1. -If the first dimension of target is 1, the operator will broadcast target's -first dimension to input's first dimension. During backward propagation, -the user can decide whether to calculate the gradient of the input or -the target or both. - -Both the input X and Y can carry the LoD (Level of Details) information. -However, the output only shares the LoD information with input X. 
- )DOC"); - } -}; - -class SquaredL2DistanceGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("sub_result"), - "Input", - "sub_result", - "SquaredL2DistanceGradOp"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - "Out@GRAD", - "SquaredL2DistanceGradOp"); - auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - out_dims[0], - x_dims[0], - platform::errors::InvalidArgument( - "First dimension of output gradient and Input(X) " - "of SquaredL2DistanceGradOp must be equal " - "But received X's shape = [%s] and grad's shape = [%s], " - "the first dimensions are %d and %d respectively", - x_dims, - out_dims, - x_dims[0], - out_dims[0])); - PADDLE_ENFORCE_EQ(out_dims[1], - 1, - platform::errors::InvalidArgument( - "Second dimension of output gradient of " - "SquaredL2DistanceGradOp must be 1. " - "But received grad's shape = [%s], " - "with second dimension %d", - out_dims, - out_dims[1])); - } - auto x_grad_name = framework::GradVarName("X"); - auto y_grad_name = framework::GradVarName("Y"); - if (ctx->HasOutput(x_grad_name)) ctx->SetOutputDim(x_grad_name, x_dims); - if (ctx->HasOutput(y_grad_name)) ctx->SetOutputDim(y_grad_name, y_dims); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey( - OperatorWithKernel::IndicateVarDataType(ctx, "sub_result"), - ctx.GetPlace()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - squared_l2_distance, - ops::SquaredL2DistanceOp, - ops::SquaredL2DistanceOpMaker, - ops::SquaredL2DistanceGradOpMaker, - ops::SquaredL2DistanceGradOpMaker); -REGISTER_OPERATOR(squared_l2_distance_grad, - ops::SquaredL2DistanceGradOp, - ops::SquaredL2DistanceGradOpNoBufferVarsInferer); - -PD_REGISTER_STRUCT_KERNEL( - squared_l2_distance, CPU, ALL_LAYOUT, ops::SquaredL2DistanceKernel, float) { -} -PD_REGISTER_STRUCT_KERNEL(squared_l2_distance_grad, - CPU, - ALL_LAYOUT, - ops::SquaredL2DistanceGradKernel, - float) {} diff --git a/paddle/fluid/operators/squared_l2_distance_op.cu b/paddle/fluid/operators/squared_l2_distance_op.cu deleted file mode 100644 index 4411df4d9ab7f..0000000000000 --- a/paddle/fluid/operators/squared_l2_distance_op.cu +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include "paddle/fluid/operators/squared_l2_distance_op.h" - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL( - squared_l2_distance, GPU, ALL_LAYOUT, ops::SquaredL2DistanceKernel, float) { -} -PD_REGISTER_STRUCT_KERNEL(squared_l2_distance_grad, - GPU, - ALL_LAYOUT, - ops::SquaredL2DistanceGradKernel, - float) {} diff --git a/paddle/fluid/operators/squared_l2_distance_op.h deleted file mode 100644 index 18039835c55c3..0000000000000 --- a/paddle/fluid/operators/squared_l2_distance_op.h +++ /dev/null @@ -1,132 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class SquaredL2DistanceKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("X"); - auto* in1 = context.Input("Y"); - auto* out0 = context.Output("sub_result"); - auto* out1 = context.Output("Out"); - - auto in0_dims = in0->dims(); - auto in1_dims = in1->dims(); - - int cols = in0->numel() / in0_dims[0]; - // reduce dimensions except the first - auto x = framework::EigenMatrix::From( - *in0, phi::make_ddim({in0_dims[0], cols})); - auto y = framework::EigenMatrix::From( - *in1, phi::make_ddim({in1_dims[0], cols})); - - out0->mutable_data(context.GetPlace()); - out1->mutable_data(context.GetPlace()); - auto sub_result = framework::EigenMatrix::From(*out0); - auto z = framework::EigenVector::Flatten(*out1); - - auto& place = - *context.template device_context().eigen_device(); - auto x_dims = x.dimensions(); - auto y_dims = y.dimensions(); - // buffer the subtraction result - if (y_dims[0] == 1 && x_dims[0] > y_dims[0]) { - sub_result.device(place) = - x - - y.broadcast(Eigen::array({{static_cast(x_dims[0]), 1}})); - } else { - sub_result.device(place) = x - y; - } - auto sub_res_pow2 = sub_result * sub_result; - z.device(place) = sub_res_pow2.sum(Eigen::array({{1}})); - } -}; - -template -class SquaredL2DistanceGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("sub_result"); - auto* in1 = context.Input(framework::GradVarName("Out")); - auto* x_g = context.Output(framework::GradVarName("X")); - auto* y_g = context.Output(framework::GradVarName("Y")); - - PADDLE_ENFORCE_NOT_NULL( - x_g, - platform::errors::NotFound( - "variable(%s) cannot be found " - "in scope for operator 'squared_l2_distance_grad'.", - framework::GradVarName("X"))); - PADDLE_ENFORCE_NOT_NULL( - y_g, - platform::errors::NotFound( - "variable(%s) cannot be found " - "in scope for operator 'squared_l2_distance_grad'.", - framework::GradVarName("Y"))); - - auto sub_result = framework::EigenMatrix::From(*in0); - auto out_grad = framework::EigenMatrix::From(*in1); - - auto x_dims = x_g->dims();
- auto y_dims = y_g->dims(); - - int cols = x_g->numel() / x_dims[0]; - // calculate gradient - auto grad_mat = 2 * - (out_grad.broadcast(Eigen::array({{1, cols}}))) * - sub_result; - - // propagate back to input - auto& eigen_place = - *context.template device_context().eigen_device(); - - x_g->mutable_data(context.GetPlace()); - // eigen matrix - auto x_grad = framework::EigenMatrix::From( - *x_g, phi::make_ddim({x_dims[0], cols})); - // dimensions are the same as sub_result - x_grad.device(eigen_place) = grad_mat; - - y_g->mutable_data(context.GetPlace()); - - PADDLE_ENFORCE_GE(sub_result.dimensions()[0], - y_dims[0], - platform::errors::InvalidArgument( - "First dimension of gradient must be greater than or " - "equal to first dimension of target. But received " - "gradient dimension = %d and target dimension is %d.", - sub_result.dimensions()[0], - y_dims[0])); - - if (sub_result.dimensions()[0] == y_dims[0]) { - auto y_grad = framework::EigenMatrix::From( - *y_g, phi::make_ddim({y_dims[0], cols})); - y_grad.device(eigen_place) = -1 * grad_mat; - } else { - auto col_sum_res = -1 * (grad_mat.sum(Eigen::array({{0}}))); - auto y_grad = framework::EigenVector::Flatten(*y_g); - y_grad.device(eigen_place) = col_sum_res; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/tree_conv_op.cc deleted file mode 100644 index 7265d966b9e2a..0000000000000 --- a/paddle/fluid/operators/tree_conv_op.cc +++ /dev/null @@ -1,240 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/tree_conv_op.h" - -#include -#include - -namespace paddle { -namespace operators { -class TreeConvOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("NodesVector", - "(Tensor) The feature vector of every node on the tree. " - "The shape of the feature vector must be " - "[max_tree_node_size, feature_size]."); - AddInput("EdgeSet", - "(Tensor) The edges of the tree. The edges must be directed. " - "The shape of the edge set must be [max_tree_node_size, 2]."); - AddInput("Filter", - "(Tensor) The feature detector. " - "The shape of the filter is " - "[feature_size, 3, output_size, num_filters]."); - AddOutput("Out", - "(Tensor) The feature vector of subtrees. " - "The shape of the output tensor is [max_tree_node_size, " - "output_size, num_filters]. " - "The output tensor could be a new feature " - "vector for the next tree convolution layers."); - AddAttr("max_depth", - "(int, default: 2) The depth of the feature detector.") - .SetDefault(2) - .GreaterThan(1); - AddComment(R"DOC( -**Tree-Based Convolution Operator** - -Tree-Based Convolution is a kind of convolution based on tree structure. -Tree-Based Convolution is a part of the Tree-Based Convolutional Neural Network (TBCNN), -which is used to classify tree structures, such as Abstract Syntax Trees.
-Tree-Based Convolution proposed a kind of data structure called continuous binary tree, -which regards multiway tree as binary tree. -The paper of Tree-Based Convolution Operator is here: -https://arxiv.org/abs/1409.5718v1 -)DOC"); - } -}; -class TreeConvOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("NodesVector"), "Input", "NodesVector", "TreeConv"); - OP_INOUT_CHECK(ctx->HasInput("Filter"), "Input", "Filter", "TreeConv"); - OP_INOUT_CHECK(ctx->HasInput("EdgeSet"), "Input", "EdgeSet", "TreeConv"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "TreeConv"); - - auto edge_dims = ctx->GetInputDim("EdgeSet"); - auto vector_dims = ctx->GetInputDim("NodesVector"); - auto filter_dims = ctx->GetInputDim("Filter"); - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(edge_dims[2], - 2, - platform::errors::InvalidArgument( - "Input(EdgeSet) dim[2] should be 2. " - "But received Input(EdgeSet) dim[2] is %d.", - edge_dims[2])); - } else { - if (edge_dims[2] != -1) { - PADDLE_ENFORCE_EQ(edge_dims[2], - 2, - platform::errors::InvalidArgument( - "Input(EdgeSet) dim[2] should be 2. " - "But received Input(EdgeSet) dim[2] is %d.", - edge_dims[2])); - } - } - PADDLE_ENFORCE_EQ(edge_dims.size(), - 3, - platform::errors::InvalidArgument( - "The dimension of EdgeSet Tensor should be 3. " - "But received the dimension of EdgeSet Tensor is %d.", - edge_dims.size())); - PADDLE_ENFORCE_EQ( - vector_dims.size(), - 3, - platform::errors::InvalidArgument( - "The dimension of NodesVector Tensor should be 3. " - "But received the dimension of NodesVector Tensor is %d.", - vector_dims.size())); - PADDLE_ENFORCE_EQ(filter_dims.size(), - 4, - platform::errors::InvalidArgument( - "The dimension of Filter Tensor should be 4. " - "But received the dimension of Filter Tensor is %d.", - filter_dims.size())); - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(filter_dims[1], - 3, - platform::errors::InvalidArgument( - "Input(Filter) dim[1] should be 3. " - "But received Input(Filter) dim[1] is %d.", - filter_dims[1])); - PADDLE_ENFORCE_EQ( - filter_dims[0], - vector_dims[2], - platform::errors::InvalidArgument( - "Input(Filter) dim[0] must equal to Input(NodesVector) dim[2]. " - "But received Input(Filter) dim[0] = %d, Input(NodesVector) " - "dim[2] = %d.", - filter_dims[0], - vector_dims[2])); - } else { - if (filter_dims[1] != -1) { - PADDLE_ENFORCE_EQ(filter_dims[1], - 3, - platform::errors::InvalidArgument( - "Input(Filter) dim[1] should be 3. " - "But received Input(Filter) dim[1] is %d.", - filter_dims[1])); - } - - if (filter_dims[0] != -1 && vector_dims[2] != -1) { - PADDLE_ENFORCE_EQ( - filter_dims[0], - vector_dims[2], - platform::errors::InvalidArgument( - "Input(Filter) dim[0] must equal to Input(NodesVector) dim[2]. 
" - "But received Input(Filter) dim[0] = %d, Input(NodesVector) " - "dim[2] = %d.", - filter_dims[0], - vector_dims[2])); - } - } - auto output_dims = phi::make_ddim( - {vector_dims[0], vector_dims[1], filter_dims[2], filter_dims[3]}); - ctx->SetOutputDim("Out", output_dims); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey( - OperatorWithKernel::IndicateVarDataType(ctx, "NodesVector"), - ctx.GetPlace()); - } -}; - -template -class TreeConvGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("tree_conv_grad"); - - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetInput("Filter", this->Input("Filter")); - op->SetInput("EdgeSet", this->Input("EdgeSet")); - op->SetInput("NodesVector", this->Input("NodesVector")); - - op->SetOutput(framework::GradVarName("NodesVector"), - this->InputGrad("NodesVector")); - op->SetOutput(framework::GradVarName("Filter"), this->InputGrad("Filter")); - - op->SetAttrMap(this->Attrs()); - } -}; - -class TreeConvGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Filter"), "Input", "Filter", "grad_TreeConv"); - OP_INOUT_CHECK( - ctx->HasInput("EdgeSet"), "Input", "EdgeSet", "grad_TreeConv"); - OP_INOUT_CHECK( - ctx->HasInput("NodesVector"), "Input", "NodesVector", "grad_TreeConv"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "grad_TreeConv"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("NodesVector")), - "Output", - framework::GradVarName("NodesVector"), - "grad_TreeConv"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Filter")), - "Output", - framework::GradVarName("Filter"), - "grad_TreeConv"); - - auto vectors_dims = ctx->GetInputDim("NodesVector"); - auto filter_dims = ctx->GetInputDim("Filter"); - if (ctx->HasOutput(framework::GradVarName("Filter"))) { - ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims); - } - if (ctx->HasOutput(framework::GradVarName("NodesVector"))) { - ctx->SetOutputDim(framework::GradVarName("NodesVector"), vectors_dims); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey( - OperatorWithKernel::IndicateVarDataType(ctx, "NodesVector"), - ctx.GetPlace()); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(tree_conv, - ops::TreeConvOp, - ops::TreeConvOpMaker, - ops::TreeConvGradOpMaker, - ops::TreeConvGradOpMaker); - -REGISTER_OPERATOR(tree_conv_grad, ops::TreeConvGradOp); - -PD_REGISTER_STRUCT_KERNEL( - tree_conv, CPU, ALL_LAYOUT, ops::TreeConvKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL( - tree_conv_grad, CPU, ALL_LAYOUT, ops::TreeConvGradKernel, float, double) {} diff --git a/paddle/fluid/operators/tree_conv_op.cu b/paddle/fluid/operators/tree_conv_op.cu deleted file mode 100644 index 1bfcb94013c2b..0000000000000 --- a/paddle/fluid/operators/tree_conv_op.cu +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/tree_conv_op.h" - -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL( - tree_conv, GPU, ALL_LAYOUT, ops::TreeConvKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL( - tree_conv_grad, GPU, ALL_LAYOUT, ops::TreeConvGradKernel, float, double) {} diff --git a/paddle/fluid/operators/tree_conv_op.h b/paddle/fluid/operators/tree_conv_op.h deleted file mode 100644 index 18fd5bea29d30..0000000000000 --- a/paddle/fluid/operators/tree_conv_op.h +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/tree2col.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { -using DDim = framework::DDim; -template -class TreeConvKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - math::Tree2ColFunctor tree2col; - phi::funcs::SetConstant constant; - - auto *Edges = ctx.Input("EdgeSet"); - auto *Embeddings = ctx.Input("NodesVector"); - auto *Filter = ctx.Input("Filter"); - auto *output_emb = ctx.Output("Out"); - int max_depth = ctx.Attr("max_depth"); - - auto &dev_ctx = ctx.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - - phi::DenseTensor W; - W.ShareDataWith(*Filter); - W.Resize(phi::flatten_to_2d(Filter->dims(), 2)); - - int batch_size = static_cast(Edges->dims()[0]); - int n = static_cast(Embeddings->dims()[1]); - int out_size = static_cast(Filter->dims()[2]); - int num_filters = static_cast(Filter->dims()[3]); - output_emb->mutable_data({batch_size, n, out_size, num_filters}, - ctx.GetPlace()); - - auto edge_set_slicedim = phi::slice_ddim( - Edges->dims(), 1, static_cast(Edges->dims().size())); - - auto embedding_slicedim = phi::slice_ddim( - Embeddings->dims(), 1, static_cast(Embeddings->dims().size())); - - auto output_slicedim = phi::slice_ddim( - output_emb->dims(), 1, static_cast(output_emb->dims().size())); - - output_slicedim = phi::flatten_to_2d(output_slicedim, 1); - - for (int idx = 0; idx < batch_size; idx++) { - auto edge_set = Edges->Slice(idx, idx + 1).Resize(edge_set_slicedim); - auto embeddings = - Embeddings->Slice(idx, idx + 1).Resize(embedding_slicedim); - auto out_vec = output_emb->Slice(idx, idx + 1).Resize(output_slicedim); - 
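-      // tree2col gathers each node's depth-limited receptive field into a
-      // 2-D patch whose column count matches the flattened filter W
-      // (fea_size * 3, one slot per eta coefficient of the continuous
-      // binary tree), so the convolution reduces to a single GEMM,
-      // analogous to the im2col + GEMM formulation of ordinary convolution.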
phi::DenseTensor patch; - tree2col(dev_ctx, edge_set, embeddings, &patch, max_depth); - constant(dev_ctx, &out_vec, 0); - blas.MatMul(patch, W, &out_vec); - } - } -}; -template -class TreeConvGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *out_g = ctx.Input(framework::GradVarName("Out")); - auto *in_g = - ctx.Output(framework::GradVarName("NodesVector")); - auto *filter_g = - ctx.Output(framework::GradVarName("Filter")); - int max_depth = ctx.Attr("max_depth"); - auto *Embeddings = ctx.Input("NodesVector"); - auto *edges = ctx.Input("EdgeSet"); - auto *Filter = ctx.Input("Filter"); - math::Tree2ColFunctor tree2col; - math::Col2TreeFunctor col2tree; - phi::funcs::SetConstant constant; - auto &dev_ctx = ctx.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - - phi::DenseTensor W; - W.ShareDataWith(*Filter); - W.Resize(phi::flatten_to_2d(Filter->dims(), 1)); - - int batch_size = static_cast(Embeddings->dims()[0]); - - auto edge_set_slicedim = phi::slice_ddim( - edges->dims(), 1, static_cast(edges->dims().size())); - - auto embedding_slicedim = phi::slice_ddim( - Embeddings->dims(), 1, static_cast(Embeddings->dims().size())); - - auto out_grad_dims = phi::slice_ddim( - out_g->dims(), 1, static_cast(out_g->dims().size())); - out_grad_dims = phi::flatten_to_2d(out_grad_dims, 1); - if (filter_g) { - filter_g->mutable_data(Filter->dims(), ctx.GetPlace()); - phi::DenseTensor f_g; - f_g.ShareDataWith(*filter_g); - f_g.Resize(phi::flatten_to_2d(Filter->dims(), 2)); - constant(dev_ctx, filter_g, 0); - for (int batch_id = 0; batch_id < batch_size; batch_id++) { - auto edge_set = - edges->Slice(batch_id, batch_id + 1).Resize(edge_set_slicedim); - auto embeddings = Embeddings->Slice(batch_id, batch_id + 1) - .Resize(embedding_slicedim); - auto out_grad = - out_g->Slice(batch_id, batch_id + 1).Resize(out_grad_dims); - phi::DenseTensor patch; - tree2col(dev_ctx, edge_set, embeddings, &patch, max_depth); - blas.MatMul(patch, true, out_grad, false, T(1.0), &f_g, T(1.0)); - } - } - if (in_g) { - auto input_grad_dims = phi::slice_ddim( - in_g->dims(), 1, static_cast(in_g->dims().size())); - in_g->mutable_data(Embeddings->dims(), ctx.GetPlace()); - constant(dev_ctx, in_g, 0); - for (int batch_id = 0; batch_id < batch_size; batch_id++) { - auto edge_set = - edges->Slice(batch_id, batch_id + 1).Resize(edge_set_slicedim); - auto out_grad = - out_g->Slice(batch_id, batch_id + 1).Resize(out_grad_dims); - auto in_grad = - in_g->Slice(batch_id, batch_id + 1).Resize(input_grad_dims); - phi::DenseTensor in_grad_temp; - col2tree(dev_ctx, edge_set, out_grad, &in_grad_temp, max_depth); - blas.MatMul(in_grad_temp, false, W, true, &in_grad); - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake index 6669d2bfe2235..c58b78cf3bc21 100644 --- a/paddle/fluid/operators/unity_build_rule.cmake +++ b/paddle/fluid/operators/unity_build_rule.cmake @@ -97,8 +97,7 @@ register_unity_group( fill_constant_batch_size_like_op.cc fill_constant_op.cc fill_op.cc - fill_zeros_like_op.cc - filter_by_instag_op.cc) + fill_zeros_like_op.cc) register_unity_group( cc flatten_op.cc @@ -202,7 +201,6 @@ register_unity_group( positive_negative_pair_op.cc prelu_op.cc print_op.cc - prroi_pool_op.cc psroi_pool_op.cc pull_box_extended_sparse_op.cc pull_box_sparse_op.cc @@ -214,7 +212,6 @@ register_unity_group( quantize_op.cc 
mkldnn/quantize_mkldnn_op.cc queue_generator_op.cc - random_crop_op.cc range_op.cc rank_attention_op.cc rank_loss_op.cc @@ -257,7 +254,6 @@ register_unity_group( slice_op.cc) register_unity_group( cc - space_to_depth_op.cc spectral_norm_op.cc split_lod_tensor_op.cc split_op.cc @@ -282,7 +278,6 @@ register_unity_group( trace_op.cc transpose_op.cc mkldnn/transpose_mkldnn_op.cc - tree_conv_op.cc unbind_op.cc unfold_op.cc) register_unity_group( @@ -327,7 +322,7 @@ register_unity_group( unbind_op.cu.cc unpool_op.cu.cc unsqueeze_op.cu.cc) -register_unity_group(cc arg_max_op.cc arg_min_op.cc squared_l2_distance_op.cc) +register_unity_group(cc arg_max_op.cc arg_min_op.cc) register_unity_group( cc linear_chain_crf_op.cc @@ -439,8 +434,7 @@ register_unity_group( margin_rank_loss_op.cu masked_select_op.cu shuffle_channel_op.cu - softmax_cudnn_op.cu - squared_l2_distance_op.cu) + softmax_cudnn_op.cu) register_unity_group( cu conv_shift_op.cu @@ -472,18 +466,11 @@ register_unity_group( partial_sum_op.cu pixel_shuffle_op.cu prelu_op.cu - prroi_pool_op.cu run_program_op.cu pull_box_extended_sparse_op.cu pull_box_sparse_op.cu) -register_unity_group( - cu - random_crop_op.cu - range_op.cu - reverse_op.cu - partial_concat_op.cu - kldiv_loss_op.cu - instance_norm_op.cu) +register_unity_group(cu range_op.cu reverse_op.cu partial_concat_op.cu + kldiv_loss_op.cu instance_norm_op.cu) register_unity_group( cu roi_align_op.cu @@ -507,7 +494,6 @@ register_unity_group( slice_op.cu) register_unity_group( cu - space_to_depth_op.cu spectral_norm_op.cu split_op.cu split_selected_rows_op.cu @@ -515,14 +501,8 @@ register_unity_group( sum_op.cu temporal_shift_op.cu arg_max_op.cu) -register_unity_group( - cu - row_conv_op.cu - tree_conv_op.cu - tril_triu_op.cu - unfold_op.cu - arg_min_op.cu - crop_tensor_op.cu) +register_unity_group(cu row_conv_op.cu tril_triu_op.cu unfold_op.cu + arg_min_op.cu crop_tensor_op.cu) register_unity_group( cu smooth_l1_loss_op.cu diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt index c7dac2d5cb0c0..875f8164e380c 100644 --- a/test/legacy_test/CMakeLists.txt +++ b/test/legacy_test/CMakeLists.txt @@ -523,9 +523,7 @@ set(TEST_OPS_WITH_GC test_mean_op test_pad2d_op test_scatter_op - test_slice_op - test_space_to_depth_op - test_squared_l2_distance_op) + test_slice_op) foreach(TEST_OP ${TEST_OPS_WITH_GC}) list(REMOVE_ITEM TEST_OPS ${TEST_OP}) @@ -1047,7 +1045,6 @@ set_tests_properties(test_sigmoid_cross_entropy_with_logits_op set_tests_properties(test_imperative_optimizer_v2 PROPERTIES TIMEOUT 150) set_tests_properties(test_partial_sum_op PROPERTIES TIMEOUT 120) set_tests_properties(test_cond PROPERTIES TIMEOUT 120) -set_tests_properties(test_space_to_depth_op PROPERTIES TIMEOUT 200) set_tests_properties(test_sgd_op PROPERTIES TIMEOUT 250) set_tests_properties(test_parallel_executor_seresnext_base_gpu PROPERTIES TIMEOUT 120) diff --git a/test/legacy_test/test_filter_by_instag_op.py b/test/legacy_test/test_filter_by_instag_op.py deleted file mode 100644 index 211889feaa06b..0000000000000 --- a/test/legacy_test/test_filter_by_instag_op.py +++ /dev/null @@ -1,279 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""This is unit test of Test filter_instag Op.""" - -import unittest - -import numpy as np -from op_test import OpTest - -"""This is Test Case 1""" - - -class TestFilterByInstagOp(OpTest): - def setUp(self): - self.op_type = 'filter_by_instag' - x1 = np.zeros((36, 4), dtype=np.float64) - for i in range(36): - for j in range(4): - x1[i, j] = i - x1_lod = [[1, 2, 3, 4, 5, 6, 7, 8]] - - x2 = np.array([[1], [2], [1], [2], [1], [2], [1], [2]]).astype('int64') - x2_lod = [[1, 1, 1, 1, 1, 1, 1, 1]] - - x3 = np.array([2]).astype('int64') - - out = np.zeros((20, 4), dtype=np.float64) - out_lod = [[2, 4, 6, 8]] - start_num_lst = [1, 6, 15, 28] - - ln = 0 - for i in range(4): - start = start_num_lst[i] - len = out_lod[0][i] - for j in range(len): - cur = start + j - for k in range(4): - out[ln, k] = cur - ln += 1 - - mmap = np.array([[0, 1, 2], [2, 6, 4], [6, 15, 6], [12, 28, 8]]).astype( - 'int64' - ) - mmap_lod = [[1, 1, 1, 1]] - - loss_weight = np.array([[1], [1], [1], [1]]).astype('double') - - self.inputs = { - 'Ins': (x1, x1_lod), - 'Ins_tag': (x2, x2_lod), - 'Filter_tag': x3, - } - self.outputs = { - 'Out': (out, out_lod), - 'LossWeight': (loss_weight, mmap_lod), - 'IndexMap': (mmap, mmap_lod), - } - - self.attrs = {'is_lod': True, 'out_val_if_empty': 0} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['Ins'], 'Out', no_grad_set={'Ins_tag', 'Filter_tag'}) - - -"""This is Test Case 2""" - - -class TestFilterByInstagOp2(OpTest): - def setUp(self): - self.op_type = 'filter_by_instag' - - x1 = np.random.random((4, 36)).astype('double') - x1_lod = [[1, 1, 1, 1]] - - x2 = np.array([[2], [1], [2], [1]]).astype('int64') - x2_lod = [[1, 1, 1, 1]] - - x3 = np.array([1]).astype('int64') - - out = np.zeros([2, 36]).astype('double') - out[0] = x1[1] - out[1] = x1[3] - out_lod = [[1, 1]] - - mmap = np.array([[0, 1, 1], [1, 3, 1]]).astype('int64') - mmap_lod = [[1, 1]] - - loss_weight = np.array([[1], [1]]).astype('double') - self.inputs = { - 'Ins': (x1, x1_lod), - 'Ins_tag': (x2, x2_lod), - 'Filter_tag': x3, - } - - self.outputs = { - 'Out': (out, out_lod), - 'LossWeight': (loss_weight, mmap_lod), - 'IndexMap': (mmap, mmap_lod), - } - self.attrs = {'is_lod': True, 'out_val_if_empty': 0} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['Ins'], 'Out', no_grad_set={'Ins_tag', 'Filter_tag'}) - - -"""This is Test Case 3""" - - -class TestFilterByInstagOp3(OpTest): - def setUp(self): - self.op_type = 'filter_by_instag' - - x1 = np.random.random((4, 36)).astype('double') - x1_lod = [[1, 1, 1, 1]] - - x2 = np.array([[2], [1], [2], [1]]).astype('int64') - x2_lod = [[1, 1, 1, 1]] - - x3 = np.array([3]).astype('int64') - - out = np.zeros((1, 36)).astype('double') - out_lod = [[1]] - - mmap = np.array([[0, 1, 1]]).astype('int64') - mmap_lod = [[1]] - - loss_weight = np.array([[0]]).astype('double') - self.inputs = { - 'Ins': (x1, x1_lod), - 'Ins_tag': (x2, x2_lod), - 'Filter_tag': x3, - } - self.outputs = { - 'Out': (out, out_lod), - 'LossWeight': (loss_weight, mmap_lod), - 'IndexMap': (mmap, 
mmap_lod), - } - self.attrs = {'is_lod': True, 'out_val_if_empty': 0} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['Ins'], 'Out', no_grad_set={'Ins_tag', 'Filter_tag'}) - - -"""This is Test Case 4""" - - -class TestFilterByInstagOp4(OpTest): - def setUp(self): - self.op_type = 'filter_by_instag' - - x1 = np.random.random((4, 36)).astype('double') - - x2 = np.array([[2], [1], [2], [1]]).astype('int64') - x2_lod = [[1, 1, 1, 1]] - - x3 = np.array([3]).astype('int64') - - out = np.zeros((1, 36)).astype('double') - out_lod = [[1]] - - mmap = np.array([[0, 1, 1]]).astype('int64') - mmap_lod = [[1]] - - loss_weight = np.array([[0]]).astype('double') - self.inputs = { - 'Ins': x1, - 'Ins_tag': (x2, x2_lod), - 'Filter_tag': x3, - } - self.outputs = { - 'Out': (out, out_lod), - 'LossWeight': (loss_weight, mmap_lod), - 'IndexMap': (mmap, mmap_lod), - } - self.attrs = {'is_lod': False, 'out_val_if_empty': 0} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['Ins'], 'Out', no_grad_set={'Ins_tag', 'Filter_tag'}) - - -class TestFilterByInstagOp6(OpTest): - def setUp(self): - self.op_type = 'filter_by_instag' - - x1 = np.random.random((4, 36)).astype('int64') - - x2 = np.array([[2], [1], [2], [1]]).astype('int64') - x2_lod = [[1, 1, 1, 1]] - - x3 = np.array([3]).astype('int64') - - out = np.zeros((1, 36)).astype('double') - out_lod = [[1]] - - mmap = np.array([[0, 1, 1]]).astype('int64') - mmap_lod = [[1]] - - loss_weight = np.array([[0]]).astype('double') - self.inputs = { - 'Ins': x1, - 'Ins_tag': (x2, x2_lod), - 'Filter_tag': x3, - } - self.outputs = { - 'Out': (out, out_lod), - 'LossWeight': (loss_weight, mmap_lod), - 'IndexMap': (mmap, mmap_lod), - } - self.attrs = {'is_lod': False, 'out_val_if_empty': 0} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - pass - - -class TestFilterByInstagOp7(OpTest): - def setUp(self): - self.op_type = 'filter_by_instag' - - x1 = np.random.random((4, 36)).astype('int32') - - x2 = np.array([[2], [1], [2], [1]]).astype('int64') - x2_lod = [[1, 1, 1, 1]] - - x3 = np.array([3]).astype('int64') - - out = np.zeros((1, 36)).astype('double') - out_lod = [[1]] - - mmap = np.array([[0, 1, 1]]).astype('int64') - mmap_lod = [[1]] - - loss_weight = np.array([[0]]).astype('double') - self.inputs = { - 'Ins': x1, - 'Ins_tag': (x2, x2_lod), - 'Filter_tag': x3, - } - self.outputs = { - 'Out': (out, out_lod), - 'LossWeight': (loss_weight, mmap_lod), - 'IndexMap': (mmap, mmap_lod), - } - self.attrs = {'is_lod': False, 'out_val_if_empty': 0} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - pass - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_random_crop_op.py b/test/legacy_test/test_random_crop_op.py deleted file mode 100644 index 08355378207c1..0000000000000 --- a/test/legacy_test/test_random_crop_op.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest - - -class TestRandomCropOp(OpTest): - def setUp(self): - to_crop = np.array( - [[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]] * 5 - ).astype(np.int32) - self.possible_res = [ - np.array([[1, 2, 3], [5, 6, 7]]).astype(np.int32), - np.array([[2, 3, 4], [6, 7, 8]]).astype(np.int32), - np.array([[5, 6, 7], [9, 10, 11]]).astype(np.int32), - np.array([[6, 7, 8], [10, 11, 12]]).astype(np.int32), - ] - self.op_type = "random_crop" - self.inputs = {'X': to_crop, 'Seed': np.array([10]).astype('int64')} - self.outputs = {'Out': np.array([]), 'SeedOut': np.array([])} - self.attrs = {'shape': [2, 3]} - - def test_check_output(self): - self.check_output_customized(self.verify_output) - - def verify_output(self, outs): - out = np.array(outs[1]) - for ins in out[:]: - is_equal = [(ins == res).all() for res in self.possible_res] - self.assertIn(True, is_equal) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_space_to_depth_op.py b/test/legacy_test/test_space_to_depth_op.py deleted file mode 100644 index c7cd6cae179db..0000000000000 --- a/test/legacy_test/test_space_to_depth_op.py +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
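-# space_to_depth rearranges non-overlapping blocksize x blocksize spatial
-# blocks into the channel dimension: [N, C, H, W] maps to
-# [N, C * blocksize^2, H / blocksize, W / blocksize]. The helper below builds
-# that index mapping element by element as the reference result.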
- -import unittest - -import numpy as np -from op_test import OpTest - -from paddle import base - - -class TestSpaceToDepthOp(OpTest): - @staticmethod - def helper(in_, width, height, channel, batch, blocksize, forward, out_): - channel_out = channel // (blocksize * blocksize) - for b in range(batch): - for k in range(channel): - for j in range(height): - for i in range(width): - in_index = i + width * (j + height * (k + channel * b)) - channel2 = k % channel_out - offset = k // channel_out - width2 = i * blocksize + offset % blocksize - height2 = j * blocksize + offset // blocksize - out_index = width2 + width * blocksize * ( - height2 - + height * blocksize * (channel2 + channel_out * b) - ) - if forward: - out_[out_index] = in_[in_index] - else: - out_[in_index] = in_[out_index] - - def setUp(self): - self.init_data() - - self.op_type = "space_to_depth" - self.inputs = {"X": self.x} - self.helper( - self.x_1d, - self.x.shape[3], - self.x.shape[2], - self.x.shape[1], - self.x.shape[0], - self.blocksize, - self.forward, - self.out_1d, - ) - self.out = np.reshape(self.out_1d, self.infered_shape) - self.attrs = {"blocksize": self.blocksize} - self.outputs = {"Out": self.out} - - def init_data(self): - self.ori_shape = (32, 12, 6, 6) - self.infered_shape = (32, 48, 3, 3) - self.one_d_len = 32 * 48 * 3 * 3 - - self.blocksize = 2 - self.x = np.random.random(self.ori_shape).astype('float64') - self.x_1d = np.reshape(self.x, self.one_d_len) - self.out = np.zeros(self.infered_shape).astype('float64') - self.out_1d = np.reshape(self.out, self.one_d_len) - self.forward = 1 - - def test_check_output(self): - place = ( - base.core.CUDAPlace(0) - if base.core.is_compiled_with_cuda() - else base.core.CPUPlace() - ) - self.check_output_with_place( - place=place, atol=1e-5, no_check_set=None, equal_nan=False - ) - - def test_check_grad(self): - place = ( - base.core.CUDAPlace(0) - if base.core.is_compiled_with_cuda() - else base.core.CPUPlace() - ) - self.check_grad_with_place(place, ['X'], 'Out') - - -class TestSpaceToDepthOpBasic(TestSpaceToDepthOp): - def init_data(self): - self.ori_shape = (32, 8, 6, 6) - self.infered_shape = (32, 32, 3, 3) - self.one_d_len = 32 * 32 * 3 * 3 - - self.blocksize = 2 - self.x = np.random.random(self.ori_shape).astype('float64') - self.x_1d = np.reshape(self.x, self.one_d_len) - self.out = np.zeros(self.infered_shape).astype('float64') - self.out_1d = np.reshape(self.out, self.one_d_len) - self.forward = 1 - - -class TestSpaceToDepthOpDoubleBasic(TestSpaceToDepthOp): - def init_data(self): - self.ori_shape = (32, 8, 6, 6) - self.infered_shape = (32, 32, 3, 3) - self.one_d_len = 32 * 32 * 3 * 3 - - self.blocksize = 2 - self.x = np.random.random(self.ori_shape).astype('float64') - self.x_1d = np.reshape(self.x, self.one_d_len) - self.out = np.zeros(self.infered_shape).astype('float64') - self.out_1d = np.reshape(self.out, self.one_d_len) - self.forward = 1 - - -class TestSpaceToDepthOpWithStride3(TestSpaceToDepthOp): - def init_data(self): - self.ori_shape = (32, 9, 6, 6) - self.infered_shape = (32, 81, 2, 2) - self.one_d_len = 32 * 81 * 2 * 2 - - self.blocksize = 3 - self.x = np.random.random(self.ori_shape).astype('float64') - self.x_1d = np.reshape(self.x, self.one_d_len) - self.out = np.zeros(self.infered_shape).astype('float64') - self.out_1d = np.reshape(self.out, self.one_d_len) - self.forward = 1 - - -class TestSpaceToDepthOpWithNotSquare(TestSpaceToDepthOp): - def init_data(self): - self.ori_shape = (32, 9, 9, 6) - self.infered_shape = (32, 81, 3, 2) - 
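-        # blocksize 3 multiplies the channels by 9 (9 -> 81) and divides the
-        # spatial dims by 3 (9 x 6 -> 3 x 2); both shapes hold
-        # 32 * 486 elements.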
self.one_d_len = 32 * 81 * 3 * 2 - - self.blocksize = 3 - self.x = np.random.random(self.ori_shape).astype('float64') - self.x_1d = np.reshape(self.x, self.one_d_len) - self.out = np.zeros(self.infered_shape).astype('float64') - self.out_1d = np.reshape(self.out, self.one_d_len) - self.forward = 1 - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_squared_l2_distance_op.py b/test/legacy_test/test_squared_l2_distance_op.py deleted file mode 100644 index 579681ab0c098..0000000000000 --- a/test/legacy_test/test_squared_l2_distance_op.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest - - -class TestSquaredL2DistanceOp_f0(OpTest): - def setUp(self): - self.op_type = "squared_l2_distance" - self.inputs = { - 'X': np.random.uniform(0.1, 0.6, (5, 20)).astype("float32"), - 'Y': np.random.uniform(0.1, 0.6, (5, 20)).astype("float32"), - } - sub_res = self.inputs['X'] - self.inputs['Y'] - output = sub_res * sub_res - self.outputs = { - 'sub_result': sub_res, - 'Out': np.expand_dims(output.sum(1), 1), - } - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X', 'Y'], 'Out') - - -class TestSquaredL2DistanceOp_f1(OpTest): - def setUp(self): - self.op_type = "squared_l2_distance" - self.inputs = { - 'X': np.random.uniform(0.1, 0.6, (2, 3)).astype("float32"), - 'Y': np.random.uniform(0.1, 0.6, (1, 3)).astype("float32"), - } - sub_res = self.inputs['X'] - self.inputs['Y'] - output = sub_res * sub_res - self.outputs = { - 'sub_result': sub_res, - 'Out': np.expand_dims(output.sum(1), 1), - } - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X', 'Y'], 'Out') - - -class TestSquaredL2DistanceOp_f2(OpTest): - def setUp(self): - self.op_type = "squared_l2_distance" - self.inputs = { - 'X': np.random.uniform(0.1, 0.6, (2, 3, 4)).astype("float32"), - 'Y': np.random.uniform(0.1, 0.6, (1, 3, 4)).astype("float32"), - } - sub_res = self.inputs['X'] - self.inputs['Y'] - sub_res = sub_res.reshape((2, 3 * 4)) - output = sub_res * sub_res - self.outputs = { - 'sub_result': sub_res, - 'Out': np.expand_dims(output.sum(1), 1), - } - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X', 'Y'], 'Out') - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_tree_conv_op.py b/test/legacy_test/test_tree_conv_op.py deleted file mode 100644 index e05ee1a4d4cdf..0000000000000 --- a/test/legacy_test/test_tree_conv_op.py +++ /dev/null @@ -1,167 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest - - -def collect_node_patch(og, max_depth): - """ - The naive method to construct patches - :param og: original graph - :param max_depth: the depth of convolution filters - :return: convolution patches - """ - - def gen(node, max_depth): - collected = [(node, 1, 1, 0, max_depth)] - - def recurse_helper(node, depth): - if depth > max_depth: - return - l = len(og[node]) - for idx, c in enumerate(og[node], 1): - if depth + 1 < max_depth: - collected.append((c, idx, l, depth + 1, max_depth)) - recurse_helper(c, depth + 1) - - recurse_helper(node, 0) - return collected - - res = [] - for u in range(1, len(og)): - lis = gen(u, max_depth) - if len(lis) > 0: - res.append(lis) - return res - - -class TestTreeConvOp(OpTest): - def setUp(self): - self.n = 17 - self.fea_size = 3 - self.output_size = 1 - self.max_depth = 2 - self.batch_size = 2 - self.num_filters = 1 - adj_array = [ - 1, - 2, - 1, - 3, - 1, - 4, - 1, - 5, - 2, - 6, - 2, - 7, - 2, - 8, - 4, - 9, - 4, - 10, - 5, - 11, - 6, - 12, - 6, - 13, - 9, - 14, - 9, - 15, - 9, - 16, - 9, - 17, - ] - adj = np.array(adj_array).reshape((1, self.n - 1, 2)).astype('int32') - adj = np.tile(adj, (self.batch_size, 1, 1)) - self.op_type = 'tree_conv' - vectors = np.random.random( - (self.batch_size, self.n, self.fea_size) - ).astype('float64') - self.inputs = { - 'EdgeSet': adj, - 'NodesVector': vectors, - 'Filter': np.random.random( - (self.fea_size, 3, self.output_size, self.num_filters) - ).astype('float64'), - } - self.attrs = {'max_depth': self.max_depth} - vectors = [] - for i in range(self.batch_size): - vector = self.get_output_naive(i) - vectors.append(vector) - self.outputs = { - 'Out': np.array(vectors).astype('float64'), - } - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad( - ['NodesVector', 'Filter'], 'Out', max_relative_error=0.5 - ) - - def get_output_naive(self, batch_id): - og = [[] for i in range(1, self.n + 2)] - st = np.array(self.inputs['EdgeSet'][batch_id]).tolist() - for e in st: - og[e[0]].append(e[1]) - patches = collect_node_patch(og, self.max_depth) - W = np.array(self.inputs['Filter']).astype('float64') - W = np.transpose(W, axes=[1, 0, 2, 3]) - vec = [] - for i, patch in enumerate(patches, 1): - result = np.zeros((1, W.shape[2], W.shape[3])) - for v in patch: - eta_t = float(v[4] - v[3]) / float(v[4]) - eta_l = (1.0 - eta_t) * ( - 0.5 if v[2] == 1 else float(v[1] - 1.0) / float(v[2] - 1.0) - ) - eta_r = (1.0 - eta_t) * (1.0 - eta_l) - x = self.inputs['NodesVector'][batch_id][v[0] - 1] - eta = ( - np.array([eta_l, eta_r, eta_t]) - .reshape((3, 1)) - .astype('float64') - ) - Wconvi = np.tensordot(eta, W, axes=([0], [0])) - x = np.array(x).reshape((1, 1, self.fea_size)) - res = np.tensordot(x, Wconvi, axes=2) - result = result + res - vec.append(result) - vec = np.concatenate(vec, axis=0) - vec = np.concatenate( - [ - vec, - np.zeros( - (self.n - vec.shape[0], W.shape[2], W.shape[3]), - dtype='float64', - ), - ], - axis=0, - ) - return vec - - -if __name__ == "__main__": - unittest.main() diff --git 
a/test/white_list/check_shape_white_list.py b/test/white_list/check_shape_white_list.py index 0994b18973059..5785a51372e79 100644 --- a/test/white_list/check_shape_white_list.py +++ b/test/white_list/check_shape_white_list.py @@ -22,8 +22,6 @@ 'matmul', 'scatter', 'soft_relu', - 'squared_l2_distance', - 'tree_conv', 'cvm', 'cudnn_lstm', 'rnn', diff --git a/test/white_list/compile_vs_runtime_white_list.py b/test/white_list/compile_vs_runtime_white_list.py index a00c1a720aa5c..0c74eb327a853 100644 --- a/test/white_list/compile_vs_runtime_white_list.py +++ b/test/white_list/compile_vs_runtime_white_list.py @@ -29,7 +29,6 @@ 'gru', 'rpn_target_assign', 'retinanet_target_assign', - 'filter_by_instag', 'im2sequence', 'generate_proposal_labels', 'detection_map', diff --git a/test/white_list/no_grad_set_white_list.py b/test/white_list/no_grad_set_white_list.py index ade5ea12f6654..23c9994715f7d 100644 --- a/test/white_list/no_grad_set_white_list.py +++ b/test/white_list/no_grad_set_white_list.py @@ -43,7 +43,6 @@ 'elementwise_pow', 'elementwise_fmin', 'elementwise_fmax', - 'filter_by_instag', 'fused_elemwise_activation', 'fused_emb_seq_pool', 'fused_embedding_seq_pool', diff --git a/test/white_list/op_accuracy_white_list.py b/test/white_list/op_accuracy_white_list.py index 063b0dcffcc0c..f145e9e1f62e2 100644 --- a/test/white_list/op_accuracy_white_list.py +++ b/test/white_list/op_accuracy_white_list.py @@ -66,7 +66,6 @@ 'smooth_l1_loss', 'softmax', 'spectral_norm', - 'squared_l2_distance', 'squared_l2_norm', 'tanh', 'mish', diff --git a/tools/enforce/count_enforce_by_file.sh b/tools/enforce/count_enforce_by_file.sh index fafc3516904d8..c79d486c62838 100644 --- a/tools/enforce/count_enforce_by_file.sh +++ b/tools/enforce/count_enforce_by_file.sh @@ -53,7 +53,6 @@ fi FILE_WHITE_LIST="\ box_clip_op.cc \ box_clip_op.h \ - random_crop_op.h \ elementwise_op_function.cu.h \ fused_elemwise_activation_op.cc \ auc_op.cu \ diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index cb715c64dd48a..8755ef4d13ffb 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -298,7 +298,6 @@ 'test_ps_dispatcher', 'test_analyzer_rnn2', 'test_multi_gru_seq_fuse_pass', - 'test_filter_by_instag_op', 'test_switch', 'test_matmul_transpose_reshape_fuse_pass', 'test_mkldnn_caching', @@ -915,7 +914,6 @@ 'test_transformer', 'test_for_enumerate', 'test_variable_trans_func', - 'test_squared_l2_distance_op', 'test_quantize_transpiler_v2', 'test_im2sequence_op', 'test_reader_reset', @@ -1101,7 +1099,6 @@ 'test_empty_like_op', 'test_imperative_layer_children', 'nccl_op_test', - 'test_tree_conv_op', 'test_share_data_op', 'test_ir_memory_optimize_transformer', 'test_math_op_patch', @@ -1381,7 +1378,6 @@ 'test_complex_abs', 'test_gradient_accmulator', 'test_instance_norm_op_v2', - 'test_random_crop_op', 'test_mobile_net', 'test_parallel_executor_transformer', 'test_tensor_scalar_type_promotion_dynamic', @@ -1528,7 +1524,6 @@ 'test_imperative_transformer_sorted_gradient', 'test_bicubic_interp_v2_op', 'test_rank_attention_op', - 'test_space_to_depth_op', 'test_image_classification', 'test_custom_relu_op_setup', 'test_sgd_op', @@ -1867,7 +1862,6 @@ 'test_fleet', 'test_flags_use_mkldnn', 'test_flags_mkldnn_ops_on_off', - 'test_filter_by_instag_op', 'test_fetch_var', 'test_fetch_handler', 'test_feed_fetch_method', @@ -2654,7 +2648,6 @@ 'test_unfold_op', 'test_conv_bn_fuse_pass', 'test_truncated_gaussian_random_op', - 'test_tree_conv_op', 'test_traced_layer_err_msg', 'test_unique_with_counts', 
'test_auc_single_pred_op', @@ -2686,7 +2679,6 @@ 'test_optimizer', 'test_deformable_conv_op', 'test_py_reader_push_pop', - 'test_random_crop_op', 'test_shuffle_channel_op', 'test_center_loss', 'test_temporal_shift_op', @@ -2703,13 +2695,10 @@ 'test_top_k_op', 'test_batch_fc_op', 'test_tensor_scalar_type_promotion_static', - 'test_squared_l2_distance_op', 'test_bicubic_interp_op', 'test_spp_op', - 'test_space_to_depth_op', 'test_callbacks', 'test_sigmoid_focal_loss_op', - 'test_collect_fpn_proposals_op', 'test_sequence_unpad_op', 'test_conv1d_transpose_layer', 'test_sequence_pool', diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index a7e7ad08e3ab1..228218e46ecf4 100755 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -39,8 +39,6 @@ 'test_sequence_scatter_op', 'test_sequence_slice_op', 'test_slice_op', - 'test_space_to_depth_op', - 'test_squared_l2_distance_op', 'test_accuracy_op', 'test_activation_nn_grad', 'test_adadelta_op', @@ -202,7 +200,6 @@ 'test_fill_constant_op', 'test_fill_op', 'test_fill_zeros_like_op', - 'test_filter_by_instag_op', 'test_flatten2_op', 'test_flatten_contiguous_range_op', 'test_flatten_op', @@ -412,7 +409,6 @@ 'test_queue', 'test_randint_op', 'test_randn_op', - 'test_random_crop_op', 'test_randperm_op', 'test_range', 'test_rank_loss_op', @@ -492,7 +488,6 @@ 'test_trace_op', 'test_trainable', 'test_transpose_op', - 'test_tree_conv_op', 'test_tril_triu_op', 'test_trilinear_interp_op', 'test_trilinear_interp_v2_op', diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index f570fca753e58..b21910e0ae366 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -216,7 +216,6 @@ long_time_test="^test_gru_op$|\ ^test_gather_op$|\ ^test_gather_nd_op$|\ ^test_sequence_conv$|\ -^test_space_to_depth_op$|\ ^test_activation_nn_grad$|\ ^test_activation_op$|\ ^test_bicubic_interp_v2_op$|\ From 0a09db32a0f0eb06fa942619e36b418ce5d472a8 Mon Sep 17 00:00:00 2001 From: Kai Song <50285351+USTCKAY@users.noreply.github.com> Date: Mon, 9 Oct 2023 14:12:39 +0800 Subject: [PATCH 26/62] fix compile bugs on aarch64 platform (#57931) --- cmake/external/brpc.cmake | 4 +++- paddle/fluid/pybind/CMakeLists.txt | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake index d647e9116b586..c1c514def7619 100755 --- a/cmake/external/brpc.cmake +++ b/cmake/external/brpc.cmake @@ -13,7 +13,9 @@ # limitations under the License. 
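 # The WITH_ARM guard added below skips the static-OpenSSL hint on aarch64,
 # where forcing static OpenSSL broke compilation (#57931).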
include(ExternalProject) -set(OPENSSL_USE_STATIC_LIBS ON) +if(NOT WITH_ARM) + set(OPENSSL_USE_STATIC_LIBS ON) +endif() find_package(OpenSSL REQUIRED) message(STATUS "ssl:" ${OPENSSL_SSL_LIBRARY}) diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 46bfb0ee005a4..2dfeb89bef5c4 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -277,7 +277,9 @@ if(WITH_PYTHON) eager_legacy_op_function_generator.cc) set(GENERATOR_DEPS ${PYBIND_DEPS}) list(REMOVE_DUPLICATES GENERATOR_DEPS) - list(REMOVE_ITEM GENERATOR_DEPS python) + if(NOT WITH_ARM) + list(REMOVE_ITEM GENERATOR_DEPS python) + endif() target_link_libraries(eager_legacy_op_function_generator ${GENERATOR_DEPS}) if(NOT WIN32) add_executable(kernel_signature_generator kernel_signature_generator.cc) From 18bd6d3dc3b14f67aa7f70583fa222b1b97402dd Mon Sep 17 00:00:00 2001 From: Sonder <55493212+AndSonder@users.noreply.github.com> Date: Mon, 9 Oct 2023 16:47:29 +0800 Subject: [PATCH 27/62] remove batch norm from StaticBuildBlackList (#57510) * remove batch norm from StaticBuildBlackList * turn off the flag * fix batch_norm register kernel info * remove dequantize_linear to phi * fix build error * add sig file * update test and cmakelist * fix test_split_program error in static_build mode * move fuse_bn_add_act to phi * fix error in test_cudnn_bn_add_relu * fix shape error * fix date * close the static_build flag --- .../new_executor/interpreter/static_build.cc | 17 +- .../fused/fused_bn_add_activation_op.cu | 387 ------------------ .../fused/fused_bn_add_activation_op.h | 12 - paddle/fluid/operators/quantize_linear_op.cc | 8 - paddle/fluid/operators/quantize_linear_op.cu | 9 - paddle/fluid/operators/quantize_linear_op.h | 69 ---- .../phi/kernels/cpu/quantize_linear_kernel.cc | 109 +++++ .../fused_bn_add_activation_grad_kernel.h | 39 ++ .../kernels/fused_bn_add_activation_kernel.h | 39 ++ .../fused_bn_add_activation_grad_kernel.cu | 223 ++++++++++ .../gpu/fused_bn_add_activation_kernel.cu | 227 ++++++++++ paddle/phi/kernels/gpu/batch_norm_kernel.cu | 6 + .../phi/kernels/gpu/quantize_linear_kernel.cu | 130 ++++++ .../phi/kernels/impl/quantize_linear_impl.h | 127 ++++++ paddle/phi/kernels/quantize_linear_kernel.h | 40 ++ paddle/phi/kernels/xpu/batch_norm_kernel.cc | 7 +- .../ops/compat/fused_bn_add_activation_sig.cc | 52 +++ paddle/phi/ops/compat/quantize_linear_sig.cc | 31 ++ test/legacy_test/CMakeLists.txt | 2 + test/legacy_test/test_fake_dequantize_op.py | 4 +- 20 files changed, 1049 insertions(+), 489 deletions(-) delete mode 100644 paddle/fluid/operators/fused/fused_bn_add_activation_op.cu create mode 100644 paddle/phi/kernels/cpu/quantize_linear_kernel.cc create mode 100644 paddle/phi/kernels/fused_bn_add_activation_grad_kernel.h create mode 100644 paddle/phi/kernels/fused_bn_add_activation_kernel.h create mode 100644 paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_grad_kernel.cu create mode 100644 paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_kernel.cu create mode 100644 paddle/phi/kernels/gpu/quantize_linear_kernel.cu create mode 100644 paddle/phi/kernels/impl/quantize_linear_impl.h create mode 100644 paddle/phi/kernels/quantize_linear_kernel.h create mode 100644 paddle/phi/ops/compat/fused_bn_add_activation_sig.cc create mode 100644 paddle/phi/ops/compat/quantize_linear_sig.cc diff --git a/paddle/fluid/framework/new_executor/interpreter/static_build.cc b/paddle/fluid/framework/new_executor/interpreter/static_build.cc index 3751ee0a03db4..e8e5a1ef29aed 
100644 --- a/paddle/fluid/framework/new_executor/interpreter/static_build.cc +++ b/paddle/fluid/framework/new_executor/interpreter/static_build.cc @@ -54,7 +54,6 @@ std::set OpsCanSkipedFakeAllocInStaticBuild = { "nop"}; std::set StaticBuildBlackList = { - "batch_norm" /*: to handle reserve_space output*/, "cinn_instruction_run" /*: to handle subgraph infermeta*/, "cinn_launch" /*: to handle subgraph infermeta*/, "run_program" /*: to handle scope output*/, @@ -206,6 +205,14 @@ bool TensorShouldBeFakeInitialized(const OperatorBase& op, } } + if (op_type == "batch_norm" && parameter_name == "ReserveSpace") { + if (dynamic_cast(&op)->kernel_type()->place_ == + phi::CPUPlace()) { + VLOG(2) << "Skip fake initialization for: " << parameter_name; + return false; + } + } + if (op_type == "coalesce_tensor" && parameter_name == "Output") { VLOG(2) << "Skip fake initialization for: " << parameter_name; return false; @@ -250,6 +257,12 @@ bool TensorShouldBeFakeInitialized(const OperatorBase& op, } } + if ((op_type == "flatten" || op_type == "flatten_contiguous_range") && + parameter_name == "XShape") { + VLOG(2) << "Skip fake initialization for: " << parameter_name; + return false; + } + if (op_type == "segment_pool" && parameter_name == "SummedIds") { return op.Attr("pooltype") == "MEAN" && dynamic_cast(&op) @@ -856,6 +869,8 @@ void FakeInitializeOutputsForFunctionKernel( dtype = InferDTypeFromAttr(op, runtime_ctx, "dtype"); } else if (op_type == "bincount" || op_type == "reduce_sum_grad") { dtype = GetInputDType(runtime_ctx, "X"); + } else if (op_type == "dequantize_linear") { + dtype = GetInputDType(runtime_ctx, "Scale"); } else if (op_type == "lamb") { bool multi_precision = op.Attr("multi_precision"); dtype = GetInputDType(runtime_ctx, "Moment1"); diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu deleted file mode 100644 index 1fa7ff1826b07..0000000000000 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu +++ /dev/null @@ -1,387 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
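-// This fused BN + residual-add + activation kernel wraps cuDNN's
-// CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION op (cuDNN >= 7.4.1): one call
-// normalizes X, adds Z, and applies the activation, saving a cuDNN reserve
-// space that the matching backward call consumes. Per this commit's message,
-// the implementation moves to the equivalent phi fused kernels.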
- -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/core/flags.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/norm_utils.h" - -PHI_DECLARE_bool(cudnn_batchnorm_spatial_persistent); - -namespace paddle { -namespace operators { -template -using CudnnDataType = platform::CudnnDataType; -template -using BatchNormParamType = typename CudnnDataType::BatchNormParamType; - -template -class FusedBatchNormAddActKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { -#if CUDNN_VERSION < 7401 - PADDLE_THROW(phi::errors::Unimplemented( - "The fused_bn_add_activation operator is not supported on GPU " - "when CUDNN version < 7.4.1")); -#endif - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), - true, - platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - auto &dev_ctx = ctx.template device_context(); - double epsilon = static_cast(ctx.Attr("epsilon")); - float momentum = ctx.Attr("momentum"); - std::string act_type = ctx.Attr("act_type"); - - if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { - LOG(ERROR) << "Provided epsilon is smaller than " - << "CUDNN_BN_MIN_EPSILON. Setting it to " - << "CUDNN_BN_MIN_EPSILON instead."; - } - epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); - - // Get the size for each dimension. - // NHWC [batch_size, in_height, in_width, in_channels] - const auto *x = ctx.Input("X"); - const auto *z = ctx.Input("Z"); - const auto &in_dims = x->dims(); - - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - dev_ctx.Alloc>( - mean_out, mean_out->numel() * sizeof(BatchNormParamType)); - dev_ctx.Alloc>( - variance_out, variance_out->numel() * sizeof(BatchNormParamType)); - - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - dev_ctx.Alloc>( - saved_mean, saved_mean->numel() * sizeof(BatchNormParamType)); - dev_ctx.Alloc>( - saved_variance, - saved_variance->numel() * sizeof(BatchNormParamType)); - - auto *y = ctx.Output("Y"); - dev_ctx.Alloc(y, y->numel() * sizeof(T)); - - int N, C, H, W, D; - const DataLayout data_layout = DataLayout::kNHWC; - phi::funcs::ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D); - - // ------------------- cudnn descriptors --------------------- - auto handle = dev_ctx.cudnn_handle(); - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t bn_param_desc_; - cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); - - std::vector dims = {N, C, H, W, D}; - std::vector strides = {H * W * D * C, 1, W * D * C, D * C, C}; - - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, - CudnnDataType::type, - in_dims.size() > 3 ? 
in_dims.size() : 4, - dims.data(), - strides.data())); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( - bn_param_desc_, data_desc_, mode_)); - - double this_factor = 1. - momentum; - cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION; - platform::ScopedActivationDescriptor scope_act_desc; - cudnnActivationDescriptor_t activation_desc_ = - scope_act_desc.descriptor(act_type); - size_t workspace_size = 0; - size_t reserve_space_size = 0; - void *reserve_space_ptr = nullptr; - void *workspace_ptr = nullptr; - phi::DenseTensor workspace_tensor; - // Create reserve space and workspace for batch norm. - // Create tensor for each batchnorm op, it will be used in the - // backward. Thus this tensor shouldn't be temp. - auto *reserve_space = ctx.Output("ReserveSpace"); - PADDLE_ENFORCE_NOT_NULL( - reserve_space, - platform::errors::NotFound( - "The argument ReserveSpace of batch_norm op is not found.")); - - // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload:: - cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( - /*handle=*/handle, - /*mode=*/mode_, - /*bnOps=*/bnOps_, - /*xDesc=*/data_desc_, - /*zDesc=*/data_desc_, - /*yDesc=*/data_desc_, - /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, - /*activationDesc=*/activation_desc_, - /*sizeInBytes=*/&workspace_size)); - - // -------------- cudnn batchnorm reserve space -------------- - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetBatchNormalizationTrainingExReserveSpaceSize( - /*handle=*/handle, - /*mode=*/mode_, - /*bnOps=*/bnOps_, - /*activationDesc=*/activation_desc_, - /*xDesc=*/data_desc_, - /*sizeInBytes=*/&reserve_space_size)); - - reserve_space->Resize({static_cast( - (reserve_space_size + phi::SizeOf(x->dtype()) - 1) / - phi::SizeOf(x->dtype()))}); - reserve_space_ptr = - dev_ctx.Alloc(reserve_space, reserve_space->numel() * sizeof(T)); - workspace_tensor.Resize( - {static_cast((workspace_size + phi::SizeOf(x->dtype()) - 1) / - phi::SizeOf(x->dtype()))}); - workspace_ptr = dev_ctx.Alloc(&workspace_tensor, - workspace_tensor.numel() * sizeof(T)); - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationForwardTrainingEx( - handle, - mode_, - bnOps_, - CudnnDataType::kOne(), - CudnnDataType::kZero(), - data_desc_, - x->template data(), - data_desc_, - z->template data(), - data_desc_, - y->template data(), - bn_param_desc_, - scale->template data>(), - bias->template data>(), - this_factor, - dev_ctx.template Alloc>( - mean_out, mean_out->numel() * sizeof(BatchNormParamType)), - dev_ctx.template Alloc>( - variance_out, - variance_out->numel() * sizeof(BatchNormParamType)), - epsilon, - dev_ctx.template Alloc>( - saved_mean, - saved_mean->numel() * sizeof(BatchNormParamType)), - dev_ctx.template Alloc>( - saved_variance, - saved_variance->numel() * sizeof(BatchNormParamType)), - activation_desc_, - workspace_ptr, - workspace_size, - reserve_space_ptr, - reserve_space_size)); - - // clean when exit. 
- PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); - } -}; - -template -class FusedBatchNormAddActGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { -#if CUDNN_VERSION < 7401 - PADDLE_THROW(phi::errors::Unimplemented( - "The fused_bn_add_activation operator is not supported on GPU " - "when CUDNN version < 7.4.1")); -#endif - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), - true, - platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - double epsilon = static_cast(ctx.Attr("epsilon")); - std::string act_type = ctx.Attr("act_type"); - - const auto *x = ctx.Input("X"); - const auto *y = ctx.Input("Y"); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - const auto *reserve_space = ctx.Input("ReserveSpace"); - - auto &dev_ctx = ctx.template device_context(); - const auto &in_dims = x->dims(); - - int N, C, H, W, D; - const DataLayout data_layout = DataLayout::kNHWC; - phi::funcs::ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D); - - // init output - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_z = ctx.Output(framework::GradVarName("Z")); - auto *d_scale = - ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - d_x->mutable_data(ctx.GetPlace()); - d_z->mutable_data(ctx.GetPlace()); - PADDLE_ENFORCE_EQ( - d_scale && d_bias, - true, - platform::errors::PreconditionNotMet( - "Both the scale grad and the bias grad must not be null.")); - d_scale->mutable_data>(ctx.GetPlace()); - d_bias->mutable_data>(ctx.GetPlace()); - PADDLE_ENFORCE_EQ(scale->dims().size(), - 1UL, - platform::errors::PreconditionNotMet( - "The scale only has one dimension.")); - PADDLE_ENFORCE_EQ( - scale->dims()[0], - C, - platform::errors::PreconditionNotMet( - "The size of scale is equal to the channel of Input(X).")); - - std::vector dims = {N, C, H, W, D}; - std::vector strides = {H * W * C * D, 1, W * D * C, D * C, C}; - // ------------------- cudnn descriptors --------------------- - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t bn_param_desc_; - cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); - if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { - LOG(ERROR) << "Provided epsilon is smaller than " - << "CUDNN_BN_MIN_EPSILON. Setting it to " - << "CUDNN_BN_MIN_EPSILON instead."; - } - epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); - - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, - CudnnDataType::type, - in_dims.size() > 3 ? 
in_dims.size() : 4, - dims.data(), - strides.data())); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( - bn_param_desc_, data_desc_, mode_)); - - const auto *saved_mean = ctx.Input("SavedMean"); - const auto *saved_var = ctx.Input("SavedVariance"); - const auto *saved_mean_data = - saved_mean->template data>(); - const auto *saved_var_data = - saved_var->template data>(); - - size_t workspace_size = 0; - void *workspace_ptr = nullptr; - phi::DenseTensor workspace_tensor; - auto reserve_space_size = reserve_space->memory_size(); - cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION; - platform::ScopedActivationDescriptor scope_act_desc; - cudnnActivationDescriptor_t activation_desc_ = - scope_act_desc.descriptor(act_type); - // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize( - /*handle=*/dev_ctx.cudnn_handle(), - /*mode=*/mode_, - /*bnOps=*/bnOps_, - /*xDesc=*/data_desc_, - /*yDesc=*/data_desc_, - /*dyDesc=*/data_desc_, - /*dzDesc=*/data_desc_, - /*dxDesc=*/data_desc_, - /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, - /*activationDesc=*/activation_desc_, - /*sizeInBytes=*/&workspace_size)); - - workspace_ptr = workspace_tensor.mutable_data( - ctx.GetPlace(), x->dtype(), workspace_size); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationBackwardEx( - /*handle=*/dev_ctx.cudnn_handle(), - /*mode=*/mode_, - /*bnOps=*/bnOps_, - /*alphaDataDiff=*/CudnnDataType::kOne(), - /*betaDataDiff=*/CudnnDataType::kZero(), - /*alphaParamDiff=*/CudnnDataType::kOne(), - /*betaParamDiff=*/CudnnDataType::kZero(), - /*xDesc=*/data_desc_, - /*xData=*/x->template data(), - /*yDesc=*/data_desc_, - /*yData=*/y->template data(), - /*dyDesc=*/data_desc_, - /*dyData=*/d_y->template data(), - /*dzDesc=*/data_desc_, - /*dzData=*/d_z->template data(), - /*dxDesc=*/data_desc_, - /*dxData=*/d_x->template data(), - /*dBnScaleBiasDesc=*/bn_param_desc_, - /*bnScaleData=*/scale->template data>(), - /*bnBiasData=*/bias->template data>(), - /*dBnScaleData=*/d_scale->template data>(), - /*dBnBiasData=*/d_bias->template data>(), - /*epsilon=*/epsilon, - /*savedMean=*/saved_mean_data, - /*savedInvVariance=*/saved_var_data, - /*activationDesmc=*/activation_desc_, - /*workspace=*/workspace_ptr, - /*workSpaceSizeInBytes=*/workspace_size, - /*reserveSpace=*/const_cast(reserve_space->template data()), - /*reserveSpaceSizeInBytes=*/reserve_space_size)); - - // clean when exit. 
- PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -PD_REGISTER_STRUCT_KERNEL(fused_bn_add_activation, - GPU, - ALL_LAYOUT, - ops::FusedBatchNormAddActKernel, - plat::float16) {} -PD_REGISTER_STRUCT_KERNEL(fused_bn_add_activation_grad, - GPU, - ALL_LAYOUT, - ops::FusedBatchNormAddActGradKernel, - plat::float16) {} diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.h b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h index 215ccfdde5e02..82967b043d89e 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.h +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h @@ -89,17 +89,5 @@ class FusedBatchNormAddActOpInferVarType } }; -template -class FusedBatchNormAddActKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override; -}; - -template -class FusedBatchNormAddActGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override; -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/quantize_linear_op.cc b/paddle/fluid/operators/quantize_linear_op.cc index e2c2eb7768e1b..c0ef288b5134b 100644 --- a/paddle/fluid/operators/quantize_linear_op.cc +++ b/paddle/fluid/operators/quantize_linear_op.cc @@ -239,11 +239,3 @@ REGISTER_OPERATOR( ops::QuantizeLinearOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); - -PD_REGISTER_STRUCT_KERNEL(dequantize_linear, - CPU, - ALL_LAYOUT, - ops::DeQuantizeLinearKernel, - float, - int8_t, - double) {} diff --git a/paddle/fluid/operators/quantize_linear_op.cu b/paddle/fluid/operators/quantize_linear_op.cu index f0d6523d054c2..8bcbc1107e9d1 100644 --- a/paddle/fluid/operators/quantize_linear_op.cu +++ b/paddle/fluid/operators/quantize_linear_op.cu @@ -123,15 +123,6 @@ template struct ChannelDequantizeFunctorV2; namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL(dequantize_linear, - GPU, - ALL_LAYOUT, - ops::DeQuantizeLinearKernel, - float, - float16, - int8_t, - double) {} - PD_REGISTER_STRUCT_KERNEL(quantize_linear, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/quantize_linear_op.h b/paddle/fluid/operators/quantize_linear_op.h index 276d1507a4aef..d6c3b3d2e50ae 100644 --- a/paddle/fluid/operators/quantize_linear_op.h +++ b/paddle/fluid/operators/quantize_linear_op.h @@ -130,74 +130,5 @@ class QuantizeLinearKernel : public framework::OpKernel { } }; -template -class DeQuantizeLinearKernel : public framework::OpKernel { - public: - template - void ComputeImpl(const framework::ExecutionContext& context) const { - auto& dev_ctx = context.template device_context(); - auto* in = context.Input("X"); - - auto in_tmp = phi::Cast( - static_cast::TYPE&>(dev_ctx), - *in, - phi::CppTypeToDataType::Type()); - - auto* scale = context.Input("Scale"); - auto* out = context.Output("Y"); - int bit_length = context.Attr("bit_length"); - auto quant_axis = context.Attr("quant_axis"); - dev_ctx.template Alloc(out, out->numel() * sizeof(D)); - bool only_observer = context.Attr("only_observer"); - - if (only_observer) { - framework::TensorCopy(*in, context.GetPlace(), dev_ctx, out); - return; - } - - if (quant_axis < 0) { - float max_range = (std::pow(2, bit_length - 1) - 1); 
- DequantizeFunctor()( - dev_ctx, &in_tmp, scale, static_cast(max_range), out); - } else { - PADDLE_ENFORCE_EQ( - scale->numel(), - in_tmp.dims()[quant_axis], - platform::errors::PreconditionNotMet( - "The number of first scale values must be the same with " - "quant_axis dimension value of Input(X) when the `scale` has " - "only one element, but %ld != %ld here.", - scale->numel(), - in_tmp.dims()[quant_axis])); - int max_range = (std::pow(2, bit_length - 1) - 1); - - ChannelDequantizeFunctorV2()( - dev_ctx, &in_tmp, scale, static_cast(max_range), quant_axis, out); - } - } - - void Compute(const framework::ExecutionContext& context) const override { - auto* scale = context.Input("Scale"); - switch (scale->dtype()) { - case phi::DataType::FLOAT64: - ComputeImpl(context); - break; - case phi::DataType::FLOAT32: - ComputeImpl(context); - break; - case phi::DataType::FLOAT16: - ComputeImpl(context); - break; - default: - PADDLE_THROW(platform::errors::Unimplemented( - "In DeQuantizeLinearKernel, " - "data type %d for scale/output is not supported ", - scale->dtype())); - break; - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/phi/kernels/cpu/quantize_linear_kernel.cc b/paddle/phi/kernels/cpu/quantize_linear_kernel.cc new file mode 100644 index 0000000000000..a7f3954407a52 --- /dev/null +++ b/paddle/phi/kernels/cpu/quantize_linear_kernel.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
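+
+// This file supplies the CPU specializations of DequantizeFunctor and
+// ChannelDequantizeFunctorV2 declared in quantize_linear_impl.h.
+// Per-tensor dequantization computes
+//   out[i] = in[i] * scale[0] / max_range,
+// where the caller passes max_range = 2^(bit_length - 1) - 1 (127 for
+// int8); the channel-wise variant applies one scale per slice along
+// quant_axis.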
+ +#include + +#include "paddle/phi/kernels/quantize_linear_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/type_traits.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/impl/quantize_linear_impl.h" + +namespace phi { + +template +struct DequantizeFunctor { + void operator()(const phi::CPUContext& dev_ctx, + const phi::DenseTensor* in, + const phi::DenseTensor* scale, + T max_range, + phi::DenseTensor* out) { + auto in_e = phi::EigenVector::Flatten(*in); + const T* scale_factor = scale->data(); + auto out_e = phi::EigenVector::Flatten(*out); + + auto& dev = *dev_ctx.eigen_device(); + out_e.device(dev) = in_e * scale_factor[0] / max_range; + } +}; + +template +struct ChannelDequantizeFunctorV2 { + void operator()(const phi::CPUContext& dev_ctx, + const phi::DenseTensor* in, + const phi::DenseTensor* scale, + T max_range, + const int quant_axis, + phi::DenseTensor* out) { + // Dequant op is before quantized op + // Dequantize the weight of quantized op + auto in_dims = in->dims(); + const int64_t channel = in_dims[quant_axis]; + const T* scale_factor = scale->data(); + if (quant_axis == 0) { + for (int64_t i = 0; i < channel; i++) { + T s = scale_factor[i]; + phi::DenseTensor one_channel_in = in->Slice(i, i + 1); + phi::DenseTensor one_channel_out = out->Slice(i, i + 1); + auto in_e = phi::EigenVector::Flatten(one_channel_in); + auto out_e = phi::EigenVector::Flatten(one_channel_out); + auto& dev = *dev_ctx.eigen_device(); + out_e.device(dev) = in_e * s / max_range; + } + } else if (quant_axis == 1) { + int64_t out_iter = 1; + for (int i = 0; i < quant_axis; i++) { + out_iter *= in_dims[i]; + } + int64_t step_i = in->numel() / out_iter; + int64_t step_j = in->numel() / (out_iter * channel); + auto* in_data = in->data(); + auto* out_data = dev_ctx.Alloc(out, out->numel() * sizeof(T)); + for (int64_t i = 0; i < out_iter; i++) { + for (int64_t j = 0; j < channel; j++) { + auto* cur_in = in_data + i * step_i + j * step_j; + auto* cur_out = out_data + i * step_i + j * step_j; + T s = scale_factor[j]; + for (int64_t k = 0; k < step_j; k++) { + *cur_out = (*cur_in) * s / max_range; + ++cur_in; + ++cur_out; + } + } + } + } + } +}; + +template struct DequantizeFunctor; +template struct DequantizeFunctor; +template struct DequantizeFunctor; +template struct ChannelDequantizeFunctorV2; +template struct ChannelDequantizeFunctorV2; +template struct ChannelDequantizeFunctorV2; + +} // namespace phi + +PD_REGISTER_KERNEL(dequantize_linear, + CPU, + ALL_LAYOUT, + phi::DeQuantizeLinearKernel, + float, + int8_t, + double) { + kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); +} diff --git a/paddle/phi/kernels/fused_bn_add_activation_grad_kernel.h b/paddle/phi/kernels/fused_bn_add_activation_grad_kernel.h new file mode 100644 index 0000000000000..c98a5f69ae0d6 --- /dev/null +++ b/paddle/phi/kernels/fused_bn_add_activation_grad_kernel.h @@ -0,0 +1,39 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void FusedBatchNormAddActGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &y_grad, + const DenseTensor &scale, + const DenseTensor &bias, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + const DenseTensor &reserve_space, + float momentum, + float epsilon, + const std::string &act_type, + DenseTensor *x_grad, + DenseTensor *z_grad, + DenseTensor *scale_grad, + DenseTensor *bias_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/fused_bn_add_activation_kernel.h b/paddle/phi/kernels/fused_bn_add_activation_kernel.h new file mode 100644 index 0000000000000..9d4f468a261ee --- /dev/null +++ b/paddle/phi/kernels/fused_bn_add_activation_kernel.h @@ -0,0 +1,39 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void FusedBatchNormAddActKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &z, + const DenseTensor &scale, + const DenseTensor &bias, + const DenseTensor &mean, + const DenseTensor &variance, + float momentum, + float epsilon, + const std::string &act_type, + DenseTensor *y, + DenseTensor *mean_out, + DenseTensor *variance_out, + DenseTensor *saved_mean, + DenseTensor *saved_variance, + DenseTensor *reserve_space); + +} // namespace phi diff --git a/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_grad_kernel.cu new file mode 100644 index 0000000000000..e19b468b54a35 --- /dev/null +++ b/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_grad_kernel.cu @@ -0,0 +1,223 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
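+
+// fused_bn_add_activation evaluates y = act(batch_norm(x) + z) in a single
+// cuDNN call (CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION). As a scalar sketch of
+// the per-channel forward math, assuming act_type == "relu":
+//
+//   bn = scale * (x - mean) / sqrt(var + epsilon) + bias;
+//   y  = max(bn + z, 0.0f);  // residual add, then activation
+//
+// The gradient kernel in this file replays that fused computation through
+// cudnnBatchNormalizationBackwardEx, consuming the reserve space recorded
+// by the forward pass.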
+ +#include +#include +#include +#include + +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_dnn.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/flags.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/activation_functor.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" +#include "paddle/phi/kernels/fused_bn_add_activation_grad_kernel.h" + +PHI_DECLARE_bool(cudnn_batchnorm_spatial_persistent); + +namespace phi { +namespace fusion { + +template +using CudnnDataType = phi::backends::gpu::CudnnDataType; +template +using BatchNormParamType = typename CudnnDataType::BatchNormParamType; + +template +void FusedBatchNormAddActGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &y_grad, + const DenseTensor &scale, + const DenseTensor &bias, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + const DenseTensor &reserve_space, + float momentum, + float epsilon, + const std::string &act_type, + DenseTensor *x_grad, + DenseTensor *z_grad, + DenseTensor *scale_grad, + DenseTensor *bias_grad) { +#if CUDNN_VERSION < 7401 + PADDLE_THROW(phi::errors::Unimplemented( + "The fused_bn_add_activation operator is not supported on GPU " + "when CUDNN version < 7.4.1")); +#endif + bool is_gpu_place = dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU; + PADDLE_ENFORCE_EQ(is_gpu_place, + true, + phi::errors::PreconditionNotMet("It must use CUDAPlace.")); + double epsilon1 = static_cast(epsilon); + + const auto *x_ptr = &x; + const auto *y_ptr = &y; + const auto *d_y = &y_grad; + const auto *scale_ptr = &scale; + const auto *bias_ptr = &bias; + const auto *reserve_space_ptr = &reserve_space; + + const auto &in_dims = x_ptr->dims(); + + int N, C, H, W, D; + const DataLayout data_layout = DataLayout::kNHWC; + phi::funcs::ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D); + + // init output + auto *d_x = x_grad; + auto *d_z = z_grad; + auto *d_scale = scale_grad; + auto *d_bias = bias_grad; + + dev_ctx.template Alloc(d_x); + dev_ctx.template Alloc(d_z); + + PADDLE_ENFORCE_EQ( + d_scale && d_bias, + true, + phi::errors::PreconditionNotMet( + "Both the scale grad and the bias grad must not be null.")); + + dev_ctx.template Alloc>(d_scale); + dev_ctx.template Alloc>(d_bias); + + PADDLE_ENFORCE_EQ( + scale_ptr->dims().size(), + 1UL, + phi::errors::PreconditionNotMet("The scale only has one dimension.")); + PADDLE_ENFORCE_EQ( + scale_ptr->dims()[0], + C, + phi::errors::PreconditionNotMet( + "The size of scale is equal to the channel of Input(X).")); + + std::vector dims = {N, C, H, W, D}; + std::vector strides = {H * W * C * D, 1, W * D * C, D * C, C}; + // ------------------- cudnn descriptors --------------------- + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + if (epsilon1 <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. 
Setting it to "
+               << "CUDNN_BN_MIN_EPSILON instead.";
+  }
+  epsilon1 = std::max(epsilon1, CUDNN_BN_MIN_EPSILON);
+
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor(
+      data_desc_,
+      CudnnDataType<T>::type,
+      in_dims.size() > 3 ? in_dims.size() : 4,
+      dims.data(),
+      strides.data()));
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor(
+      bn_param_desc_, data_desc_, mode_));
+
+  const auto *saved_mean_ptr = &saved_mean;
+  const auto *saved_var_ptr = &saved_variance;
+  const auto *saved_mean_data =
+      saved_mean_ptr->template data<BatchNormParamType<T>>();
+  const auto *saved_var_data =
+      saved_var_ptr->template data<BatchNormParamType<T>>();
+
+  size_t workspace_size = 0;
+  void *workspace_ptr = nullptr;
+  phi::DenseTensor workspace_tensor;
+  auto reserve_space_size = reserve_space_ptr->memory_size();
+  cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION;
+  phi::backends::gpu::ScopedActivationDescriptor scope_act_desc;
+  cudnnActivationDescriptor_t activation_desc_ =
+      scope_act_desc.descriptor<T>(act_type);
+  // --------------- cudnn batchnorm workspace ---------------
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize(
+          /*handle=*/dev_ctx.cudnn_handle(),
+          /*mode=*/mode_,
+          /*bnOps=*/bnOps_,
+          /*xDesc=*/data_desc_,
+          /*yDesc=*/data_desc_,
+          /*dyDesc=*/data_desc_,
+          /*dzDesc=*/data_desc_,
+          /*dxDesc=*/data_desc_,
+          /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
+          /*activationDesc=*/activation_desc_,
+          /*sizeInBytes=*/&workspace_size));
+
+  workspace_tensor.Resize({static_cast<int64_t>(workspace_size)});
+  workspace_ptr = dev_ctx.template Alloc<T>(&workspace_tensor);
+
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBatchNormalizationBackwardEx(
+      /*handle=*/dev_ctx.cudnn_handle(),
+      /*mode=*/mode_,
+      /*bnOps=*/bnOps_,
+      /*alphaDataDiff=*/CudnnDataType<T>::kOne(),
+      /*betaDataDiff=*/CudnnDataType<T>::kZero(),
+      /*alphaParamDiff=*/CudnnDataType<T>::kOne(),
+      /*betaParamDiff=*/CudnnDataType<T>::kZero(),
+      /*xDesc=*/data_desc_,
+      /*xData=*/x_ptr->template data<T>(),
+      /*yDesc=*/data_desc_,
+      /*yData=*/y_ptr->template data<T>(),
+      /*dyDesc=*/data_desc_,
+      /*dyData=*/d_y->template data<T>(),
+      /*dzDesc=*/data_desc_,
+      /*dzData=*/d_z->template data<T>(),
+      /*dxDesc=*/data_desc_,
+      /*dxData=*/d_x->template data<T>(),
+      /*dBnScaleBiasDesc=*/bn_param_desc_,
+      /*bnScaleData=*/scale_ptr->template data<BatchNormParamType<T>>(),
+      /*bnBiasData=*/bias_ptr->template data<BatchNormParamType<T>>(),
+      /*dBnScaleData=*/d_scale->template data<BatchNormParamType<T>>(),
+      /*dBnBiasData=*/d_bias->template data<BatchNormParamType<T>>(),
+      /*epsilon=*/epsilon1,
+      /*savedMean=*/saved_mean_data,
+      /*savedInvVariance=*/saved_var_data,
+      /*activationDesc=*/activation_desc_,
+      /*workspace=*/workspace_ptr,
+      /*workSpaceSizeInBytes=*/workspace_size,
+      /*reserveSpace=*/const_cast<T *>(reserve_space_ptr->template data<T>()),
+      /*reserveSpaceSizeInBytes=*/reserve_space_size));
+
+  // clean when exit.
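+  // cudnnBatchNormalizationBackwardEx takes reserveSpace as a non-const
+  // void *, hence the const_cast on the forward pass's ReserveSpace output.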
+ PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); +} + +} // namespace fusion +} // namespace phi + +PD_REGISTER_KERNEL(fused_bn_add_activation_grad, + GPU, + ALL_LAYOUT, + phi::fusion::FusedBatchNormAddActGradKernel, + phi::dtype::float16) { + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); +} diff --git a/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_kernel.cu new file mode 100644 index 0000000000000..7b5b4119cf970 --- /dev/null +++ b/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_kernel.cu @@ -0,0 +1,227 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_dnn.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/flags.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/activation_functor.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" +#include "paddle/phi/kernels/fused_bn_add_activation_kernel.h" + +PHI_DECLARE_bool(cudnn_batchnorm_spatial_persistent); + +namespace phi { +namespace fusion { + +template +using CudnnDataType = phi::backends::gpu::CudnnDataType; +template +using BatchNormParamType = typename CudnnDataType::BatchNormParamType; + +template +void FusedBatchNormAddActKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &z, + const DenseTensor &scale, + const DenseTensor &bias, + const DenseTensor &mean, + const DenseTensor &variance, + float momentum, + float epsilon, + const std::string &act_type, + DenseTensor *y, + DenseTensor *mean_out, + DenseTensor *variance_out, + DenseTensor *saved_mean, + DenseTensor *saved_variance, + DenseTensor *reserve_space) { +#if CUDNN_VERSION < 7401 + PADDLE_THROW(phi::errors::Unimplemented( + "The fused_bn_add_activation operator is not supported on GPU " + "when CUDNN version < 7.4.1")); +#endif + bool is_gpu_place = dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU; + PADDLE_ENFORCE_EQ(is_gpu_place, + true, + phi::errors::PreconditionNotMet("It must use CUDAPlace.")); + + double epsilon1 = static_cast(epsilon); + if (epsilon1 <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon1 = std::max(static_cast(epsilon1), CUDNN_BN_MIN_EPSILON); + + // Get the size for each dimension. 
+ // NHWC [batch_size, in_height, in_width, in_channels] + const auto &in_dims = x.dims(); + + dev_ctx.template Alloc>( + mean_out, mean_out->numel() * sizeof(BatchNormParamType)); + dev_ctx.template Alloc>( + variance_out, variance_out->numel() * sizeof(BatchNormParamType)); + + dev_ctx.template Alloc>( + saved_mean, saved_mean->numel() * sizeof(BatchNormParamType)); + dev_ctx.template Alloc>( + saved_variance, saved_variance->numel() * sizeof(BatchNormParamType)); + + dev_ctx.template Alloc(y, y->numel() * sizeof(T)); + + int N, C, H, W, D; + const DataLayout data_layout = DataLayout::kNHWC; + phi::funcs::ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D); + + // ------------------- cudnn descriptors --------------------- + auto handle = dev_ctx.cudnn_handle(); + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + + std::vector dims = {N, C, H, W, D}; + std::vector strides = {H * W * D * C, 1, W * D * C, D * C, C}; + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor( + data_desc_, + CudnnDataType::type, + in_dims.size() > 3 ? in_dims.size() : 4, + dims.data(), + strides.data())); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); + + double this_factor = 1. - momentum; + cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION; + phi::backends::gpu::ScopedActivationDescriptor scope_act_desc; + cudnnActivationDescriptor_t activation_desc_ = + scope_act_desc.descriptor(act_type); + size_t workspace_size = 0; + size_t reserve_space_size = 0; + void *reserve_space_ptr = nullptr; + void *workspace_ptr = nullptr; + phi::DenseTensor workspace_tensor; + // Create reserve space and workspace for batch norm. + // Create tensor for each batchnorm op, it will be used in the + // backward. Thus this tensor shouldn't be temp. 
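+  // The required byte size is queried from cuDNN below
+  // (cudnnGetBatchNormalizationTrainingExReserveSpaceSize) and rounded up
+  // to a whole number of elements of T before allocation; the backward
+  // kernel reads this buffer back as-is.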
+ PADDLE_ENFORCE_NOT_NULL( + reserve_space, + phi::errors::NotFound( + "The argument ReserveSpace of batch_norm op is not found.")); + + // --------------- cudnn batchnorm workspace --------------- + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( + /*handle=*/handle, + /*mode=*/mode_, + /*bnOps=*/bnOps_, + /*xDesc=*/data_desc_, + /*zDesc=*/data_desc_, + /*yDesc=*/data_desc_, + /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, + /*activationDesc=*/activation_desc_, + /*sizeInBytes=*/&workspace_size)); + + // -------------- cudnn batchnorm reserve space -------------- + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnGetBatchNormalizationTrainingExReserveSpaceSize( + /*handle=*/handle, + /*mode=*/mode_, + /*bnOps=*/bnOps_, + /*activationDesc=*/activation_desc_, + /*xDesc=*/data_desc_, + /*sizeInBytes=*/&reserve_space_size)); + + reserve_space->Resize( + {static_cast((reserve_space_size + phi::SizeOf(x.dtype()) - 1) / + phi::SizeOf(x.dtype()))}); + reserve_space_ptr = dev_ctx.template Alloc( + reserve_space, reserve_space->numel() * sizeof(T)); + workspace_tensor.Resize({static_cast( + (workspace_size + phi::SizeOf(x.dtype()) - 1) / phi::SizeOf(x.dtype()))}); + workspace_ptr = dev_ctx.template Alloc( + &workspace_tensor, workspace_tensor.numel() * sizeof(T)); + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnBatchNormalizationForwardTrainingEx( + handle, + mode_, + bnOps_, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + x.template data(), + data_desc_, + z.template data(), + data_desc_, + y->template data(), + bn_param_desc_, + scale.template data>(), + bias.template data>(), + this_factor, + dev_ctx.template Alloc>( + mean_out, mean_out->numel() * sizeof(BatchNormParamType)), + dev_ctx.template Alloc>( + variance_out, + variance_out->numel() * sizeof(BatchNormParamType)), + epsilon1, + dev_ctx.template Alloc>( + saved_mean, saved_mean->numel() * sizeof(BatchNormParamType)), + dev_ctx.template Alloc>( + saved_variance, + saved_variance->numel() * sizeof(BatchNormParamType)), + activation_desc_, + workspace_ptr, + workspace_size, + reserve_space_ptr, + reserve_space_size)); + + // clean when exit. 
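+  // data_desc_ and bn_param_desc_ were created with
+  // cudnnCreateTensorDescriptor at the top of this kernel; destroy them so
+  // repeated launches do not leak descriptor handles.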
+ PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); +} + +} // namespace fusion +} // namespace phi + +PD_REGISTER_KERNEL(fused_bn_add_activation, + GPU, + ALL_LAYOUT, + phi::fusion::FusedBatchNormAddActKernel, + phi::dtype::float16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); +} diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index ad276ec6f1812..3b73935699bab 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -1255,6 +1255,9 @@ PD_REGISTER_KERNEL(batch_norm, kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); } +#if CUDNN_VERSION_MIN(7, 4, 1) + kernel->OutputAt(5).SetDataType(phi::DataType::UINT8); +#endif } #else PD_REGISTER_KERNEL(batch_norm, @@ -1274,6 +1277,9 @@ PD_REGISTER_KERNEL(batch_norm, kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); } +#if CUDNN_VERSION_MIN(7, 4, 1) + kernel->OutputAt(5).SetDataType(phi::DataType::UINT8); +#endif } #endif diff --git a/paddle/phi/kernels/gpu/quantize_linear_kernel.cu b/paddle/phi/kernels/gpu/quantize_linear_kernel.cu new file mode 100644 index 0000000000000..11c043e76f464 --- /dev/null +++ b/paddle/phi/kernels/gpu/quantize_linear_kernel.cu @@ -0,0 +1,130 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
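+
+// DequantizeOneScaleQuantAxisN below recovers the channel of a flattened
+// element index i as (i / quant_stride) % n_scales, where quant_stride is
+// the product of the dimensions after quant_axis. For example, a weight of
+// dims {64, 3, 3, 3} quantized along axis 0 has quant_stride = 27, so
+// elements 0..26 share scale[0], elements 27..53 share scale[1], and so on.
+// Both kernels use grid-stride loops, so any grid size produces correct
+// results.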
+ +#include + +#include "paddle/phi/kernels/quantize_linear_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/type_traits.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/quantize_linear_impl.h" + +namespace phi { + +template +__global__ void KeDequantize( + const T* in, const T* scale, T max_range, int64_t num, T* out) { + int64_t idx = threadIdx.x + blockIdx.x * blockDim.x; + for (int64_t i = idx; i < num; i += blockDim.x * gridDim.x) { + out[i] = in[i] * scale[0] / max_range; + } +} + +template +__global__ void DequantizeOneScaleQuantAxisN(const T* in, + const T* scale, + const T max_range, + const int64_t num, + const int n_scales, + const int quant_stride, + T* out) { + int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; + for (int64_t i = idx; i < num; i += blockDim.x * gridDim.x) { + T s = scale[(i / quant_stride) % n_scales]; + out[i] = in[i] * s / max_range; + } +} + +template +struct ChannelDequantizeFunctorV2 { + void operator()(const phi::GPUContext& dev_ctx, + const phi::DenseTensor* in, + const phi::DenseTensor* scale, + T max_range, + const int quant_axis, + phi::DenseTensor* out) { + auto in_dims = in->dims(); + const T* in_data = in->data(); + T* out_data = dev_ctx.template Alloc(out, out->numel() * sizeof(T)); + int64_t num = in->numel(); + const T* scale_factor = scale->data(); + int64_t block_size = std::min( + num, static_cast(dev_ctx.GetMaxThreadsPerBlock() / 4)); + int64_t max_threads = + dev_ctx.GetMaxPhysicalThreadCount(); // SM * block_per_SM + const int64_t max_blocks = + std::max(((max_threads - 1) / block_size + 1), static_cast(1)); + const int64_t grid_size = + std::min(max_blocks, (num + block_size - 1) / block_size); + + int quant_stride = 1; + for (int i = quant_axis + 1; i < in_dims.size(); i++) { + quant_stride *= in_dims[i]; + } + + DequantizeOneScaleQuantAxisN + <<>>(in_data, + scale_factor, + max_range, + num, + in_dims[quant_axis], + quant_stride, + out_data); + } +}; + +template +struct DequantizeFunctor { + void operator()(const phi::GPUContext& dev_ctx, + const phi::DenseTensor* in, + const phi::DenseTensor* scale, + T max_range, + phi::DenseTensor* out) { + const T* in_data = in->data(); + const T* scale_factor = scale->data(); + T* out_data = dev_ctx.template Alloc(out, out->numel() * sizeof(T)); + + int64_t num = in->numel(); + int64_t block_size = std::min( + num, static_cast(dev_ctx.GetMaxThreadsPerBlock() / 4)); + int64_t max_threads = + dev_ctx.GetMaxPhysicalThreadCount(); // SM * block_per_SM + const int64_t max_blocks = + std::max(((max_threads - 1) / block_size + 1), static_cast(1)); + const int64_t grid_size = + std::min(max_blocks, (num + block_size - 1) / block_size); + KeDequantize<<>>( + in_data, scale_factor, max_range, num, out_data); + } +}; + +template struct DequantizeFunctor; +template struct DequantizeFunctor; +template struct DequantizeFunctor; +template struct ChannelDequantizeFunctorV2; +template struct ChannelDequantizeFunctorV2; +template struct ChannelDequantizeFunctorV2; +} // namespace phi + +PD_REGISTER_KERNEL(dequantize_linear, + GPU, + ALL_LAYOUT, + phi::DeQuantizeLinearKernel, + float, + int8_t, + double, + phi::dtype::float16) { + kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); +} diff --git a/paddle/phi/kernels/impl/quantize_linear_impl.h b/paddle/phi/kernels/impl/quantize_linear_impl.h new file mode 100644 index 0000000000000..9f86fd07447ee --- /dev/null +++ b/paddle/phi/kernels/impl/quantize_linear_impl.h @@ -0,0 +1,127 @@ 
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/phi/kernels/quantize_linear_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/cast_kernel.h" + +namespace phi { + +template +struct DequantizeFunctor { + void operator()(const Context& dev_ctx, + const phi::DenseTensor* in, + const phi::DenseTensor* scale, + T max_range, + phi::DenseTensor* out); +}; + +template +struct ChannelDequantizeFunctorV2 { + void operator()(const Context& dev_ctx, + const phi::DenseTensor* in, + const phi::DenseTensor** scales, + const int scale_num, + T max_range, + const int quant_axis, + phi::DenseTensor* out); +}; + +template +void DeQuantizeLinearImpl(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& scale, + int quant_axis, + int bit_length, + bool only_observer, + DenseTensor* out) { + auto* in = &x; + + auto in_tmp = phi::Cast(dev_ctx, *in, phi::CppTypeToDataType::Type()); + + dev_ctx.template Alloc(out, out->numel() * sizeof(D)); + + if (only_observer) { + phi::Copy(dev_ctx, *in, dev_ctx.GetPlace(), false, out); + return; + } + + if (quant_axis < 0) { + float max_range = (std::pow(2, bit_length - 1) - 1); + DequantizeFunctor()( + dev_ctx, &in_tmp, &scale, static_cast(max_range), out); + } else { + PADDLE_ENFORCE_EQ( + scale.numel(), + in_tmp.dims()[quant_axis], + phi::errors::PreconditionNotMet( + "The number of first scale values must be the same with " + "quant_axis dimension value of Input(X) when the `scale` has " + "only one element, but %ld != %ld here.", + scale.numel(), + in_tmp.dims()[quant_axis])); + int max_range = (std::pow(2, bit_length - 1) - 1); + + ChannelDequantizeFunctorV2()( + dev_ctx, &in_tmp, &scale, static_cast(max_range), quant_axis, out); + } +} + +template +void DeQuantizeLinearKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& zero_point, + const paddle::optional& in_accum, + const paddle::optional& in_state, + int quant_axis, + int bit_length, + int round_type, + bool is_test, + bool only_observer, + DenseTensor* out, + DenseTensor* out_state, + DenseTensor* out_accum, + DenseTensor* out_scale) { + switch (scale.dtype()) { + case phi::DataType::FLOAT64: + DeQuantizeLinearImpl( + dev_ctx, x, scale, quant_axis, bit_length, only_observer, out); + break; + case phi::DataType::FLOAT32: + DeQuantizeLinearImpl( + dev_ctx, x, scale, quant_axis, bit_length, only_observer, out); + break; + case phi::DataType::FLOAT16: + DeQuantizeLinearImpl( + dev_ctx, x, scale, quant_axis, bit_length, only_observer, out); + break; + default: + PADDLE_THROW(phi::errors::Unimplemented( + "In DeQuantizeLinearKernel, " + "data type %d for scale/output is not supported ", + scale.dtype())); + break; + } +} + +} // namespace phi diff --git 
a/paddle/phi/kernels/quantize_linear_kernel.h b/paddle/phi/kernels/quantize_linear_kernel.h new file mode 100644 index 0000000000000..c10a67f51e603 --- /dev/null +++ b/paddle/phi/kernels/quantize_linear_kernel.h @@ -0,0 +1,40 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void DeQuantizeLinearKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& zero_point, + const paddle::optional& in_accum, + const paddle::optional& in_state, + int quant_axis, + int bit_length, + int round_type, + bool is_test, + bool only_observer, + DenseTensor* out, + DenseTensor* out_state, + DenseTensor* out_accum, + DenseTensor* out_scale); + +} // namespace phi diff --git a/paddle/phi/kernels/xpu/batch_norm_kernel.cc b/paddle/phi/kernels/xpu/batch_norm_kernel.cc index b95dda1fed13d..e2f2d28182b67 100644 --- a/paddle/phi/kernels/xpu/batch_norm_kernel.cc +++ b/paddle/phi/kernels/xpu/batch_norm_kernel.cc @@ -140,4 +140,9 @@ PD_REGISTER_KERNEL(batch_norm, ALL_LAYOUT, phi::BatchNormKernel, float, - phi::dtype::float16) {} + phi::dtype::float16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); +} diff --git a/paddle/phi/ops/compat/fused_bn_add_activation_sig.cc b/paddle/phi/ops/compat/fused_bn_add_activation_sig.cc new file mode 100644 index 0000000000000..a9adffca84700 --- /dev/null +++ b/paddle/phi/ops/compat/fused_bn_add_activation_sig.cc @@ -0,0 +1,52 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature FusedBatchNormAddActOpArgumentMapping( + const ArgumentMappingContext& ctx UNUSED) { + return KernelSignature("fused_bn_add_activation", + {"X", "Z", "Scale", "Bias", "Mean", "Variance"}, + {"momentum", "epsilon", "act_type"}, + {"Y", + "MeanOut", + "VarianceOut", + "SavedMean", + "SavedVariance", + "ReserveSpace"}); +} + +KernelSignature FusedBatchNormAddActGradOpArgumentMapping( + const ArgumentMappingContext& ctx UNUSED) { + return KernelSignature("fused_bn_add_activation_grad", + {"X", + "Y", + "Y@GRAD", + "Scale", + "Bias", + "SavedMean", + "SavedVariance", + "ReserveSpace"}, + {"momentum", "epsilon", "act_type"}, + {"X@GRAD", "Z@GRAD", "Scale@GRAD", "Bias@GRAD"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(fused_bn_add_activation, + phi::FusedBatchNormAddActOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fused_bn_add_activation_grad, + phi::FusedBatchNormAddActGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/quantize_linear_sig.cc b/paddle/phi/ops/compat/quantize_linear_sig.cc new file mode 100644 index 0000000000000..75e523bf55367 --- /dev/null +++ b/paddle/phi/ops/compat/quantize_linear_sig.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature DeQuantizeLinearOpArgumentMapping( + const ArgumentMappingContext& ctx UNUSED) { + return KernelSignature( + "dequantize_linear", + {"X", "Scale", "ZeroPoint", "InAccum", "InState"}, + {"quant_axis", "bit_length", "round_type", "is_test", "only_observer"}, + {"Y", "OutState", "OutAccum", "OutScale"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(dequantize_linear, + phi::DeQuantizeLinearOpArgumentMapping); diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt index 875f8164e380c..2768babd07f13 100644 --- a/test/legacy_test/CMakeLists.txt +++ b/test/legacy_test/CMakeLists.txt @@ -1279,6 +1279,7 @@ set(STATIC_BUILD_TESTS test_adamw_op test_arg_min_max_op test_assign_pos_op + test_batch_norm_op test_bucketize_api test_bincount_op test_c_embedding_op @@ -1286,6 +1287,7 @@ set(STATIC_BUILD_TESTS test_decoupled_py_reader test_eig_op test_eigh_op + test_fake_dequantize_op test_fake_quantize_op test_fetch_lod_tensor_array test_ftrl_op diff --git a/test/legacy_test/test_fake_dequantize_op.py b/test/legacy_test/test_fake_dequantize_op.py index ee2f7f7b0820a..9fc5f3500844f 100644 --- a/test/legacy_test/test_fake_dequantize_op.py +++ b/test/legacy_test/test_fake_dequantize_op.py @@ -247,7 +247,7 @@ def setUp(self): self.outputs = {'Y': ydq} def test_check_output(self): - self.check_output() + self.check_output(check_dygraph=False) class TestChannelWiseDequantizeOp1(TestChannelWiseDequantizeOp): @@ -281,7 +281,7 @@ def setUp(self): self.outputs = {'Y': ydq} def test_check_output(self): - self.check_output() + self.check_output(check_dygraph=False) class TestDequantizeOpDouble(TestDequantizeOp): From 74e7a6caab9d85a31889b910d565edd943ccd8d8 Mon Sep 17 00:00:00 2001 From: Android zhang <53324261+zade23@users.noreply.github.com> Date: Mon, 9 Oct 2023 16:56:40 +0800 Subject: [PATCH 28/62] [Docathon] Fix NO.13-NO.18 API label (#57658) * fix 13_18 * Update python/paddle/optimizer/optimizer.py Co-authored-by: ooo oo <106524776+ooooo-create@users.noreply.github.com> * Update python/paddle/optimizer/optimizer.py Co-authored-by: ooo oo <106524776+ooooo-create@users.noreply.github.com> * Update python/paddle/nn/clip.py Co-authored-by: ooo oo <106524776+ooooo-create@users.noreply.github.com> * Update python/paddle/incubate/optimizer/modelaverage.py Co-authored-by: ooo oo <106524776+ooooo-create@users.noreply.github.com> * Update python/paddle/incubate/optimizer/lookahead.py Co-authored-by: ooo oo <106524776+ooooo-create@users.noreply.github.com> * Update python/paddle/distributed/fleet/fleet.py Co-authored-by: ooo oo <106524776+ooooo-create@users.noreply.github.com> * Update base.py fix ref_dygraph --------- Co-authored-by: ooo oo <106524776+ooooo-create@users.noreply.github.com> --- python/paddle/base/dygraph/base.py | 5 ++--- python/paddle/base/framework.py | 2 +- python/paddle/distributed/fleet/fleet.py | 2 +- python/paddle/incubate/optimizer/lookahead.py | 2 +- python/paddle/incubate/optimizer/modelaverage.py | 2 +- python/paddle/nn/clip.py | 2 +- python/paddle/optimizer/optimizer.py | 4 ++-- 7 files changed, 9 insertions(+), 10 deletions(-) diff --git a/python/paddle/base/dygraph/base.py b/python/paddle/base/dygraph/base.py index 3c89b56d66006..52055fc8f55e0 100644 --- a/python/paddle/base/dygraph/base.py +++ b/python/paddle/base/dygraph/base.py @@ -161,9 +161,8 @@ def _convert_into_variable(tensor): def enabled(): """ This function checks whether the program runs in dynamic graph mode 
or not. - You can enter dynamic graph mode with :ref:`api_base_dygraph_guard` api, - or enable and disable dynamic graph mode with :ref:`api_base_dygraph_enable_dygraph` - and :ref:`api_base_dygraph_disable_dygraph` api . + You can enable dynamic graph mode with :ref:`api_paddle_disable_static` api, + or disable dynamic graph mode with :ref:`api_paddle_enable_static` . **Note**: ``base.dygraph.enabled`` is the alias of ``base.in_dygraph_mode``, and diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py index 0e1c62f4fb850..92f06227a0a1d 100644 --- a/python/paddle/base/framework.py +++ b/python/paddle/base/framework.py @@ -1385,7 +1385,7 @@ class Variable(metaclass=VariableMetaClass): In Static Graph Mode: Please use ** `Block.create_var` ** to create a Static variable which has no data until being feed. - In Dygraph Mode: Please use ** :ref:`api_base_dygraph_to_variable` ** to create a dygraph variable with real data. + In Dygraph Mode: Please use ** :ref:`api_paddle_to_tensor` ** to create a dygraph variable with real data. In Fluid, every input and output of an OP is a variable. In most cases, variables are used for holding different kinds of data or training diff --git a/python/paddle/distributed/fleet/fleet.py b/python/paddle/distributed/fleet/fleet.py index eee2ae02c9c88..5e90584b25b5e 100755 --- a/python/paddle/distributed/fleet/fleet.py +++ b/python/paddle/distributed/fleet/fleet.py @@ -1254,7 +1254,7 @@ def minimize( loss (Tensor): A ``Tensor`` containing the value to minimize. startup_program (Program, optional): :ref:`api_paddle_static_Program` for initializing parameters in ``parameter_list``. The default value - is None, at this time :ref:`api_base_default_startup_program` will be used. + is None, at this time :ref:`api_paddle_static_default_startup_program` will be used. parameter_list (Iterable, optional): Iterable of ``Tensor`` or ``Tensor.name`` to update to minimize ``loss``. The default value is None, at this time all parameters will be updated. diff --git a/python/paddle/incubate/optimizer/lookahead.py b/python/paddle/incubate/optimizer/lookahead.py index cb6e10fefc61e..821b5c3ce036c 100644 --- a/python/paddle/incubate/optimizer/lookahead.py +++ b/python/paddle/incubate/optimizer/lookahead.py @@ -252,7 +252,7 @@ def minimize( loss (Tensor): A ``Tensor`` containing the value to minimize. startup_program (Program, optional): :ref:`api_paddle_static_Program` for initializing parameters in ``parameters``. The default value - is None, at this time :ref:`api_base_default_startup_program` will be used. + is None, at this time :ref:`api_paddle_static_default_startup_program` will be used. parameters (list, optional): List of ``Tensor`` or ``Tensor.name`` to update to minimize ``loss``. The default value is None, at this time all parameters will be updated. diff --git a/python/paddle/incubate/optimizer/modelaverage.py b/python/paddle/incubate/optimizer/modelaverage.py index ecab0f307304d..8de533f9f0a4b 100644 --- a/python/paddle/incubate/optimizer/modelaverage.py +++ b/python/paddle/incubate/optimizer/modelaverage.py @@ -302,7 +302,7 @@ def minimize( loss (Tensor): A ``Tensor`` containing the value to minimize. startup_program (Program, optional): :ref:`api_paddle_static_Program` for initializing parameters in ``parameters``. The default value - is None, at this time :ref:`api_base_default_startup_program` will be used. + is None, at this time :ref:`api_paddle_static_default_startup_program` will be used. 
parameters (list, optional): List of ``Tensor`` or ``Tensor.name`` to update to minimize ``loss``. The default value is None, at this time all parameters will be updated. diff --git a/python/paddle/nn/clip.py b/python/paddle/nn/clip.py index 5fda0adff5efa..13742ae6d9be8 100644 --- a/python/paddle/nn/clip.py +++ b/python/paddle/nn/clip.py @@ -965,7 +965,7 @@ def set_gradient_clip(clip, param_list=None, program=None): It can be a list of parameter or a list of parameter's name. Default None, meaning that all parameters in the program will be included. program (Program, optional): The program where parameters are located. - Default None, meaning that using :ref:`api_base_default_main_program` . + Default None, meaning that using :ref:`api_paddle_static_default_main_program` . Returns: None diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 2e1314a3a1536..6d6ecfb220c69 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -1243,7 +1243,7 @@ def backward( loss (Tensor): ``loss`` tensor to run optimizations. startup_program (Program, optional): :ref:`api_paddle_static_Program` for initializing parameters in ``parameters``. The default value - is None, at this time :ref:`api_base_default_startup_program` will be used. + is None, at this time :ref:`api_paddle_static_default_startup_program` will be used. parameters (list, optional): List of ``Tensor`` or ``Tensor.name`` to update to minimize ``loss``. The default value is None, at this time all parameters will be updated. @@ -1604,7 +1604,7 @@ def minimize( loss (Tensor): A ``Tensor`` containing the value to minimize. startup_program (Program, optional): :ref:`api_paddle_static_Program` for initializing parameters in ``parameters``. The default value - is None, at this time :ref:`api_base_default_startup_program` will be used. + is None, at this time :ref:`api_paddle_static_default_startup_program` will be used. parameters (list, optional): List of ``Tensor`` or ``Tensor.name`` to update to minimize ``loss``. The default value is None, at this time all parameters will be updated. From ac3ae610c0f27dc6c2066b858f8ac1115bfd6966 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Mon, 9 Oct 2023 17:04:51 +0800 Subject: [PATCH 29/62] Fix NaN for softmax with long softmax_dim. (#57851) * Fix nan for softmax with long softmax_dim. * Simplify codes. * Remove the PADDLE_THROW in backward. --- paddle/phi/kernels/funcs/aligned_vector.h | 5 + paddle/phi/kernels/funcs/broadcast_function.h | 5 - paddle/phi/kernels/gpudnn/softmax_gpudnn.h | 92 ++++++------------- 3 files changed, 33 insertions(+), 69 deletions(-) diff --git a/paddle/phi/kernels/funcs/aligned_vector.h b/paddle/phi/kernels/funcs/aligned_vector.h index 558e7dc999cf8..753aa44b0aa3a 100644 --- a/paddle/phi/kernels/funcs/aligned_vector.h +++ b/paddle/phi/kernels/funcs/aligned_vector.h @@ -30,6 +30,11 @@ struct NeedVectorized { static constexpr bool value = sizeof(T) <= sizeof(float); }; +template +struct MaxWithOne { + static constexpr auto kValue = (N >= 1 ? N : 1); +}; + // Aligned vector generates vectorized load/store on CUDA. 
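Two details of the softmax fix are worth spelling out. First, MaxWithOne, added above, clamps a compile-time vector size to at least 1, so a type wider than the alignment unit cannot yield a zero vector length. Second, in the hunks below the seed of the row-max reduction changes from std::numeric_limits::min() to negative infinity: for floating-point types, min() is the smallest positive value, not the most negative one, so a row whose entries are all strongly negative reduced to a "max" of roughly zero; every exp(v - max) then underflowed to exactly 0 and the normalization divided 0 by 0, which is the likely source of the NaN the commit title refers to. A standalone illustration (not part of the patch):

    #include <algorithm>
    #include <cstdio>
    #include <limits>

    int main() {
      const float vals[3] = {-1000.f, -1001.f, -1002.f};

      // Wrong identity for a max reduction: numeric_limits<float>::min()
      // is ~1.18e-38, i.e. a *positive* number.
      float bad = std::numeric_limits<float>::min();
      // Correct identity: negative infinity.
      float good = -std::numeric_limits<float>::infinity();
      for (float v : vals) {
        bad = std::max(bad, v);
        good = std::max(good, v);
      }
      // Prints bad=1.17549e-38 good=-1000: with `bad`, exp(v - bad)
      // underflows to 0 for every element and softmax computes 0/0 = NaN;
      // with `good`, the largest shifted term is exp(0) = 1.
      std::printf("bad=%g good=%g\n", bad, good);
      return 0;
    }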
template struct alignas(sizeof(T) * Size) AlignedVector { diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 2ba3271d2c7df..83c5c7d9c3c1b 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -530,11 +530,6 @@ HOSTDEVICE static int64_t ConvertSrcIdxToDstIdx( return dst_idx; } -template -struct MaxWithOne { - static constexpr auto kValue = (N >= 1 ? N : 1); -}; - template struct ReadVecDataWithInt64Index { template diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h index a4571b83e39e7..9f8dd99b20d6a 100644 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -30,29 +30,6 @@ limitations under the License. */ #define MATRIX_SOFTMAX_ALIGN_BYTES 16 #define MATRIX_SOFTMAX_THREAHOLD 100000 -#define FIXED_BLOCK_DIM_BASE(dim, ...) \ - case (dim): { \ - constexpr auto kBlockDim = (dim); \ - __VA_ARGS__; \ - } break - -#define FIXED_VEC_SIZE_BASE(vec_size, ...) \ - case (vec_size): { \ - constexpr auto VecSize = (vec_size); \ - __VA_ARGS__; \ - } break - -#define FIXED_BLOCK_DIM(...) \ - FIXED_BLOCK_DIM_BASE(512, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__) - -#define FIXED_VEC_SIZE(...) \ - FIXED_VEC_SIZE_BASE(8, ##__VA_ARGS__); \ - FIXED_VEC_SIZE_BASE(4, ##__VA_ARGS__) - namespace phi { using ScopedTensorDescriptor = phi::backends::gpu::ScopedTensorDescriptor; @@ -112,7 +89,7 @@ static inline int Log2Ceil(int value) { return log2_value; } -inline int getBlockSize(int vec_size, uint64_t dim_size) { +inline int CalcBlockSize(int vec_size, uint64_t dim_size) { uint64_t block_size = 1; uint64_t max_block_size = std::min(dim_size / vec_size, static_cast(1024)); @@ -461,14 +438,11 @@ __device__ __forceinline__ void ThreadVecWrite(T* out, } } -template +template __global__ void KeMatrixSoftmaxForward(T* softmax, const T* src, int dim_size) { - using VecT = phi::AlignedVector; + constexpr int kVecSize = + MaxWithOne::kValue; + using VecT = phi::AlignedVector; int bid = blockIdx.x; T* batch_input = const_cast(src) + bid * dim_size; @@ -480,16 +454,16 @@ __global__ void KeMatrixSoftmaxForward(T* softmax, const T* src, int dim_size) { ((uint64_t)batch_output) % MATRIX_SOFTMAX_ALIGN_BYTES / sizeof(T); // get max value - AccT thread_max = ThreadVecReduce( + AccT thread_max = ThreadVecReduce( batch_input, dim_size, input_align_shift, MaxFunctor(), - std::numeric_limits::min()); + -std::numeric_limits::infinity()); BlockReduceMax(&thread_max); // get exp value and sum all - AccT thread_exp = ThreadVecReduce( + AccT thread_exp = ThreadVecReduce( batch_input, dim_size, input_align_shift, @@ -501,19 +475,19 @@ __global__ void KeMatrixSoftmaxForward(T* softmax, const T* src, int dim_size) { if (LogMode) { LogSoftmaxForwardFunctor reduction(thread_max, thread_exp); if (input_align_shift == output_align_shift) { - ThreadVecWriteVec( + ThreadVecWriteVec( batch_output, batch_input, dim_size, input_align_shift, reduction); } else { - ThreadVecWrite( + ThreadVecWrite( batch_output, batch_input, dim_size, reduction); } } else { SoftmaxForwardFunctor reduction(thread_max, thread_exp); if (input_align_shift == output_align_shift) { - ThreadVecWriteVec( + ThreadVecWriteVec( batch_output, batch_input, dim_size, input_align_shift, reduction); } else { - 
ThreadVecWrite( + ThreadVecWrite( batch_output, batch_input, dim_size, reduction); } } @@ -785,9 +759,9 @@ void SwitchWarpSoftmaxForward(const IndexType blocks, const IndexType batch_size, const IndexType stride, const IndexType element_count, - IndexType Log2Elements) { + IndexType log2_element_count) { using AccT = typename phi::dtype::MPTypeTrait::Type; - switch (Log2Elements) { + switch (log2_element_count) { SOFTMAX_WARP_FORWARD_CASE(0, AccT); SOFTMAX_WARP_FORWARD_CASE(1, AccT); SOFTMAX_WARP_FORWARD_CASE(2, AccT); @@ -800,6 +774,10 @@ void SwitchWarpSoftmaxForward(const IndexType blocks, SOFTMAX_WARP_FORWARD_CASE(9, AccT); SOFTMAX_WARP_FORWARD_CASE(10, AccT); default: + PADDLE_THROW(phi::errors::Unimplemented( + "Unsupported softmax dim: element_count=%d, log2_element_count=%d!", + element_count, + log2_element_count)); break; } } @@ -824,9 +802,9 @@ void SwitchWarpSoftmaxBackward(const int blocks, const int batch_size, const int stride, const int element_count, - int Log2Elements) { + int log2_element_count) { using AccT = typename phi::dtype::MPTypeTrait::Type; - switch (Log2Elements) { + switch (log2_element_count) { SOFTMAX_WARP_BACKWARD_CASE(0, AccT); SOFTMAX_WARP_BACKWARD_CASE(1, AccT); SOFTMAX_WARP_BACKWARD_CASE(2, AccT); @@ -839,6 +817,9 @@ void SwitchWarpSoftmaxBackward(const int blocks, SOFTMAX_WARP_BACKWARD_CASE(9, AccT); SOFTMAX_WARP_BACKWARD_CASE(10, AccT); default: + // PADDLE_THROW(phi::errors::Unimplemented( + // "Unsupported softmax dim: element_count=%d, + // log2_element_count=%d!", element_count, log2_element_count)); break; } } @@ -1202,24 +1183,11 @@ template void LaunchKeMatrixSoftmaxForwardKernel( const GPUContext& dev_ctx, T* out, const T* input, int N, int dim_size) { using AccT = typename phi::dtype::MPTypeTrait::Type; - const int vec_size = MATRIX_SOFTMAX_ALIGN_BYTES / sizeof(T); - switch (getBlockSize(vec_size, dim_size)) { - FIXED_BLOCK_DIM(switch (vec_size) { - FIXED_VEC_SIZE( - KeMatrixSoftmaxForward - <<>>(out, input, dim_size)); - default: - break; - }); - default: - PADDLE_THROW( - errors::Fatal("the input dim has error in the softmax cuda kernel.")); - } + constexpr int kVecSize = + MaxWithOne::kValue; + int block_dim = CalcBlockSize(kVecSize, dim_size); + KeMatrixSoftmaxForward + <<>>(out, input, dim_size); } #if CUDNN_VERSION < 8100 @@ -1450,9 +1418,5 @@ void SoftmaxBackwardCUDAKernelDriver(const GPUContext& dev_ctx, dev_ctx, dx_data, dout.data(), out.data(), N, dim, D); } } -#undef FIXED_BLOCK_DIM_BASE -#undef FIXED_BLOCK_DIM -#undef FIXED_VEC_SIZE_BASE -#undef FIXED_VEC_SIZE } // namespace phi From f858ec7ba7059773f91ad964c5e5878bc209194f Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Mon, 9 Oct 2023 17:25:02 +0800 Subject: [PATCH 30/62] [CleanOps]del_rnn_memory_helper_op (#57926) * del_rnn_memory_helper_op --- .../interpreter/interpreter_util.cc | 2 - paddle/fluid/framework/prune_test.cc | 77 -------- .../analysis/ir_passes/dlnne_subgraph_pass.cc | 1 - .../fluid/operators/rnn_memory_helper_op.cc | 184 ------------------ paddle/fluid/operators/unity_build_rule.cmake | 1 - python/paddle/base/backward.py | 13 +- python/paddle/base/framework.py | 1 - test/ir/inference/program_config.py | 1 - test/legacy_test/test_rnn_memory_helper_op.py | 146 -------------- tools/parallel_UT_rule.py | 2 - tools/static_mode_white_list.py | 1 - 11 files changed, 2 insertions(+), 427 deletions(-) delete mode 100644 paddle/fluid/operators/rnn_memory_helper_op.cc delete mode 100644 test/legacy_test/test_rnn_memory_helper_op.py diff --git 
a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index b2fbed43f02fa..480d445017c21 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -620,8 +620,6 @@ void BuildOpFuncList(const platform::Place& place, "pylayer", "pylayer_grad" "recurrent_grad", - "rnn_memory_helper", - "rnn_memory_helper_grad", "while", "while_grad"}; bool allow_var_not_in_program = ops_with_var_not_in_program.count(op_type); diff --git a/paddle/fluid/framework/prune_test.cc b/paddle/fluid/framework/prune_test.cc index 9702805e176c2..8da75413e9d6d 100644 --- a/paddle/fluid/framework/prune_test.cc +++ b/paddle/fluid/framework/prune_test.cc @@ -198,80 +198,3 @@ TEST(Prune, multi_target) { f::Prune(*pdesc, feed_var_names, &pruned); EXPECT_EQ(pruned.blocks(0).ops_size(), 3); } - -TEST(Prune, recurrrent_op) { - f::ProgramDesc program; - f::BlockDesc *block = program.MutableBlock(0); - f::BlockDesc *sub_block = program.AppendBlock(*block); - AddOp("one_two", - {{"input", {"a"}}}, - {{"output", {"b", "c"}}}, - f::AttributeMap{}, - block); - - std::vector state_var_name(1, "y"); - AddOp("recurrent", - {{"input", {"b", "c"}}}, - {{"output", {"b1, c1"}}}, - {{"ex_states", state_var_name}, - {"states", state_var_name}, - {"sub_block", sub_block}}, - block); - - EXPECT_TRUE(sub_block != nullptr); - AddOp("rnn_memory_helper", - {{"input", {"x"}}}, - {{"output", {"y"}}}, - f::AttributeMap{}, - sub_block); - - f::proto::ProgramDesc *pdesc = program.Proto(); - pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true); - - f::proto::ProgramDesc pruned; - std::set feed_var_names = {"a"}; - - f::Prune(*pdesc, feed_var_names, &pruned); - EXPECT_EQ(pruned.blocks_size(), 2); - EXPECT_EQ(pruned.blocks(0).ops_size(), 2); - EXPECT_EQ(pruned.blocks(1).ops_size(), 1); -} - -// If the output of an op modifies feed vars, the op should not clip. 
-TEST(Prune, recurrrent_op_2) { - f::ProgramDesc program; - f::BlockDesc *block = program.MutableBlock(0); - f::BlockDesc *sub_block = program.AppendBlock(*block); - AddOp("one_two", - {{"input", {"a"}}}, - {{"output", {"b", "c"}}}, - f::AttributeMap{}, - block); - - std::vector state_var_name(1, "y"); - AddOp("recurrent", - {{"input", {"b", "c"}}}, - {{"output", {"b1, c1"}}}, - {{"ex_states", state_var_name}, - {"states", state_var_name}, - {"sub_block", sub_block}}, - block); - - EXPECT_TRUE(sub_block != nullptr); - AddOp("rnn_memory_helper", - {{"input", {"x"}}}, - {{"output", {"a"}}}, - f::AttributeMap{}, - sub_block); - - f::proto::ProgramDesc *pdesc = program.Proto(); - pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true); - - f::proto::ProgramDesc pruned; - std::set feed_var_names = {"x", "a"}; - - f::Prune(*pdesc, feed_var_names, &pruned); - EXPECT_EQ(pruned.blocks_size(), 2); - EXPECT_EQ(pruned.blocks(0).ops_size(), 2); - EXPECT_EQ(pruned.blocks(1).ops_size(), 1); -} diff --git a/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc index 1f840999c07ef..70a5b7b6b7d48 100644 --- a/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc @@ -39,7 +39,6 @@ void analysis::DlnneSubgraphPass::InferShapeForDlnneMainGraph() const { "fetch", "recurrent", "go", - "rnn_memory_helper_grad", "conditional_block", "while", "send", diff --git a/paddle/fluid/operators/rnn_memory_helper_op.cc b/paddle/fluid/operators/rnn_memory_helper_op.cc deleted file mode 100644 index 48a204c10e4be..0000000000000 --- a/paddle/fluid/operators/rnn_memory_helper_op.cc +++ /dev/null @@ -1,184 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class OpDesc; -class Scope; -} // namespace framework -namespace imperative { -class OpBase; -} // namespace imperative -} // namespace paddle - -namespace paddle { -namespace operators { -class RNNMemoryHelperOp : public framework::OperatorBase { - public: - RNNMemoryHelperOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - auto mem_var_name = Input("X"); - auto *mem_var = scope.FindVar(mem_var_name); - PADDLE_ENFORCE_NOT_NULL( - mem_var, - platform::errors::NotFound("Cannot find mem_var: %s in scope.", - mem_var_name)); - - auto out_name = this->Output("Out"); - auto *out_var = scope.FindVar(out_name); - PADDLE_ENFORCE_NOT_NULL(out_var, - platform::errors::NotFound( - "Cannot find out_var: %s in scope.", out_name)); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(dev_place); - - auto *out_tensor = out_var->GetMutable(); - auto &mem_tensor = mem_var->Get(); - framework::TensorCopy(mem_tensor, dev_place, dev_ctx, out_tensor); - out_tensor->set_lod(mem_tensor.lod()); - } -}; - -class RNNMemoryHelperOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "RNNMemoryHelper"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "RNNMemoryHelper"); - - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } -}; - -class RNNMemoryHelperOpInfoMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", ""); - AddOutput("Out", ""); - AddAttr("dtype", - "(int, default 5 (FP32)) " - "Output data type") - .SetDefault(framework::proto::VarType::FP32); - AddComment(""); - } -}; - -class RNNMemoryHelperGradOp : public framework::OperatorBase { - public: - RNNMemoryHelperGradOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - auto out_grad_var_name = Input(framework::GradVarName("Out")); - auto *out_grad_var = scope.FindVar(out_grad_var_name); - - auto in_grad_var_name = Output(framework::GradVarName("X")); - auto *in_grad_var = scope.FindVar(in_grad_var_name); - PADDLE_ENFORCE_NOT_NULL( - in_grad_var, - platform::errors::NotFound("Cannot find in_grad_var: %s in scope.", - in_grad_var_name)); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(dev_place); - - // NOTE(xiongkun03): In standalone executor, after each run, the - // var.tensor.holder will be delete instead of variable. So we need exam the - // IsInitialized(). 
- if (out_grad_var == nullptr || - !out_grad_var->Get().IsInitialized()) { - VLOG(5) << "Using fill constant 0 as starting gradient"; - auto in_var_name = Input("X"); - auto *in_var = scope.FindVar(in_var_name); - auto &in_var_tensor = in_var->Get(); - - framework::AttributeMap attrs; - attrs["dtype"] = framework::TransToProtoVarType(in_var_tensor.dtype()); - attrs["shape"] = phi::vectorize(in_var_tensor.dims()); - attrs["value"] = 0.0f; - - auto zero_op = framework::OpRegistry::CreateOp( - "fill_constant", {}, {{"Out", {in_grad_var_name}}}, attrs); - zero_op->Run(scope, dev_place); - } else { - auto &out_grad_tensor = out_grad_var->Get(); - auto *in_grad_tensor = in_grad_var->GetMutable(); - framework::TensorCopy( - out_grad_tensor, dev_place, dev_ctx, in_grad_tensor); - in_grad_tensor->set_lod(out_grad_tensor.lod()); - } - } -}; - -class RNNMemoryHelperGradOpInfoMaker - : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput(framework::GradVarName("Out"), ""); - AddInput("X", ""); - AddInput("Out", ""); - AddOutput(framework::GradVarName("X"), ""); - AddAttr("dtype", - "(int, default 5 (FP32)) " - "Output data type") - .SetDefault(framework::proto::VarType::FP32); - AddComment(""); - } -}; - -class RNNMemoryHelperGradOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - auto x_grad_name = framework::GradVarName("X"); - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "RNNMemoryHelperGrad"); - OP_INOUT_CHECK(ctx->HasOutput(x_grad_name), - "Output", - x_grad_name, - "RNNMemoryHelperGrad"); - ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*->*/ x_grad_name); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR( - rnn_memory_helper, - paddle::operators::RNNMemoryHelperOp, - paddle::operators::RNNMemoryHelperOpInfoMaker, - paddle::operators::RNNMemoryHelperOpShapeInference, - paddle::framework::DefaultGradOpMaker, - paddle::framework::DefaultGradOpMaker); -REGISTER_OPERATOR(rnn_memory_helper_grad, - paddle::operators::RNNMemoryHelperGradOp, - paddle::operators::RNNMemoryHelperGradOpInfoMaker, - paddle::operators::RNNMemoryHelperGradOpShapeInference); diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake index c58b78cf3bc21..9151e1b4a2c5c 100644 --- a/paddle/fluid/operators/unity_build_rule.cmake +++ b/paddle/fluid/operators/unity_build_rule.cmake @@ -223,7 +223,6 @@ register_unity_group( reverse_op.cc) register_unity_group( cc - rnn_memory_helper_op.cc roi_align_op.cc roll_op.cc run_program_op.cc diff --git a/python/paddle/base/backward.py b/python/paddle/base/backward.py index 6d30823d4bf4a..b3a675882a3a3 100755 --- a/python/paddle/base/backward.py +++ b/python/paddle/base/backward.py @@ -1671,17 +1671,8 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): or var in parent_op_vars ] if not existing_grad_var_ins: - ''' - FIXME(paddle-dev, zengjinle): rnn_memory_helper_grad is used - in recurrent op. The input of this op does not even exist in - the program! Therefore, any dependency analysis would not - work to this op! If I do not add the following code, this op - would be pruned, and the calculation result would be wrong. - Maybe we should re-design this op later... 
- ''' - if op_desc.type() not in ['rnn_memory_helper_grad']: - ops_to_remove.append(op_idx) - continue + ops_to_remove.append(op_idx) + continue # sum may create invalid variable, here to deal with it. if op_desc.type() == 'sum': diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py index 92f06227a0a1d..ec580ba50d246 100644 --- a/python/paddle/base/framework.py +++ b/python/paddle/base/framework.py @@ -2879,7 +2879,6 @@ class Operator: 'fetch', 'recurrent', 'go', - 'rnn_memory_helper_grad', 'conditional_block', 'pylayer', 'while', diff --git a/test/ir/inference/program_config.py b/test/ir/inference/program_config.py index 4516c2cb4ad0c..250b547efca31 100644 --- a/test/ir/inference/program_config.py +++ b/test/ir/inference/program_config.py @@ -113,7 +113,6 @@ def __repr__(self): 'fetch', 'recurrent', 'go', - 'rnn_memory_helper_grad', 'conditional_block', 'static_pylayer', 'while', diff --git a/test/legacy_test/test_rnn_memory_helper_op.py b/test/legacy_test/test_rnn_memory_helper_op.py deleted file mode 100644 index 16a0cccb10d6f..0000000000000 --- a/test/legacy_test/test_rnn_memory_helper_op.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np - -from paddle.base import core -from paddle.base.executor import Executor -from paddle.base.framework import Program - - -class RNNMemoryHelperOpTest(unittest.TestCase): - def setUp(self): - self.program = Program() - self.place = core.CPUPlace() - - self.X = self.program.global_block().create_var( - name='X', shape=[2, 3], dtype='float32' - ) - self.Out = self.program.global_block().create_var( - name='Out', shape=[2, 3], dtype='float32' - ) - self.program.global_block().append_op( - type='rnn_memory_helper', - inputs={"X": self.X}, - outputs={"Out": self.Out}, - attrs={}, - ) - - def test_forward(self): - x_np = np.random.normal(size=(2, 3)).astype("float32") - self.feed_map = {'X': x_np} - self.fetch_list = [self.Out] - exe = Executor(self.place) - out = exe.run( - self.program, feed=self.feed_map, fetch_list=self.fetch_list - ) - np.testing.assert_allclose(out[0], x_np, rtol=1e-05) - - -class RNNMemoryHelperGradOpTest(unittest.TestCase): - def setUp(self): - self.program = Program() - self.place = core.CPUPlace() - - self.input_names = ['X', 'Out', 'Out@GRAD'] - self.input_vars = { - name: self.program.global_block().create_var( - name=name, shape=[2, 3], dtype='float32' - ) - for name in self.input_names - } - - self.output_names = ['X@GRAD'] - self.output_vars = { - name: self.program.global_block().create_var( - name=name, shape=[2, 3], dtype='float32' - ) - for name in self.output_names - } - - self.program.global_block().append_op( - type='rnn_memory_helper_grad', - inputs=self.input_vars, - outputs=self.output_vars, - attrs={}, - ) - - def test_backward(self): - self.feed_map = { - name: np.random.normal(size=(2, 3)).astype("float32") - for name in self.input_names - } - self.fetch_list = [self.output_vars['X@GRAD']] - - exe = Executor(self.place) - out = exe.run( - self.program, feed=self.feed_map, fetch_list=self.fetch_list - ) - np.isclose(out[0], self.feed_map['Out@GRAD'], rtol=1e-5) - - -class RNNMemoryHelperGradOpWithoutInputTest(unittest.TestCase): - def setUp(self): - self.program = Program() - self.fake_program = Program() - self.place = core.CPUPlace() - - self.input_names = ['X', 'Out'] - self.input_vars = { - name: self.program.global_block().create_var( - name=name, shape=[2, 3], dtype='float32' - ) - for name in self.input_names - } - self.input_vars[ - "Out@GRAD" - ] = self.fake_program.global_block().create_var( - name="Out@GRAD", shape=[2, 3], dtype='float32' - ) - - self.output_names = ['X@GRAD'] - self.output_vars = { - name: self.program.global_block().create_var( - name=name, shape=[2, 3], dtype='float32' - ) - for name in self.output_names - } - - self.program.global_block().append_op( - type='rnn_memory_helper_grad', - inputs=self.input_vars, - outputs=self.output_vars, - attrs={}, - ) - - def test_backward(self): - self.feed_map = { - name: np.random.normal(size=(2, 3)).astype("float32") - for name in ['X', 'Out'] - } - self.fetch_list = [self.output_vars['X@GRAD']] - - exe = Executor(self.place) - out = exe.run( - self.program, feed=self.feed_map, fetch_list=self.fetch_list - ) - np.testing.assert_allclose( - out[0], np.zeros(shape=(2, 3)).astype('float32'), rtol=1e-05 - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 8755ef4d13ffb..e5055272e9c94 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -353,7 +353,6 @@ 'rw_lock_test', 'exception_holder_test', 'enforce_test', - 'test_rnn_memory_helper_op', 'ddim_test', 
'test_eager_deletion_padding_rnn', 'test_is_test_pass', @@ -1707,7 +1706,6 @@ 'test_run_fluid_by_module_or_command_line', 'test_rpn_target_assign_op', 'test_row_conv', - 'test_rnn_memory_helper_op', 'test_reshape_transpose_matmul_mkldnn_fuse_pass', 'test_reshape_bf16_op', 'test_require_version', diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 228218e46ecf4..095c2e2646f9f 100755 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -427,7 +427,6 @@ 'test_reverse_op', 'test_rmsprop_op', 'test_rnn_cell_api', - 'test_rnn_memory_helper_op', 'test_roi_align_op', 'test_roi_perspective_transform_op', 'test_roi_pool_op', From db6d4d772f6c9ea4a04a558172b5685f725b128f Mon Sep 17 00:00:00 2001 From: Bo Zhang <105368690+zhangbopd@users.noreply.github.com> Date: Mon, 9 Oct 2023 17:51:12 +0800 Subject: [PATCH 31/62] Standard naming Part 2 (#57939) * split SymbolicDimMgr ShapeComputationIRAnalysis --- .../shape/transforms/shape_optimization.cc | 169 ++++ .../shape/transforms/shape_optimization.h | 52 ++ .../transforms/shape_optimization_pass.cc | 1 + .../shape/utils/shape_optimization_utils.cc | 606 +++++++++++++++ .../shape/utils/shape_optimization_utils.h | 94 +++ paddle/pir/dialect/shape/utils/shape_utils.cc | 734 ------------------ paddle/pir/dialect/shape/utils/shape_utils.h | 129 +-- .../pir/shape_dialect/constraint_pass_test.cc | 1 + 8 files changed, 924 insertions(+), 862 deletions(-) create mode 100644 paddle/pir/dialect/shape/transforms/shape_optimization.cc create mode 100644 paddle/pir/dialect/shape/transforms/shape_optimization.h diff --git a/paddle/pir/dialect/shape/transforms/shape_optimization.cc b/paddle/pir/dialect/shape/transforms/shape_optimization.cc new file mode 100644 index 0000000000000..959d098675b29 --- /dev/null +++ b/paddle/pir/dialect/shape/transforms/shape_optimization.cc @@ -0,0 +1,169 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pir/dialect/shape/transforms/shape_optimization.h" +#include "paddle/pir/dialect/shape/utils/shape_utils.h" + +namespace pir { + +ShapeComputationIRAnalysis::ShapeComputationIRAnalysis(ModuleOp m, + SymbolicDimMgr& mgr) + : m_(m), mgr_(mgr) {} + +bool ShapeComputationIRAnalysis::Run() { + // Make sure only run once. 
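+  // The analysis makes two passes over the module: BuildShapeOnOperation
+  // first materializes SymbolicDim symbols for every ranked value, then
+  // ApplyOpConstraint records equalities between those symbols. Both
+  // passes reuse the region/block/operation walkers below; the per-op
+  // callback is bound with std::bind, which here is equivalent to the
+  // lambda [this](Operation* op) { return BuildShapeOnOperation(op); }.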
+ if (initialized_) return false; + initialized_ = true; + auto buildShapeFunc = + std::bind(&ShapeComputationIRAnalysis::BuildShapeOnOperation, + this, + std::placeholders::_1); + if (!RunOnRegion(&(m_->region(0)), buildShapeFunc)) return false; + auto applyOpConstraintFunc = + std::bind(&ShapeComputationIRAnalysis::ApplyOpConstraint, + this, + std::placeholders::_1); + if (!RunOnRegion(&(m_->region(0)), applyOpConstraintFunc)) return false; + return true; +} + +bool ShapeComputationIRAnalysis::RunOnRegion(Region* region, func fn) { + for (Block* block : *region) { + if (!RunOnBlock(block, fn)) return false; + } + return true; +} + +bool ShapeComputationIRAnalysis::RunOnBlock(Block* block, func fn) { + // TODO(liujinnan): mapping block arguments + + std::vector op_list; + for (Operation* op : *block) op_list.push_back(op); + for (Operation* op : op_list) { + if (!RunOnOperation(op, fn)) return false; + } + return true; +} + +bool ShapeComputationIRAnalysis::RunOnOperation(Operation* op, func fn) { + for (size_t i = 0; i < op->num_regions(); ++i) { + if (!RunOnRegion(&(op->region(i)), fn)) return false; + } + return fn(op); +} + +bool ShapeComputationIRAnalysis::BuildShapeOnOperation(Operation* op) { + if (op->isa()) return true; + if (op->isa()) { + Value value = op->operand_source(0); + std::vector symbols; + if (op->HasAttribute(SymbolicDim::GetSymbolicDimAttrName())) { + auto attrs = + op->attribute(SymbolicDim::GetSymbolicDimAttrName()) + .AsVector(); + for (Attribute attr : attrs) { + auto sym = mgr_.symbolTable().Lookup( + attr.dyn_cast().AsString()); + assert(sym); + SymbolicDim root = mgr_.GetRootSymbolicDim(sym); + symbols.push_back(root); + } + } else { + symbols = mgr_.CreateSymbolicDimsForRankedValue(value); + std::vector attrs; + for (SymbolicDim sym : symbols) { + Attribute rootSymbol = + StrAttribute::get(m_->ir_context(), sym.GetSymName()); + attrs.push_back(rootSymbol); + } + op->set_attribute(SymbolicDim::GetSymbolicDimAttrName(), + ArrayAttribute::get(m_->ir_context(), attrs)); + } + rankedTensor2SymDims_[value] = std::move(symbols); + return true; + } + for (size_t i = 0; i < op->num_results(); ++i) { + if (!BuildShapeOnValue(op->result(i))) return false; + } + return true; +} + +bool ShapeComputationIRAnalysis::BuildShapeOnValue(Value value) { + Type type = value.type(); + if (IsIntOrIndex(type)) { + SymbolicDim sym = mgr_.NewSymbolicDim(); + value2SymDim_[value] = sym; + } else if (IsCandidateShapeTensorType(type)) { + auto shapedTy = type.dyn_cast(); + std::vector symbols; + for (size_t i = 0, d = shapedTy.GetShape()[0]; i < d; ++i) + symbols.push_back(mgr_.NewSymbolicDim()); + shapeTensor2SymDims_[value] = std::move(symbols); + } + return true; +} + +bool ShapeComputationIRAnalysis::ApplyOpConstraint(Operation* op) { + IR_ENFORCE(ApplyIndexOpConstraint(op), + "Fail to apply constraint for index op"); + IR_ENFORCE(ApplyTieShapeOpConstraint(op), + "Fail to apply constraint for tie_shape op"); + + // TODO(zhangbo63): add more constraints + return true; +} + +bool ShapeComputationIRAnalysis::ApplyIndexOpConstraint(Operation* op) { + if (op->num_results() == 0) return true; + + Type type = op->result(0).type(); + if (!IsIntOrIndex(type)) return true; + + if (auto dimOp = op->dyn_cast()) { + int64_t dimIndex = dimOp.index() + .dyn_cast() + .owner() + ->attribute("value") + .data(); + value2SymDim_[dimOp.out()].UpdateKnownNonNegative(true); + if (!mgr_.MapSymbolicDimEqual( + value2SymDim_[dimOp.out()], + rankedTensor2SymDims_[dimOp.source()][dimIndex])) { + return false; 
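+      // (reached when the scalar produced by the dim op cannot be unified
+      // with the symbol tracked for the indexed dimension of its source
+      // tensor, e.g. because they already carry conflicting constant sizes)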
+ } + + } else if (auto constOp = op->dyn_cast()) { + int64_t val = constOp.value().dyn_cast().data(); + if (!mgr_.MapSymbolicDimEqual(value2SymDim_[op->result(0)], + mgr_.NewConstantSymbolicDim(val))) { + return false; + } + } + // TODO(zhangbo63): add support for reifyInferShape. (e.g. mul/add) + return true; +} + +bool ShapeComputationIRAnalysis::ApplyTieShapeOpConstraint(Operation* op) { + if (auto tieShape = op->dyn_cast()) { + auto& value = rankedTensor2SymDims_[op->operand_source(0)]; + for (size_t idx = 0; idx < tieShape.dims().size(); ++idx) { + if (!mgr_.MapSymbolicDimEqual(value2SymDim_[tieShape.dims()[idx]], + value[idx])) + return false; + mgr_.GetRootSymbolicDim(value[idx]).UpdateKnownNonNegative(true); + } + } + return true; +} +} // namespace pir diff --git a/paddle/pir/dialect/shape/transforms/shape_optimization.h b/paddle/pir/dialect/shape/transforms/shape_optimization.h new file mode 100644 index 0000000000000..ba711f288a770 --- /dev/null +++ b/paddle/pir/dialect/shape/transforms/shape_optimization.h @@ -0,0 +1,52 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/pir/dialect/shape/utils/shape_optimization_utils.h" +#include "paddle/pir/dialect/shape/utils/symbol_table.h" + +namespace pir { +class ShapeComputationIRAnalysis { + public: + using func = std::function; + explicit ShapeComputationIRAnalysis(ModuleOp m, + SymbolicDimMgr& mgr); // NOLINT + bool Run(); + + private: + bool RunOnRegion(Region* region, func fn); + bool RunOnBlock(Block* block, func fn); + bool RunOnOperation(Operation* op, func fn); + + bool BuildShapeOnOperation(Operation* op); + bool BuildShapeOnValue(Value value); + + bool ApplyOpConstraint(Operation* op); + bool ApplyIndexOpConstraint(Operation* op); + bool ApplyTieShapeOpConstraint(Operation* op); + + bool initialized_ = false; + ModuleOp m_; + SymbolicDimMgr& mgr_; + + std::unordered_map value2SymDim_; + + // shape tensor is the 1D ranked tensor with int/index dtype. 
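+  // (e.g. the result of a shape-of style op; BuildShapeOnValue assigns an
+  // independent SymbolicDim to each of its elements)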
+ std::unordered_map> shapeTensor2SymDims_; + + std::unordered_map> rankedTensor2SymDims_; +}; + +} // namespace pir diff --git a/paddle/pir/dialect/shape/transforms/shape_optimization_pass.cc b/paddle/pir/dialect/shape/transforms/shape_optimization_pass.cc index 6bbb918ebc1f1..f9316f3682aa3 100644 --- a/paddle/pir/dialect/shape/transforms/shape_optimization_pass.cc +++ b/paddle/pir/dialect/shape/transforms/shape_optimization_pass.cc @@ -18,6 +18,7 @@ #include "paddle/pir/core/builtin_op.h" #include "paddle/pir/core/program.h" +#include "paddle/pir/dialect/shape/transforms/shape_optimization.h" #include "paddle/pir/dialect/shape/utils/shape_utils.h" #include "paddle/pir/pass/pass.h" #include "paddle/pir/pass/pass_manager.h" diff --git a/paddle/pir/dialect/shape/utils/shape_optimization_utils.cc b/paddle/pir/dialect/shape/utils/shape_optimization_utils.cc index 35776be4f5325..07f7cf4129a4d 100644 --- a/paddle/pir/dialect/shape/utils/shape_optimization_utils.cc +++ b/paddle/pir/dialect/shape/utils/shape_optimization_utils.cc @@ -13,3 +13,609 @@ // limitations under the License. #include "paddle/pir/dialect/shape/utils/shape_optimization_utils.h" +#include "paddle/pir/core/builtin_type.h" +#include "paddle/pir/dialect/shape/utils/symbol_table.h" + +namespace pir { + +bool CompareSymbolicDimNames(const std::string& lhs, const std::string& rhs) { + // S -> Symbol : unknown dimension size at compile time + // C -> Constant : constant dimension size at compile time + if (lhs.size() < 1 || (lhs[0] != 'S' && lhs[0] != 'C')) return lhs < rhs; + if (rhs.size() < 1 || (rhs[0] != 'S' && rhs[0] != 'C')) return lhs < rhs; + int64_t lhs_idx = 0, rhs_idx = 0; + try { + lhs_idx = stol(lhs.substr(1)); + rhs_idx = stol(rhs.substr(1)); + } catch (const std::exception& e) { + IR_THROW("Invalid symbolic name"); + } + return (lhs[0] < rhs[0]) || (lhs[0] == rhs[0] && lhs_idx < rhs_idx); +} + +// Gives a consistent order of a list op SymbolicDimProducts +bool CompareSymbolicDimProduct(SymbolicDimProduct& lhs, // NOLINT + SymbolicDimProduct& rhs) { // NOLINT + if (lhs.symbols.size() < rhs.symbols.size()) return true; + if (lhs.symbols.size() == rhs.symbols.size()) { + for (size_t idx = 0; idx < lhs.symbols.size(); ++idx) { + const std::string lhs_name = lhs.symbols[idx].GetSymName(); + const std::string rhs_name = rhs.symbols[idx].GetSymName(); + if (CompareSymbolicDimNames(lhs_name, rhs_name)) return true; + if (lhs_name != rhs_name) return false; + } + } + return false; +} + +SymbolicDimMgr::SymbolicDimMgr(ModuleOp m) : m_(m) { + for (auto op : *(m.block())) { + if (op->isa()) { + symbol_table_ = SymbolTable(op); + return; + } + } + Builder builder = Builder(m_.ir_context(), m_.block(), m_.block()->begin()); + dialect::FuncOp func = builder.Build(); + symbol_table_ = SymbolTable(func); +} + +bool SymbolicDimMgr::Load() { + auto func_op = symbol_table_.getOp()->dyn_cast(); + assert(func_op); + for (auto op : *(func_op.block())) { + symbol_table_.insert(op); + if (SymbolicDim sym_dim_op = op->dyn_cast()) { + symbol_dim_union_set_[sym_dim_op] = sym_dim_op; + symbol_name_set_.insert(sym_dim_op.GetSymName()); + } + } + return LoadShapeConstraintGraph(); +} + +bool SymbolicDimMgr::LoadShapeConstraintGraph() { + // TODO(liujinnan): add more constraint function. currently, only support + // tie_product_equal. 
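+  // A tie_product_equal op encodes an equality of two symbol products of
+  // the form factor * S_i * S_j * ...; for instance 2 * S0 * S1 == 4 * S2
+  // is stored as lhs = {factor: 2, symbols: [S0, S1]} and
+  // rhs = {factor: 4, symbols: [S2]}. build_sym_product below folds
+  // ConstantOps into the factor and DimOps into the symbol list.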
+  auto constraint_vec =
+      symbol_table_.Lookup("tie_product_equal");
+
+  if (!constraint_vec.size()) return true;
+
+  auto build_sym_product = [&](std::vector range,
+                               SymbolicDimProduct& product) {
+    for (Value v : range) {
+      auto defining_op = v.dyn_cast().owner();
+      if (auto constOp = defining_op->dyn_cast()) {
+        product.factor *= constOp.value().dyn_cast().data();
+        continue;
+      } else if (auto dimOp = defining_op->dyn_cast()) {
+        auto sym = symbol_table_.Lookup(dimOp.getName());
+        if (!sym) return false;
+        product.symbols.push_back(sym);
+        continue;
+      }
+      return false;
+    }
+    return true;
+  };
+
+  for (auto op : constraint_vec) {
+    SymbolicDimProduct lhs, rhs;
+    if (!build_sym_product(op.lhs(), lhs) ||
+        !build_sym_product(op.rhs(), rhs) ||
+        !MapSymbolicDimProductEqual(lhs, rhs))
+      return false;
+  }
+  return true;
+}
+
+bool SymbolicDimMgr::MapSymbolicDimProductEqual(const SymbolicDimProduct& lhs,
+                                                const SymbolicDimProduct& rhs) {
+  SymbolicDimProduct new_lhs, new_rhs;
+  std::tie(new_lhs, new_rhs) = SimplifySymbolicDimProductPair(lhs, rhs);
+
+  // Return true for identity case.
+  if (new_lhs == new_rhs) return true;
+
+  if (new_lhs.factor == new_rhs.factor && new_lhs.symbols.size() == 1 &&
+      new_rhs.symbols.size() == 1) {
+    return MapSymbolicDimEqual(new_lhs.symbols[0], new_rhs.symbols[0]);
+  } else if (new_lhs.symbols.size() == 0 && new_rhs.symbols.size() == 1 &&
+             new_rhs.factor == 1) {
+    return MapSymbolicDimEqual(NewConstantSymbolicDim(new_lhs.factor),
+                               new_rhs.symbols[0]);
+  } else if (new_rhs.symbols.size() == 0 && new_lhs.symbols.size() == 1 &&
+             new_lhs.factor == 1) {
+    return MapSymbolicDimEqual(NewConstantSymbolicDim(new_rhs.factor),
+                               new_lhs.symbols[0]);
+  }
+
+  product_equality_map_[new_lhs][new_rhs] =
+      product_equality_map_[new_rhs][new_lhs] = true;
+
+  product_equality_map_updated_ = false;
+  return true;
+}
+
+SymbolicDimProduct SymbolicDimMgr::SimplifySymbolicDimProduct(
+    const SymbolicDimProduct& x) {
+  std::vector copied;
+  copied.reserve(x.symbols.size());
+  for (SymbolicDim op : x.symbols) copied.push_back(GetRootSymbolicDim(op));
+
+  std::sort(
+      copied.begin(), copied.end(), [&](SymbolicDim lhs, SymbolicDim rhs) {
+        return CompareSymbolicDimNames(lhs.GetSymName(), rhs.GetSymName());
+      });
+  SymbolicDimProduct new_x;
+  new_x.factor = x.factor;
+  for (SymbolicDim op : copied) {
+    if (!op.IsDynamic()) {
+      new_x.factor *= op.GetDimSize();
+    } else {
+      new_x.symbols.push_back(op);
+    }
+  }
+  return new_x;
+}
+
+std::pair
+SymbolicDimMgr::SimplifySymbolicDimProductPair(const SymbolicDimProduct& x,
+                                               const SymbolicDimProduct& y) {
+  // First do some basic cleanup (e.g. folding const symbolic dim ops into
+  // the factor field).
+  auto lhs = SimplifySymbolicDimProduct(x);
+  auto rhs = SimplifySymbolicDimProduct(y);
+
+  SymbolicDimProduct new_lhs, new_rhs;
+  int64_t gcd_factor = std::gcd(std::abs(lhs.factor), std::abs(rhs.factor));
+
+  // 0 * lhs_symbols = 0 * rhs_symbols carries no more information.
+  // Just return empty new_lhs & new_rhs.
+  if (!gcd_factor)
+    return std::make_pair(std::move(new_lhs), std::move(new_rhs));
+
+  // Canonicalize the factor form: always make the smaller factor a positive
+  // number.
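+  // Worked example: lhs = 4 * S0 * S1 and rhs = 6 * S0, with S0 known
+  // non-zero, give gcd_factor = 2; the pair then simplifies to
+  // new_lhs = 2 * S1 and new_rhs = 3 once the shared S0 is cancelled
+  // from both sides below.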
+ if (std::abs(lhs.factor) < std::abs(rhs.factor)) { + if (lhs.factor < 0) gcd_factor = -gcd_factor; + } else { + if (rhs.factor < 0) gcd_factor = -gcd_factor; + } + + new_lhs.factor = lhs.factor / gcd_factor; + new_rhs.factor = rhs.factor / gcd_factor; + + std::unordered_map lhs_symbol_map; + std::unordered_map rhs_symbol_map; + + for (SymbolicDim op : lhs.symbols) ++lhs_symbol_map[op]; + for (SymbolicDim op : rhs.symbols) ++rhs_symbol_map[op]; + + for (SymbolicDim op : lhs.symbols) { + auto it = rhs_symbol_map.find(op); + if (it != rhs_symbol_map.end() && op.GetKnownNonSizeZero()) { + if (--it->second == 0) rhs_symbol_map.erase(it); + continue; + } + new_lhs.symbols.push_back(op); + } + + for (SymbolicDim op : rhs.symbols) { + auto it = lhs_symbol_map.find(op); + if (it != lhs_symbol_map.end() && op.GetKnownNonSizeZero()) { + if (--it->second == 0) lhs_symbol_map.erase(it); + continue; + } + new_rhs.symbols.push_back(op); + } + + if (!new_lhs.factor) new_lhs.symbols.clear(); + if (!new_rhs.factor) new_rhs.symbols.clear(); + + return std::make_pair(std::move(new_lhs), std::move(new_rhs)); +} + +const std::string SymbolicDimMgr::GetNextName() { + std::string name; + do { + name = "S" + std::to_string(next_symbolic_idx_++); + } while (!symbol_name_set_.insert(name).second); + return name; +} + +SymbolicDim SymbolicDimMgr::NewSymbolicDim(const std::string& name) { + auto func_op = symbol_table_.getOp()->dyn_cast(); + assert(func_op); + Builder builder = Builder(m_.ir_context(), func_op.block()); + // default settting dim != 0 + dialect::SymbolicDim symbol = + builder.Build(name.empty() ? GetNextName() : name, + ShapedTypeInterface::kDynamic, + false, + false, + false, + true); + symbol_dim_union_set_[symbol] = symbol; + symbol_table_.insert(symbol); + return symbol; +} + +SymbolicDim SymbolicDimMgr::NewConstantSymbolicDim(int64_t val) { + auto it = constant_symbolic_dim_map_.find(val); + if (it == constant_symbolic_dim_map_.end()) { + auto name = "C" + std::to_string(val); + it = constant_symbolic_dim_map_ + .insert(std::make_pair(val, NewSymbolicDim(name))) + .first; + it->second.SetDimSize(val); + if (val == -1) it->second.UpdateKnownNegativeOne(true); + if (val >= 0) it->second.UpdateKnownNonNegative(true); + if (val != 1) it->second.UpdateKnownNonSizeOne(true); + if (val != 0) it->second.UpdateKnownNonSizeZero(true); + } + return GetRootSymbolicDim(it->second); +} + +std::vector SymbolicDimMgr::CreateSymbolicDimsForRankedValue( + Value value) { + std::vector symbols; + auto dims = value.type().dyn_cast().dims(); + for (int idx = 0; idx < dims.size(); ++idx) { + symbols.push_back(dims[idx] == ShapedTypeInterface::kDynamic + ? 
NewSymbolicDim() + : NewConstantSymbolicDim(dims[idx])); + } + return symbols; +} + +SymbolicDim SymbolicDimMgr::GetRootSymbolicDim(SymbolicDim symbol) { + SymbolicDim current = symbol; + std::vector path; + while (symbol_dim_union_set_[current] != current) { + path.push_back(current); + current = symbol_dim_union_set_[current]; + } + for (SymbolicDim sym : path) symbol_dim_union_set_[sym] = current; + return current; +} + +bool SymbolicDimMgr::IsSymbolicDimEqual(SymbolicDim lhs, SymbolicDim rhs) { + SymbolicDim lhs_root = GetRootSymbolicDim(lhs); + SymbolicDim rhs_root = GetRootSymbolicDim(rhs); + return lhs_root == rhs_root; +} + +bool SymbolicDimMgr::MapSymbolicDimEqual(SymbolicDim lhs, SymbolicDim rhs) { + SymbolicDim lhs_root = GetRootSymbolicDim(lhs); + SymbolicDim rhs_root = GetRootSymbolicDim(rhs); + + if (lhs_root != rhs_root) { + if (CompareSymbolicDimNames(lhs_root.GetSymName(), rhs_root.GetSymName())) { + if (!lhs_root.Merge(rhs_root)) return false; + symbol_dim_union_set_[rhs_root] = lhs_root; + } else { + if (!rhs_root.Merge(lhs_root)) return false; + symbol_dim_union_set_[lhs_root] = rhs_root; + } + product_equality_map_updated_ = false; + } + return true; +} + +SymbolicDimProduct* SymbolicDimMgr::SymbolicDimProductDivide( + const SymbolicDimProduct& lhs, const SymbolicDimProduct& rhs) { + SymbolicDimProduct new_lhs, new_rhs; + std::tie(new_lhs, new_rhs) = SimplifySymbolicDimProductPair(lhs, rhs); + + if (new_lhs.factor == 0 || new_rhs.factor == 0) return nullptr; + if (new_lhs.factor % new_rhs.factor != 0) return nullptr; + if (new_lhs.symbols.size() < new_rhs.symbols.size()) return nullptr; + + SymbolicDimProduct* result = new SymbolicDimProduct(); + result->factor = new_lhs.factor / new_rhs.factor; + + std::unordered_map sym_proc_map; + for (SymbolicDim sym : new_rhs.symbols) ++sym_proc_map[sym]; + + for (SymbolicDim sym : new_lhs.symbols) { + auto it = sym_proc_map.find(sym); + if (it == sym_proc_map.end()) { + result->symbols.push_back(sym); + continue; + } + if (--it->second == 0) { + sym_proc_map.erase(it); + continue; + } + } + + if (!sym_proc_map.empty()) return nullptr; + return result; +} + +bool SymbolicDimMgr::IsMultipleOfKnownSymbolicDimProductEqualPair( + const SymbolicDimProduct& lhs, const SymbolicDimProduct& rhs) { + for (auto& pair_outter : product_equality_map_) { + const SymbolicDimProduct& x = pair_outter.first; + auto factor_x = SymbolicDimProductDivide(lhs, x); + if (!factor_x) continue; + for (auto& pair_inner : pair_outter.second) { + if (!pair_inner.second) continue; + const SymbolicDimProduct& y = pair_inner.first; + auto factor_y = SymbolicDimProductDivide(rhs, y); + if (!factor_y || (*factor_x) != (*factor_y)) continue; + return true; + } + } + + return false; +} + +bool SymbolicDimMgr::UpdateProductEqualityMap() { + // Return true if nothing is updated. 
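+  // Rebuilding the closure proceeds in three steps: re-simplify every
+  // recorded pair, run a union-find over small integer indices (hashing
+  // SymbolicDimProduct directly is too expensive to union on), and then
+  // keep x == y only where IsMultipleOfKnownSymbolicDimProductEqualPair
+  // can still justify the equality.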
+ if (product_equality_map_updated_) return true; + + SymbolicDimProductMap new_map; + std::unordered_set product_set; + for (auto& pair_outter : product_equality_map_) { + const SymbolicDimProduct& x = pair_outter.first; + for (auto& pair_inner : pair_outter.second) { + if (!pair_inner.second) continue; + + const SymbolicDimProduct& y = pair_inner.first; + SymbolicDimProduct new_x, new_y; + std::tie(new_x, new_y) = SimplifySymbolicDimProductPair(x, y); + if (new_x == new_y) continue; + + new_map[new_x][new_y] = new_map[new_y][new_x] = true; + product_set.insert(new_x); + product_set.insert(new_y); + } + } + // hash function of SymbolicDimProduct is expensive, thus we map it to integer + // domain first. + std::unordered_map symProd2Idx; + std::vector idx2SymProd(product_set.size()); + std::vector idx2root(product_set.size()); + for (auto& x : product_set) { + size_t idx = symProd2Idx.size(); + symProd2Idx[&x] = idx; + idx2SymProd[idx] = &x; + idx2root[idx] = idx; + } + + auto getRootIdx = [&](size_t root) { + std::vector path; + while (idx2root[root] != root) { + path.push_back(root); + root = idx2root[root]; + } + for (size_t idx : path) idx2root[idx] = root; + return root; + }; + + for (size_t x = 0; x < symProd2Idx.size(); ++x) { + auto& xProd = *idx2SymProd[x]; + auto& rowMap = new_map[xProd]; + size_t xRoot = getRootIdx(x); + for (size_t y = x; y < symProd2Idx.size(); ++y) { + auto& yProd = *idx2SymProd[y]; + if (!rowMap[yProd]) continue; + idx2root[getRootIdx(y)] = xRoot; + } + } + + for (size_t x = 0; x < symProd2Idx.size(); ++x) + for (size_t y = x; y < symProd2Idx.size(); ++y) { + if (getRootIdx(x) != getRootIdx(y)) continue; + auto& xSymProd = *idx2SymProd[x]; + auto& ySymProd = *idx2SymProd[y]; + + new_map[xSymProd][ySymProd] = new_map[ySymProd][xSymProd] = true; + } + + product_equality_map_ = std::move(new_map); + + for (auto& x : product_set) + for (auto& y : product_set) { + if (!product_equality_map_[x][y]) continue; + product_equality_map_[x][y] = product_equality_map_[y][x] = false; + if (!IsMultipleOfKnownSymbolicDimProductEqualPair(x, y)) { + product_equality_map_[x][y] = product_equality_map_[y][x] = true; + } + } + + std::unordered_set toRemove; + for (auto& x : product_set) { + if (std::all_of(product_set.begin(), + product_set.end(), + [&](const SymbolicDimProduct& y) { + return !product_equality_map_[x][y]; + })) { + toRemove.insert(x); + } + } + + for (auto& x : toRemove) { + product_equality_map_.erase(x); + } + + product_equality_map_updated_ = true; + return true; +} + +bool SymbolicDimMgr::IsSymbolicDimProductEqual(const SymbolicDimProduct& lhs, + const SymbolicDimProduct& rhs) { + SymbolicDimProduct new_lhs, new_rhs; + std::tie(new_lhs, new_rhs) = SimplifySymbolicDimProductPair(lhs, rhs); + + // Return true for identity case. 
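+  // (non-identical pairs are decided against the lazily rebuilt
+  // transitive closure of all recorded product equalities)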
+ if (new_lhs == new_rhs) return true; + IR_ENFORCE(UpdateProductEqualityMap(), "Update product equality map failed."); + return IsMultipleOfKnownSymbolicDimProductEqualPair(new_lhs, new_rhs); +} + +bool SymbolicDimMgr::Save() { + using Name2SymbolFn = std::function; + auto update_attrs = [&](ArrayAttribute attrs, Name2SymbolFn fn) { + std::vector new_attrs; + for (Attribute attr : attrs.AsVector()) { + auto sym = fn(attr.dyn_cast().AsString()); + assert(sym); + SymbolicDim root = GetRootSymbolicDim(sym); + Attribute root_symbol = + StrAttribute::get(m_->ir_context(), root.GetSymName()); + new_attrs.push_back(root_symbol); + } + return ArrayAttribute::get(m_->ir_context(), new_attrs); + }; + + // TODO(liujinnan): update attributes attached in DenseTensorType + for (auto op : *(m_.block())) { + if (!op->HasAttribute(SymbolicDim::GetSymbolicDimAttrName())) continue; + auto attrs = + op->attribute(SymbolicDim::GetSymbolicDimAttrName()); + auto symbolic_shape_attr = + update_attrs(attrs, [&](const std::string& name) { + return symbol_table_.Lookup(name); + }); + op->set_attribute(SymbolicDim::GetSymbolicDimAttrName(), + symbolic_shape_attr); + } + if (!UpdateProductEqualityMap()) { + return false; + } + std::unordered_set used_symbolic_ops; + std::vector used_symbol_names; + // TODO(liujinnan): collect uses in value. + auto collect_used_symbols = [&](ArrayAttribute attrs) { + for (Attribute attr : attrs.AsVector()) { + auto sym = symbol_table_.Lookup( + attr.dyn_cast().AsString()); + assert(sym); + if (used_symbolic_ops.insert(sym).second) + used_symbol_names.push_back(sym.GetSymName()); + } + }; + for (auto op : *(m_.block())) { + if (!op->HasAttribute(SymbolicDim::GetSymbolicDimAttrName())) continue; + auto attrs = + op->attribute(SymbolicDim::GetSymbolicDimAttrName()); + collect_used_symbols(attrs); + } + auto func_op = symbol_table_.getOp()->dyn_cast(); + assert(func_op); + for (auto& p : symbol_dim_union_set_) { + if (!used_symbolic_ops.count(p.first)) { + func_op.block()->erase(*(p.first.operation())); + } + } + + std::vector candidates; + for (auto& outter : product_equality_map_) { + if (std::any_of( + outter.first.symbols.begin(), + outter.first.symbols.end(), + [&](SymbolicDim sym) { return used_symbolic_ops.count(sym) == 0; })) + candidates.push_back(outter.first); + } + + for (auto& prod : candidates) product_equality_map_.erase(prod); + for (auto& outter : product_equality_map_) { + std::vector candidates; + for (auto& inner : outter.second) { + if (std::any_of(inner.first.symbols.begin(), + inner.first.symbols.end(), + [&](SymbolicDim sym) { + return used_symbolic_ops.count(sym) == 0; + })) + candidates.push_back(outter.first); + } + for (auto& prod : candidates) outter.second.erase(prod); + } + + std::sort(used_symbol_names.begin(), + used_symbol_names.end(), + [&](const std::string& lhs, const std::string& rhs) { + return CompareSymbolicDimNames(lhs, rhs); + }); + int non_const_dims_num = 0; + std::unordered_map name_mapping; + for (const auto& name : used_symbol_names) { + if (name.size() > 0 && name[0] == 'C') { + name_mapping[name] = name; + } else { + name_mapping[name] = ("S" + std::to_string(non_const_dims_num++)); + } + } + + std::unordered_map name_to_symbol; + for (SymbolicDim op : used_symbolic_ops) { + auto name = op.GetSymName(); + op.SetSymName(name_mapping[name]); + name_to_symbol[name] = op; + } + + for (auto op : *(m_.block())) { + if (!op->HasAttribute(SymbolicDim::GetSymbolicDimAttrName())) continue; + auto attrs = + 
op->attribute(SymbolicDim::GetSymbolicDimAttrName()); + auto symbolic_shape_attr = update_attrs( + attrs, [&](const std::string& name) { return name_to_symbol[name]; }); + op->set_attribute(SymbolicDim::GetSymbolicDimAttrName(), + symbolic_shape_attr); + } + + // TODO(liujinnan): update attributes attached to values. + + return SaveShapeConstraintGraph(); +} + +bool SymbolicDimMgr::SaveShapeConstraintGraph() { + auto func_op = symbol_table_.getOp()->dyn_cast(); + assert(func_op); + auto op_it = func_op.block()->rbegin(); + while (op_it != func_op.block()->rend()) { + if (((*op_it)->isa()) || + ((*op_it)->isa())) + op_it++; + else + op_it = decltype(op_it)(func_op.block()->erase(*(*op_it))); + } + + // save product equal predicate + Builder builder = Builder(m_->ir_context(), func_op.block()); + auto build_operands = [&](const SymbolicDimProduct& prod) { + std::vector values; + + if (prod.factor != 1) { + values.push_back( + builder + .Build( + Int32Attribute::get(m_->ir_context(), prod.factor), + Int32Type::get(m_->ir_context())) + ->result(0)); + } + for (SymbolicDim sym : prod.symbols) { + values.push_back(builder.Build(sym.GetSymName()).out()); + } + return values; + }; + std::vector sorted_product_vec; + for (auto& p : product_equality_map_) sorted_product_vec.push_back(p.first); + std::sort(sorted_product_vec.begin(), + sorted_product_vec.end(), + CompareSymbolicDimProduct); + for (auto& x : sorted_product_vec) { + for (auto& y : sorted_product_vec) { + if (!CompareSymbolicDimProduct(x, y)) continue; + if (!product_equality_map_[x][y]) continue; + auto lhs_operands = build_operands(x); + auto rhs_operands = build_operands(y); + builder.Build(lhs_operands, rhs_operands); + } + } + return true; +} +} // namespace pir diff --git a/paddle/pir/dialect/shape/utils/shape_optimization_utils.h b/paddle/pir/dialect/shape/utils/shape_optimization_utils.h index 7f31a4fb55cf1..fdec957aa6be7 100644 --- a/paddle/pir/dialect/shape/utils/shape_optimization_utils.h +++ b/paddle/pir/dialect/shape/utils/shape_optimization_utils.h @@ -13,3 +13,97 @@ // limitations under the License. 
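The symbol_dim_union_set_ map declared in this header, together with
GetRootSymbolicDim and MapSymbolicDimEqual, implements a union-find with
path compression over SymbolicDim handles. A minimal standalone sketch of
the same idea, where plain ints stand in for SymbolicDim and the
UnionFind/Find/Union names are illustrative rather than part of this patch:

    #include <cassert>
    #include <unordered_map>

    // Parent pointers live in a map; a root is an element that is its own
    // parent, mirroring symbol_dim_union_set_[sym] == sym.
    struct UnionFind {
      std::unordered_map<int, int> parent;

      int Find(int x) {
        if (!parent.count(x)) parent[x] = x;  // lazily create singletons
        int root = x;
        while (parent[root] != root) root = parent[root];
        while (parent[x] != root) {  // path compression, as in
          int next = parent[x];      // GetRootSymbolicDim
          parent[x] = root;
          x = next;
        }
        return root;
      }

      void Union(int a, int b) { parent[Find(a)] = Find(b); }
    };

    int main() {
      UnionFind uf;
      uf.Union(0, 1);                    // declare S0 == S1
      uf.Union(1, 2);                    // declare S1 == S2
      assert(uf.Find(0) == uf.Find(2));  // S0 == S2 follows transitively
      return 0;
    }

The real manager additionally picks which root survives a merge via
CompareSymbolicDimNames, so constant dims ('C' names) win over symbolic
ones ('S' names) and the class representative stays deterministic.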
#pragma once +#include +#include "paddle/pir/dialect/shape/utils/symbol_table.h" + +namespace pir { +using dialect::SymbolicDim; + +struct SymbolicDimProduct { + std::vector symbols; + int64_t factor = 1; + bool empty() { return factor == 1 && symbols.empty(); } + friend inline bool operator==(const SymbolicDimProduct& lhs, + const SymbolicDimProduct& rhs) { + return lhs.factor == rhs.factor && lhs.symbols == rhs.symbols; + } + + friend inline bool operator!=(const SymbolicDimProduct& lhs, + const SymbolicDimProduct& rhs) { + return !(lhs == rhs); + } +}; + +struct SymDimHasher { + size_t operator()(const dialect::SymbolicDim& symbol) const noexcept { + return std::hash{}(symbol.operation()); + } +}; + +struct SymProductHasher { + size_t operator()(const SymbolicDimProduct& symProd) const noexcept { + size_t hash = std::hash{}(symProd.symbols.size()); + for (auto& symbol : symProd.symbols) { + hash = hash_combine(hash, SymDimHasher{}(symbol)); // NOLINT + } + hash = hash_combine(hash, std::hash{}(symProd.factor)); + return hash; + } +}; + +class SymbolicDimMgr { + public: + explicit SymbolicDimMgr(ModuleOp m); + bool Load(); + SymbolicDim NewSymbolicDim(const std::string& name = {}); + SymbolicDim NewConstantSymbolicDim(int64_t val); + std::vector CreateSymbolicDimsForRankedValue(Value value); + SymbolicDim GetRootSymbolicDim(SymbolicDim symbol); + bool IsSymbolicDimEqual(SymbolicDim lhs, SymbolicDim rhs); + bool MapSymbolicDimEqual(SymbolicDim lhs, SymbolicDim rhs); + SymbolicDimProduct SimplifySymbolicDimProduct(const SymbolicDimProduct& x); + std::pair + SimplifySymbolicDimProductPair(const SymbolicDimProduct& x, + const SymbolicDimProduct& y); + SymbolicDimProduct* SymbolicDimProductDivide(const SymbolicDimProduct& x, + const SymbolicDimProduct& y); + bool Save(); + bool IsSymbolicDimProductEqual(const SymbolicDimProduct& lhs, + const SymbolicDimProduct& rhs); + + bool MapSymbolicDimProductEqual(const SymbolicDimProduct& lhs, + const SymbolicDimProduct& rhs); + SymbolTable& symbolTable() { return symbol_table_; } + + private: + const std::string GetNextName(); + bool SaveShapeConstraintGraph(); + bool LoadShapeConstraintGraph(); + bool UpdateProductEqualityMap(); + bool IsMultipleOfKnownSymbolicDimProductEqualPair( + const SymbolicDimProduct& lhs, const SymbolicDimProduct& rhs); + + private: + ModuleOp m_; + + SymbolTable symbol_table_; + + int64_t next_symbolic_idx_ = 0; + + std::unordered_set symbol_name_set_; + + std::unordered_map + symbol_dim_union_set_; + + std::unordered_map constant_symbolic_dim_map_; + + // product_equality_map_[A][B] == true : Product[A] == Product[B] + using SymbolicDimProductMap = std::unordered_map< + SymbolicDimProduct, + std::unordered_map, + SymProductHasher>; + SymbolicDimProductMap product_equality_map_; + bool product_equality_map_updated_ = true; +}; + +} // namespace pir diff --git a/paddle/pir/dialect/shape/utils/shape_utils.cc b/paddle/pir/dialect/shape/utils/shape_utils.cc index ad2cc1d956918..4e4c87ed30f86 100644 --- a/paddle/pir/dialect/shape/utils/shape_utils.cc +++ b/paddle/pir/dialect/shape/utils/shape_utils.cc @@ -129,740 +129,6 @@ bool ShapeConstraintIRAnalysis::IsProductEqual(Value lhs, return mgr_.IsSymbolicDimProductEqual(lhs_prod, rhs_prod); } -// Gives a consistent order of a list op SymbolicDim Ops -bool CompareSymbolicDimNames(const std::string& lhs, const std::string& rhs) { - // S -> unknown dimension size at compile time - // C -> constant dimension size at compile time - if (lhs.size() < 1 || (lhs[0] != 'S' && lhs[0] != 'C')) 
return lhs < rhs; - if (rhs.size() < 1 || (rhs[0] != 'S' && rhs[0] != 'C')) return lhs < rhs; - int64_t lhs_idx = 0, rhs_idx = 0; - try { - lhs_idx = stol(lhs.substr(1)); - rhs_idx = stol(rhs.substr(1)); - } catch (const std::exception& e) { - IR_THROW("Invalid symbolic name"); - } - return (lhs[0] < rhs[0]) || (lhs[0] == rhs[0] && lhs_idx < rhs_idx); -} - -// Gives a consistent order of a list op SymbolicDimProducts -bool CompareSymbolicDimProduct(SymbolicDimProduct& lhs, // NOLINT - SymbolicDimProduct& rhs) { // NOLINT - if (lhs.symbols.size() < rhs.symbols.size()) return true; - if (lhs.symbols.size() == rhs.symbols.size()) { - for (size_t idx = 0; idx < lhs.symbols.size(); ++idx) { - const std::string lhs_name = lhs.symbols[idx].GetSymName(); - const std::string rhs_name = rhs.symbols[idx].GetSymName(); - if (CompareSymbolicDimNames(lhs_name, rhs_name)) return true; - if (lhs_name != rhs_name) return false; - } - } - return false; -} - -bool SymbolicDimMgr::Load() { - auto func_op = symbol_table_.getOp()->dyn_cast(); - assert(func_op); - for (auto op_ : *(func_op.block())) { - symbol_table_.insert(op_); - if (SymbolicDim op = op_->dyn_cast()) { - symbolDimUnionSet_[op] = op; - symbolNameSet_.insert(op.GetSymName()); - } - } - return LoadShapeConstraintGraph(); -} - -bool SymbolicDimMgr::LoadShapeConstraintGraph() { - // TODO(liujinnan): add more constraint function. currently, only support - // tie_product_equal. - auto constraint_vec = - symbol_table_.Lookup("tie_product_equal"); - - if (!constraint_vec.size()) return true; - - auto build_sym_product = [&](std::vector range, - SymbolicDimProduct& product) { - for (Value v : range) { - auto definingOp = v.dyn_cast().owner(); - if (auto constOp = definingOp->dyn_cast()) { - product.factor *= constOp.value().dyn_cast().data(); - continue; - } else if (auto dimOp = definingOp->dyn_cast()) { - auto sym = symbol_table_.Lookup(dimOp.getName()); - if (!sym) return false; - product.symbols.push_back(sym); - continue; - } - return false; - } - return true; - }; - - for (auto op : constraint_vec) { - SymbolicDimProduct lhs, rhs; - if (!build_sym_product(op.lhs(), lhs) || - !build_sym_product(op.rhs(), rhs) || - !MapSymbolicDimProductEqual(lhs, rhs)) - return false; - } - return true; -} - -bool SymbolicDimMgr::MapSymbolicDimProductEqual(const SymbolicDimProduct& lhs, - const SymbolicDimProduct& rhs) { - SymbolicDimProduct new_lhs, new_rhs; - std::tie(new_lhs, new_rhs) = SimplifySymbolicDimProductPair(lhs, rhs); - - // early return for identity case. 
- if (new_lhs == new_rhs) return true; - - if (new_lhs.factor == new_rhs.factor && new_lhs.symbols.size() == 1 && - new_rhs.symbols.size() == 1) { - return MapSymbolicDimEqual(new_lhs.symbols[0], new_rhs.symbols[0]); - } else if (new_lhs.symbols.size() == 0 && new_rhs.symbols.size() == 1 && - new_rhs.factor == 1) { - return MapSymbolicDimEqual(NewConstantSymbolicDim(new_lhs.factor), - new_rhs.symbols[0]); - } else if (new_rhs.symbols.size() == 0 && new_lhs.symbols.size() == 1 && - new_lhs.factor == 1) { - return MapSymbolicDimEqual(NewConstantSymbolicDim(new_rhs.factor), - new_lhs.symbols[0]); - } - - productEqualityMap_[new_lhs][new_rhs] = - productEqualityMap_[new_rhs][new_lhs] = true; - - productEqualityMapUpdated_ = false; - return true; -} - -std::pair -SymbolicDimMgr::SimplifySymbolicDimProductPair(const SymbolicDimProduct& x, - const SymbolicDimProduct& y) { - auto lhs = SimplifySymbolicDimProduct(x); - auto rhs = SimplifySymbolicDimProduct(y); - - SymbolicDimProduct new_lhs, new_rhs; - int64_t gcd_factor = std::gcd(std::abs(lhs.factor), std::abs(rhs.factor)); - if (!gcd_factor) - return std::make_pair(std::move(new_lhs), std::move(new_rhs)); - if (std::abs(lhs.factor) < std::abs(rhs.factor)) { - if (lhs.factor < 0) gcd_factor = -gcd_factor; - } else { - if (rhs.factor < 0) gcd_factor = -gcd_factor; - } - - new_lhs.factor = lhs.factor / gcd_factor; - new_rhs.factor = rhs.factor / gcd_factor; - - std::unordered_map lhs_symbol_map; - std::unordered_map rhs_symbol_map; - for (SymbolicDim op : lhs.symbols) ++lhs_symbol_map[op]; - for (SymbolicDim op : rhs.symbols) ++rhs_symbol_map[op]; - - for (SymbolicDim op : lhs.symbols) { - auto it = rhs_symbol_map.find(op); - if (it != rhs_symbol_map.end() && op.GetKnownNonSizeZero()) { - if (--it->second == 0) rhs_symbol_map.erase(it); - continue; - } - new_lhs.symbols.push_back(op); - } - - for (SymbolicDim op : rhs.symbols) { - auto it = lhs_symbol_map.find(op); - if (it != lhs_symbol_map.end() && op.GetKnownNonSizeZero()) { - if (--it->second == 0) lhs_symbol_map.erase(it); - continue; - } - new_rhs.symbols.push_back(op); - } - - if (!new_lhs.factor) new_lhs.symbols.clear(); - if (!new_rhs.factor) new_rhs.symbols.clear(); - - return std::make_pair(std::move(new_lhs), std::move(new_rhs)); -} - -SymbolicDimProduct SymbolicDimMgr::SimplifySymbolicDimProduct( - const SymbolicDimProduct& x) { - std::vector copied; - copied.reserve(x.symbols.size()); - for (SymbolicDim op : x.symbols) copied.push_back(GetRootSymbolicDim(op)); - - sort(copied.begin(), copied.end(), [&](SymbolicDim lhs, SymbolicDim rhs) { - return CompareSymbolicDimNames(lhs.GetSymName(), rhs.GetSymName()); - }); - SymbolicDimProduct newX; - newX.factor = x.factor; - for (SymbolicDim op : copied) { - if (!op.IsDynamic()) { - newX.factor *= op.GetDimSize(); - } else { - newX.symbols.push_back(op); - } - } - return newX; -} - -const std::string SymbolicDimMgr::GetNextName() { - std::string name; - do { - name = "S" + std::to_string(nextSymbolicIdx_++); - } while (!symbolNameSet_.insert(name).second); - return name; -} - -SymbolicDimMgr::SymbolicDimMgr(ModuleOp m) : m_(m) { - for (auto op : *(m.block())) { - if (op->isa()) { - symbol_table_ = SymbolTable(op); - return; - } - } - Builder builder = Builder(m_.ir_context(), m_.block(), m_.block()->begin()); - dialect::FuncOp func = builder.Build(); - symbol_table_ = SymbolTable(func); -} - -SymbolicDim SymbolicDimMgr::NewSymbolicDim(const std::string& name) { - auto func_op = symbol_table_.getOp()->dyn_cast(); - assert(func_op); - Builder 
builder = Builder(m_.ir_context(), func_op.block()); - // default settting dim != 0 - dialect::SymbolicDim symbol = - builder.Build(name.empty() ? GetNextName() : name, - ShapedTypeInterface::kDynamic, - false, - false, - false, - true); - symbolDimUnionSet_[symbol] = symbol; - symbol_table_.insert(symbol); - return symbol; -} - -SymbolicDim SymbolicDimMgr::NewConstantSymbolicDim(int64_t val) { - auto it = constantSymbolicDimMap_.find(val); - if (it == constantSymbolicDimMap_.end()) { - auto name = "C" + std::to_string(val); - it = constantSymbolicDimMap_ - .insert(std::make_pair(val, NewSymbolicDim(name))) - .first; - it->second.SetDimSize(val); - if (val == -1) it->second.UpdateKnownNegativeOne(true); - if (val >= 0) it->second.UpdateKnownNonNegative(true); - if (val != 1) it->second.UpdateKnownNonSizeOne(true); - if (val != 0) it->second.UpdateKnownNonSizeZero(true); - } - return GetRootSymbolicDim(it->second); -} - -std::vector SymbolicDimMgr::CreateSymbolicDimsForRankedValue( - Value value) { - std::vector symbols; - auto dims = value.type().dyn_cast().dims(); - for (int idx = 0; idx < dims.size(); ++idx) { - symbols.push_back(dims[idx] == ShapedTypeInterface::kDynamic - ? NewSymbolicDim() - : NewConstantSymbolicDim(dims[idx])); - } - return symbols; -} - -SymbolicDim SymbolicDimMgr::GetRootSymbolicDim(SymbolicDim symbol) { - SymbolicDim current = symbol; - std::vector path; - while (symbolDimUnionSet_[current] != current) { - path.push_back(current); - current = symbolDimUnionSet_[current]; - } - for (SymbolicDim sym : path) symbolDimUnionSet_[sym] = current; - return current; -} - -bool SymbolicDimMgr::IsSymbolicDimEqual(SymbolicDim lhs, SymbolicDim rhs) { - SymbolicDim lhsRoot = GetRootSymbolicDim(lhs); - SymbolicDim rhsRoot = GetRootSymbolicDim(rhs); - return lhsRoot == rhsRoot; -} - -bool SymbolicDimMgr::MapSymbolicDimEqual(SymbolicDim lhs, SymbolicDim rhs) { - SymbolicDim lhsRoot = GetRootSymbolicDim(lhs); - SymbolicDim rhsRoot = GetRootSymbolicDim(rhs); - - if (lhsRoot != rhsRoot) { - if (CompareSymbolicDimNames(lhsRoot.GetSymName(), rhsRoot.GetSymName())) { - if (!lhsRoot.Merge(rhsRoot)) return false; - symbolDimUnionSet_[rhsRoot] = lhsRoot; - } else { - if (!rhsRoot.Merge(lhsRoot)) return false; - symbolDimUnionSet_[lhsRoot] = rhsRoot; - } - } - return true; -} - -SymbolicDimProduct* SymbolicDimMgr::SymbolicDimProductDivide( - const SymbolicDimProduct& lhs, const SymbolicDimProduct& rhs) { - SymbolicDimProduct new_lhs, new_rhs; - std::tie(new_lhs, new_rhs) = SimplifySymbolicDimProductPair(lhs, rhs); - - if (new_lhs.factor == 0 || new_rhs.factor == 0) return nullptr; - if (new_lhs.factor % new_rhs.factor != 0) return nullptr; - if (new_lhs.symbols.size() < new_rhs.symbols.size()) return nullptr; - - SymbolicDimProduct* result = new SymbolicDimProduct(); - result->factor = new_lhs.factor / new_rhs.factor; - - std::unordered_map sym_proc_map; - for (SymbolicDim sym : new_rhs.symbols) ++sym_proc_map[sym]; - - for (SymbolicDim sym : new_lhs.symbols) { - auto it = sym_proc_map.find(sym); - if (it == sym_proc_map.end()) { - result->symbols.push_back(sym); - continue; - } - if (--it->second == 0) { - sym_proc_map.erase(it); - continue; - } - } - - if (!sym_proc_map.empty()) return nullptr; - return result; -} - -bool SymbolicDimMgr::IsMultipleOfKnownSymbolicDimProductEqualPair( - const SymbolicDimProduct& lhs, const SymbolicDimProduct& rhs) { - for (auto& pairOutter : productEqualityMap_) { - const SymbolicDimProduct& x = pairOutter.first; - auto factorX = 
SymbolicDimProductDivide(lhs, x); - if (!factorX) continue; - for (auto& pairInner : pairOutter.second) { - if (!pairInner.second) continue; - const SymbolicDimProduct& y = pairInner.first; - auto factorY = SymbolicDimProductDivide(rhs, y); - if (!factorY || (*factorX) != (*factorY)) continue; - return true; - } - } - - return false; -} - -bool SymbolicDimMgr::UpdateProductEqualityMap() { - // early return if nothing is updated. - if (productEqualityMapUpdated_) return true; - - SymbolicDimProductMap newMap; - std::unordered_set productSet; - for (auto& pairOutter : productEqualityMap_) { - const SymbolicDimProduct& x = pairOutter.first; - for (auto& pairInner : pairOutter.second) { - if (!pairInner.second) continue; - const SymbolicDimProduct& y = pairInner.first; - SymbolicDimProduct newX, newY; - std::tie(newX, newY) = SimplifySymbolicDimProductPair(x, y); - if (newX == newY) continue; - newMap[newX][newY] = newMap[newY][newX] = true; - productSet.insert(newX); - productSet.insert(newY); - } - } - // hash function of SymbolicDimProduct is expensive, thus we map it to integer - // domain first. - std::unordered_map symProd2Idx; - std::vector idx2SymProd(productSet.size()); - std::vector idx2root(productSet.size()); - for (auto& x : productSet) { - size_t idx = symProd2Idx.size(); - symProd2Idx[&x] = idx; - idx2SymProd[idx] = &x; - idx2root[idx] = idx; - } - - auto getRootIdx = [&](size_t root) { - std::vector path; - while (idx2root[root] != root) { - path.push_back(root); - root = idx2root[root]; - } - for (size_t idx : path) idx2root[idx] = root; - return root; - }; - - for (size_t x = 0; x < symProd2Idx.size(); ++x) { - auto& xProd = *idx2SymProd[x]; - auto& rowMap = newMap[xProd]; - size_t xRoot = getRootIdx(x); - for (size_t y = x; y < symProd2Idx.size(); ++y) { - auto& yProd = *idx2SymProd[y]; - if (!rowMap[yProd]) continue; - idx2root[getRootIdx(y)] = xRoot; - } - } - - for (size_t x = 0; x < symProd2Idx.size(); ++x) - for (size_t y = x; y < symProd2Idx.size(); ++y) { - if (getRootIdx(x) != getRootIdx(y)) continue; - auto& xSymProd = *idx2SymProd[x]; - auto& ySymProd = *idx2SymProd[y]; - - newMap[xSymProd][ySymProd] = newMap[ySymProd][xSymProd] = true; - } - - productEqualityMap_ = std::move(newMap); - - for (auto& x : productSet) - for (auto& y : productSet) { - if (!productEqualityMap_[x][y]) continue; - productEqualityMap_[x][y] = productEqualityMap_[y][x] = false; - if (!IsMultipleOfKnownSymbolicDimProductEqualPair(x, y)) { - productEqualityMap_[x][y] = productEqualityMap_[y][x] = true; - } - } - - std::unordered_set toRemove; - for (auto& x : productSet) { - if (std::all_of(productSet.begin(), - productSet.end(), - [&](const SymbolicDimProduct& y) { - return !productEqualityMap_[x][y]; - })) { - toRemove.insert(x); - } - } - - for (auto& x : toRemove) { - productEqualityMap_.erase(x); - } - - productEqualityMapUpdated_ = true; - return true; -} - -bool SymbolicDimMgr::IsSymbolicDimProductEqual(const SymbolicDimProduct& lhs, - const SymbolicDimProduct& rhs) { - SymbolicDimProduct new_lhs, new_rhs; - std::tie(new_lhs, new_rhs) = SimplifySymbolicDimProductPair(lhs, rhs); - - // early return for identity case. 
- if (new_lhs == new_rhs) return true; - IR_ENFORCE(UpdateProductEqualityMap(), "Update product equality map failed."); - return IsMultipleOfKnownSymbolicDimProductEqualPair(new_lhs, new_rhs); -} - -bool SymbolicDimMgr::Save() { - using Name2SymbolFn = std::function; - auto updateAttrs = [&](ArrayAttribute attrs, Name2SymbolFn fn) { - std::vector newAttrs; - for (Attribute attr : attrs.AsVector()) { - auto sym = fn(attr.dyn_cast().AsString()); - assert(sym); - SymbolicDim root = GetRootSymbolicDim(sym); - Attribute rootSymbol = - StrAttribute::get(m_->ir_context(), root.GetSymName()); - newAttrs.push_back(rootSymbol); - } - return ArrayAttribute::get(m_->ir_context(), newAttrs); - }; - - // TODO(liujinnan): update attributes attached in DenseTensorType - for (auto op : *(m_.block())) { - if (!op->HasAttribute(SymbolicDim::GetSymbolicDimAttrName())) continue; - auto attrs = - op->attribute(SymbolicDim::GetSymbolicDimAttrName()); - auto symbolicShapeAttr = updateAttrs(attrs, [&](const std::string& name) { - return symbol_table_.Lookup(name); - }); - op->set_attribute(SymbolicDim::GetSymbolicDimAttrName(), symbolicShapeAttr); - } - if (!UpdateProductEqualityMap()) { - return false; - } - std::unordered_set usedSymbolicOps; - std::vector usedSymbolNames; - // TODO(liujinnan): collect uses in value. - auto collectUsedSymbols = [&](ArrayAttribute attrs) { - for (Attribute attr : attrs.AsVector()) { - auto sym = symbol_table_.Lookup( - attr.dyn_cast().AsString()); - assert(sym); - if (usedSymbolicOps.insert(sym).second) - usedSymbolNames.push_back(sym.GetSymName()); - } - }; - for (auto op : *(m_.block())) { - if (!op->HasAttribute(SymbolicDim::GetSymbolicDimAttrName())) continue; - auto attrs = - op->attribute(SymbolicDim::GetSymbolicDimAttrName()); - collectUsedSymbols(attrs); - } - auto func_op = symbol_table_.getOp()->dyn_cast(); - assert(func_op); - for (auto& p : symbolDimUnionSet_) { - if (!usedSymbolicOps.count(p.first)) { - func_op.block()->erase(*(p.first.operation())); - } - } - - std::vector candidates; - for (auto& outter : productEqualityMap_) { - if (std::any_of( - outter.first.symbols.begin(), - outter.first.symbols.end(), - [&](SymbolicDim sym) { return usedSymbolicOps.count(sym) == 0; })) - candidates.push_back(outter.first); - } - - for (auto& prod : candidates) productEqualityMap_.erase(prod); - for (auto& outter : productEqualityMap_) { - std::vector candidates; - for (auto& inner : outter.second) { - if (std::any_of( - inner.first.symbols.begin(), - inner.first.symbols.end(), - [&](SymbolicDim sym) { return usedSymbolicOps.count(sym) == 0; })) - candidates.push_back(outter.first); - } - for (auto& prod : candidates) outter.second.erase(prod); - } - - std::sort(usedSymbolNames.begin(), - usedSymbolNames.end(), - [&](const std::string& lhs, const std::string& rhs) { - return CompareSymbolicDimNames(lhs, rhs); - }); - int numNonConstDims = 0; - std::unordered_map nameMapping; - for (const auto& name : usedSymbolNames) { - if (name.size() > 0 && name[0] == 'C') { - nameMapping[name] = name; - } else { - nameMapping[name] = ("S" + std::to_string(numNonConstDims++)); - } - } - - std::unordered_map name2Symbol; - for (SymbolicDim op : usedSymbolicOps) { - auto name = op.GetSymName(); - op.SetSymName(nameMapping[name]); - name2Symbol[name] = op; - } - - for (auto op : *(m_.block())) { - if (!op->HasAttribute(SymbolicDim::GetSymbolicDimAttrName())) continue; - auto attrs = - op->attribute(SymbolicDim::GetSymbolicDimAttrName()); - auto symbolicShapeAttr = updateAttrs( - attrs, 
[&](const std::string& name) { return name2Symbol[name]; }); - op->set_attribute(SymbolicDim::GetSymbolicDimAttrName(), symbolicShapeAttr); - } - - // TODO(liujinnan): update attributes attached to values. - - return SaveShapeConstraintGraph(); -} - -bool SymbolicDimMgr::SaveShapeConstraintGraph() { - auto func_op = symbol_table_.getOp()->dyn_cast(); - assert(func_op); - auto op_it = func_op.block()->rbegin(); - while (op_it != func_op.block()->rend()) { - if (((*op_it)->isa()) || - ((*op_it)->isa())) - op_it++; - else - op_it = decltype(op_it)(func_op.block()->erase(*(*op_it))); - } - - Builder builder = Builder(m_->ir_context(), func_op.block()); - auto build_operands = [&](const SymbolicDimProduct& prod) { - std::vector values; - - if (prod.factor != 1) { - values.push_back( - builder - .Build( - Int32Attribute::get(m_->ir_context(), prod.factor), - Int32Type::get(m_->ir_context())) - ->result(0)); - } - for (SymbolicDim sym : prod.symbols) { - values.push_back(builder.Build(sym.GetSymName()).out()); - } - return values; - }; - std::vector sortedProductVec; - for (auto& p : productEqualityMap_) sortedProductVec.push_back(p.first); - std::sort(sortedProductVec.begin(), - sortedProductVec.end(), - CompareSymbolicDimProduct); - for (auto& x : sortedProductVec) { - for (auto& y : sortedProductVec) { - if (!CompareSymbolicDimProduct(x, y)) continue; - if (!productEqualityMap_[x][y]) continue; - auto lhsOperands = build_operands(x); - auto rhsOperands = build_operands(y); - builder.Build(lhsOperands, rhsOperands); - } - } - return true; -} - -ShapeComputationIRAnalysis::ShapeComputationIRAnalysis(ModuleOp m, - SymbolicDimMgr& mgr) - : m_(m), mgr_(mgr) {} - -bool ShapeComputationIRAnalysis::Run() { - // Make sure only run once. - if (initialized_) return false; - initialized_ = true; - auto buildShapeFunc = - std::bind(&ShapeComputationIRAnalysis::BuildShapeOnOperation, - this, - std::placeholders::_1); - if (!RunOnRegion(&(m_->region(0)), buildShapeFunc)) return false; - auto applyOpConstraintFunc = - std::bind(&ShapeComputationIRAnalysis::ApplyOpConstraint, - this, - std::placeholders::_1); - if (!RunOnRegion(&(m_->region(0)), applyOpConstraintFunc)) return false; - return true; -} - -bool ShapeComputationIRAnalysis::RunOnRegion(Region* region, func fn) { - for (Block* block : *region) { - if (!RunOnBlock(block, fn)) return false; - } - return true; -} - -bool ShapeComputationIRAnalysis::RunOnBlock(Block* block, func fn) { - // TODO(liujinnan): mapping block arguments - - std::vector op_list; - for (Operation* op : *block) op_list.push_back(op); - for (Operation* op : op_list) { - if (!RunOnOperation(op, fn)) return false; - } - return true; -} - -bool ShapeComputationIRAnalysis::RunOnOperation(Operation* op, func fn) { - for (size_t i = 0; i < op->num_regions(); ++i) { - if (!RunOnRegion(&(op->region(i)), fn)) return false; - } - return fn(op); -} - -bool ShapeComputationIRAnalysis::BuildShapeOnOperation(Operation* op) { - if (op->isa()) return true; - if (op->isa()) { - Value value = op->operand_source(0); - std::vector symbols; - if (op->HasAttribute(SymbolicDim::GetSymbolicDimAttrName())) { - auto attrs = - op->attribute(SymbolicDim::GetSymbolicDimAttrName()) - .AsVector(); - for (Attribute attr : attrs) { - auto sym = mgr_.symbolTable().Lookup( - attr.dyn_cast().AsString()); - assert(sym); - SymbolicDim root = mgr_.GetRootSymbolicDim(sym); - symbols.push_back(root); - } - } else { - symbols = mgr_.CreateSymbolicDimsForRankedValue(value); - std::vector attrs; - for (SymbolicDim sym : 
symbols) { - Attribute rootSymbol = - StrAttribute::get(m_->ir_context(), sym.GetSymName()); - attrs.push_back(rootSymbol); - } - op->set_attribute(SymbolicDim::GetSymbolicDimAttrName(), - ArrayAttribute::get(m_->ir_context(), attrs)); - } - rankedTensor2SymDims_[value] = std::move(symbols); - return true; - } - for (size_t i = 0; i < op->num_results(); ++i) { - if (!BuildShapeOnValue(op->result(i))) return false; - } - return true; -} - -bool ShapeComputationIRAnalysis::BuildShapeOnValue(Value value) { - Type type = value.type(); - if (IsIntOrIndex(type)) { - SymbolicDim sym = mgr_.NewSymbolicDim(); - value2SymDim_[value] = sym; - } else if (IsCandidateShapeTensorType(type)) { - auto shapedTy = type.dyn_cast(); - std::vector symbols; - for (size_t i = 0, d = shapedTy.GetShape()[0]; i < d; ++i) - symbols.push_back(mgr_.NewSymbolicDim()); - shapeTensor2SymDims_[value] = std::move(symbols); - } - return true; -} - -bool ShapeComputationIRAnalysis::ApplyOpConstraint(Operation* op) { - IR_ENFORCE(ApplyIndexOpConstraint(op), - "Fail to apply constraint for index op"); - IR_ENFORCE(ApplyTieShapeOpConstraint(op), - "Fail to apply constraint for tie_shape op"); - - // TODO(zhangbo63): add more constraints - return true; -} - -bool ShapeComputationIRAnalysis::ApplyIndexOpConstraint(Operation* op) { - if (op->num_results() == 0) return true; - - Type type = op->result(0).type(); - if (!IsIntOrIndex(type)) return true; - - if (auto dimOp = op->dyn_cast()) { - int64_t dimIndex = dimOp.index() - .dyn_cast() - .owner() - ->attribute("value") - .data(); - value2SymDim_[dimOp.out()].UpdateKnownNonNegative(true); - if (!mgr_.MapSymbolicDimEqual( - value2SymDim_[dimOp.out()], - rankedTensor2SymDims_[dimOp.source()][dimIndex])) { - return false; - } - - } else if (auto constOp = op->dyn_cast()) { - int64_t val = constOp.value().dyn_cast().data(); - if (!mgr_.MapSymbolicDimEqual(value2SymDim_[op->result(0)], - mgr_.NewConstantSymbolicDim(val))) { - return false; - } - } - // TODO(zhangbo63): add support for reifyInferShape. (e.g. 
mul/add) - return true; -} - -bool ShapeComputationIRAnalysis::ApplyTieShapeOpConstraint(Operation* op) { - if (auto tieShape = op->dyn_cast()) { - auto& value = rankedTensor2SymDims_[op->operand_source(0)]; - for (size_t idx = 0; idx < tieShape.dims().size(); ++idx) { - if (!mgr_.MapSymbolicDimEqual(value2SymDim_[tieShape.dims()[idx]], - value[idx])) - return false; - mgr_.GetRootSymbolicDim(value[idx]).UpdateKnownNonNegative(true); - } - } - return true; -} - bool IsIntOrIndex(Type type) { return type.isa() || type.isa() || type.isa() || type.isa() || diff --git a/paddle/pir/dialect/shape/utils/shape_utils.h b/paddle/pir/dialect/shape/utils/shape_utils.h index 3388971d32aac..72510f8a23c83 100644 --- a/paddle/pir/dialect/shape/utils/shape_utils.h +++ b/paddle/pir/dialect/shape/utils/shape_utils.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/pir/dialect/shape/utils/shape_optimization_utils.h" #include "paddle/pir/dialect/shape/utils/symbol_table.h" namespace pir { @@ -49,101 +50,10 @@ class ShapeAnalysis { using dialect::SymbolicDim; -struct SymbolicDimProduct { - std::vector symbols; - int64_t factor = 1; - bool empty() { return factor == 1 && symbols.empty(); } - friend inline bool operator==(const SymbolicDimProduct& lhs, - const SymbolicDimProduct& rhs) { - return lhs.factor == rhs.factor && lhs.symbols == rhs.symbols; - } - - friend inline bool operator!=(const SymbolicDimProduct& lhs, - const SymbolicDimProduct& rhs) { - return !(lhs == rhs); - } -}; - -struct SymDimHasher { - size_t operator()(const dialect::SymbolicDim& symbol) const noexcept { - return std::hash{}(symbol.operation()); - } -}; - -struct SymProductHasher { - size_t operator()(const SymbolicDimProduct& symProd) const noexcept { - size_t hash = std::hash{}(symProd.symbols.size()); - for (auto& symbol : symProd.symbols) { - hash = hash_combine(hash, SymDimHasher{}(symbol)); // NOLINT - } - hash = hash_combine(hash, std::hash{}(symProd.factor)); - return hash; - } -}; - -class SymbolicDimMgr { - public: - explicit SymbolicDimMgr(ModuleOp m); - bool Load(); - SymbolicDim NewSymbolicDim(const std::string& name = {}); - SymbolicDim NewConstantSymbolicDim(int64_t val); - std::vector CreateSymbolicDimsForRankedValue(Value value); - SymbolicDim GetRootSymbolicDim(SymbolicDim symbol); - bool IsSymbolicDimEqual(SymbolicDim lhs, SymbolicDim rhs); - SymbolTable& symbolTable() { return symbol_table_; } - bool MapSymbolicDimEqual(SymbolicDim lhs, SymbolicDim rhs); - SymbolicDimProduct SimplifySymbolicDimProduct(const SymbolicDimProduct& x); - std::pair - SimplifySymbolicDimProductPair(const SymbolicDimProduct& x, - const SymbolicDimProduct& y); - SymbolicDimProduct* SymbolicDimProductDivide(const SymbolicDimProduct& x, - const SymbolicDimProduct& y); - - bool Save(); - - bool IsSymbolicDimProductEqual(const SymbolicDimProduct& lhs, - const SymbolicDimProduct& rhs); - bool MapSymbolicDimProductEqual(const SymbolicDimProduct& lhs, - const SymbolicDimProduct& rhs); - - private: - const std::string GetNextName(); - bool UpdateProductEqualityMap(); - bool IsMultipleOfKnownSymbolicDimProductEqualPair( - const SymbolicDimProduct& lhs, const SymbolicDimProduct& rhs); - bool SaveShapeConstraintGraph(); - bool LoadShapeConstraintGraph(); - - private: - ModuleOp m_; - - SymbolTable symbol_table_; - - int64_t nextSymbolicIdx_ = 0; - - std::unordered_set symbolNameSet_; - - std::unordered_map symbolDimUnionSet_; - - std::unordered_map constantSymbolicDimMap_; - - // productEqualityMap_[A][B] == true : Product[A] == Product[B] - using 
SymbolicDimProductMap = std::unordered_map< - SymbolicDimProduct, - std::unordered_map, - SymProductHasher>; - SymbolicDimProductMap productEqualityMap_; - bool productEqualityMapUpdated_ = true; -}; - // A subclass to impement `ShapeAnalysis` on buffer level. // The implementation is based on shape constraint ir. class ShapeConstraintIRAnalysis : public ShapeAnalysis { public: - // Build shape related analysis on the provided `op`. - // This generally can be divided into two steps: - // 1, load exsiting shape constraint ir (e.g. symbolic dim ops) - // 2, build mapping between memref values and symbolic dim ops. explicit ShapeConstraintIRAnalysis(ModuleOp m); // auto-save updated shape constriant ir when destroying. @@ -156,12 +66,6 @@ class ShapeConstraintIRAnalysis : public ShapeAnalysis { // Returns true if the two value have the same symbolic shape. bool IsShapeEqual(Value lhs, Value rhs) override; - // Suppose: - // lhs_dim_idxs = {ld0, ld1, ...} - // rhs_dim_idxs = {rd0, rd1, ...} - // Returns true if: - // lhs.shape[ld0] * lhs.shape[ld1] * ... == - // rhs.shape[rd0] * rhs.shape[rd1] * ... bool IsProductEqual(Value lhs, std::vector lhs_dim_idxs, Value rhs, @@ -177,37 +81,6 @@ class ShapeConstraintIRAnalysis : public ShapeAnalysis { std::unordered_map> value_to_sym_dims_; }; -class ShapeComputationIRAnalysis { - public: - using func = std::function; - explicit ShapeComputationIRAnalysis(ModuleOp m, - SymbolicDimMgr& mgr); // NOLINT - bool Run(); - - private: - bool RunOnRegion(Region* region, func fn); - bool RunOnBlock(Block* block, func fn); - bool RunOnOperation(Operation* op, func fn); - - bool BuildShapeOnOperation(Operation* op); - bool BuildShapeOnValue(Value value); - - bool ApplyOpConstraint(Operation* op); - bool ApplyIndexOpConstraint(Operation* op); - bool ApplyTieShapeOpConstraint(Operation* op); - - bool initialized_ = false; - ModuleOp m_; - SymbolicDimMgr& mgr_; - - std::unordered_map value2SymDim_; - - // shape tensor is the 1D ranked tensor with int/index dtype. 
- std::unordered_map> shapeTensor2SymDims_; - - std::unordered_map> rankedTensor2SymDims_; -}; - bool IsIntOrIndex(Type type); bool IsCandidateShapeTensorType(Type ty); } // namespace pir diff --git a/test/cpp/pir/shape_dialect/constraint_pass_test.cc b/test/cpp/pir/shape_dialect/constraint_pass_test.cc index 7c645044a09d0..f5282727f7250 100644 --- a/test/cpp/pir/shape_dialect/constraint_pass_test.cc +++ b/test/cpp/pir/shape_dialect/constraint_pass_test.cc @@ -39,6 +39,7 @@ #include "paddle/pir/core/value.h" #include "paddle/pir/dialect/shape/ir/shape_dialect.h" #include "paddle/pir/dialect/shape/ir/shape_op.h" +#include "paddle/pir/dialect/shape/transforms/shape_optimization.h" #include "paddle/pir/dialect/shape/transforms/shape_optimization_pass.h" #include "paddle/pir/dialect/shape/utils/shape_utils.h" #include "paddle/pir/pass/pass.h" From dcb545fab537d93674855df2b01ad1d8f2b18df4 Mon Sep 17 00:00:00 2001 From: Zhenghai Zhang <65210872+ccsuzzh@users.noreply.github.com> Date: Mon, 9 Oct 2023 18:07:52 +0800 Subject: [PATCH 32/62] [CodeStyle][task 39] enable isort in `python/paddle/base/framework.py`(part3) (#57839) * enable isort in python/paddle/base/framework.py * try relative import * `fluid_version` -> `paddle_version` * correct noqa: F401 * try to fix import order deps * refine comments * `ir_change` -> `ir_guard` * fix some ugly import from #46946 * fix `paddle/base/__init__.py` * move all patch code into one place * move all patch code to base/__init__.py * remove monkey_patch_math_tensor, its part of monkey_patch_tensor * adjust patch order * Revert "adjust patch order" This reverts commit 11a4881bb53ba83b4db762f08a6cba1672100897. * Revert "remove monkey_patch_math_tensor, its part of monkey_patch_tensor" This reverts commit 9532b666bb72562e2869ea3b9e2acb3143c92ed2. * Revert "move all patch code to base/__init__.py" This reverts commit 3b1894ca9fc8285f3ccaf1b52a6e8fdc71c1cb4a. * Revert "move all patch code into one place" This reverts commit 574d9ca42195367a8eb0b32bd976ebe23f5b98aa. --------- Co-authored-by: SigureMo --- pyproject.toml | 1 - python/paddle/__init__.py | 7 ++- python/paddle/base/__init__.py | 8 ++-- python/paddle/base/framework.py | 53 +++++++++++----------- python/paddle/pir/__init__.py | 8 ++-- test/dygraph_to_static/test_origin_info.py | 4 +- 6 files changed, 40 insertions(+), 41 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4a49ec99f4ec6..ef651db3d1849 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,6 @@ extend_skip_glob = [ # These files do not need to be formatted, # see .flake8 for more details "python/paddle/utils/gast/**", - "python/paddle/base/framework.py", ] [tool.ruff] diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index e73b9ae0cc309..f9d0d9a536e70 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -23,6 +23,9 @@ import paddle from the source directory; please install paddlepaddle*.whl firstly.''' ) +# NOTE(SigureMo): We should place the import of base.core before other modules, +# because there are some initialization codes in base/core/__init__.py. +from .base import core # noqa: F401 from .batch import batch # Do the *DUPLICATED* monkey-patch for the tensor object. 
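Note on the import blocks this patch reorders: isort's default grouping is standard library first, then third-party, then first-party and relative imports, each group alphabetized and separated by a blank line. A minimal sketch of the target layout (module names illustrative, assuming isort's default profile):

    import os           # standard library
    import sys

    import numpy as np  # third-party

    # First-party / relative imports come last; commented out here because
    # they only resolve from inside the package:
    # from . import core, unique_name
    # from .. import pir

Imports kept purely for re-export or side effects (such as `core` above) carry `# noqa: F401` so flake8/Ruff does not report them as unused.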
@@ -532,8 +535,8 @@ from .pir_utils import IrGuard -ir_change = IrGuard() -ir_change._switch_to_pir() +ir_guard = IrGuard() +ir_guard._switch_to_pir() __all__ = [ 'iinfo', diff --git a/python/paddle/base/__init__.py b/python/paddle/base/__init__.py index 4acf21c465776..5bab0d5cf84f0 100644 --- a/python/paddle/base/__init__.py +++ b/python/paddle/base/__init__.py @@ -15,6 +15,7 @@ import os import sys import atexit +import platform # The legacy core need to be removed before "import core", # in case of users installing paddlepaddle without -U option @@ -32,6 +33,8 @@ except Exception as e: raise e +from . import core + # import all class inside framework into base module from . import framework from .framework import ( @@ -138,11 +141,6 @@ def __bootstrap__(): Returns: None """ - import sys - import os - import platform - from . import core - # NOTE(zhiqiu): When (1)numpy < 1.19; (2) python < 3.7, # unittest is always imported in numpy (maybe some versions not). # so is_test is True and p2p is not inited. diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py index ec580ba50d246..5c86638a76627 100644 --- a/python/paddle/base/framework.py +++ b/python/paddle/base/framework.py @@ -12,33 +12,32 @@ # See the License for the specific language governing permissions and # limitations under the License. -import textwrap import collections -from collections.abc import Iterable -from .wrapped_decorator import signature_safe_contextmanager, wrap_decorator +import copy +import functools +import multiprocessing import os import re +import subprocess +import sys +import textwrap +import threading import traceback -import copy -from types import MethodType, FunctionType +import warnings +from collections.abc import Iterable +from types import FunctionType, MethodType import numpy as np -import subprocess -import multiprocessing -import sys -from .proto import framework_pb2 -from .proto import data_feed_pb2 # noqa: F401 +import paddle.version as paddle_version -from . import core -from . import unique_name from .. import pir -from paddle.base.libpaddle import DataType -import paddle.version as fluid_version -import warnings -import functools -from .variable_index import _getitem_static, _setitem_static, _setitem_impl_ -import threading +from . import core, unique_name +from .libpaddle import DataType +from .proto import data_feed_pb2 # noqa: F401 +from .proto import framework_pb2 +from .variable_index import _getitem_static, _setitem_impl_, _setitem_static +from .wrapped_decorator import signature_safe_contextmanager, wrap_decorator __all__ = [] @@ -503,10 +502,10 @@ def require_version(min_version, max_version=None): ) version_installed = [ - fluid_version.major, - fluid_version.minor, - fluid_version.patch, - fluid_version.rc, + paddle_version.major, + paddle_version.minor, + paddle_version.patch, + paddle_version.rc, ] zero_version = ['0', '0', '0', '0'] @@ -524,7 +523,7 @@ def version_cmp(ver_a, ver_b): "PaddlePaddle version in [{}, {}] required, but {} installed. 
" "Maybe you are using a develop version, " "please make sure the version is good with your code.".format( - min_version, max_version, fluid_version.full_version + min_version, max_version, paddle_version.full_version ) ) else: @@ -532,7 +531,7 @@ def version_cmp(ver_a, ver_b): "PaddlePaddle version {} or higher is required, but {} installed, " "Maybe you are using a develop version, " "please make sure the version is good with your code.".format( - min_version, fluid_version.full_version + min_version, paddle_version.full_version ) ) return @@ -554,7 +553,7 @@ def version_cmp(ver_a, ver_b): ): raise Exception( "VersionError: PaddlePaddle version in [{}, {}] required, but {} installed.".format( - min_version, max_version, fluid_version.full_version + min_version, max_version, paddle_version.full_version ) ) else: @@ -562,7 +561,7 @@ def version_cmp(ver_a, ver_b): raise Exception( "VersionError: PaddlePaddle version {} or higher is required, but {} installed, " "please upgrade your PaddlePaddle to {} or other higher version.".format( - min_version, fluid_version.full_version, min_version + min_version, paddle_version.full_version, min_version ) ) @@ -7703,8 +7702,8 @@ def _get_var(name, program=None): @signature_safe_contextmanager def dygraph_guard_if_declarative(): - from .dygraph.base import in_to_static_mode from .dygraph import Tracer + from .dygraph.base import in_to_static_mode if in_to_static_mode(): # Under @paddle.jit.to_static decorator, we switch back dygraph mode temporarily. diff --git a/python/paddle/pir/__init__.py b/python/paddle/pir/__init__.py index 39b8c71ca5a2f..8a454c09e058d 100644 --- a/python/paddle/pir/__init__.py +++ b/python/paddle/pir/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddle.base.libpaddle.pir import ( +from paddle.base.libpaddle.pir import ( # noqa: F401 Program, Block, Operation, @@ -22,8 +22,8 @@ fake_op_result, is_fake_op_result, Type, -) # noqa: F401 -from paddle.base.libpaddle.pir import ( +) +from paddle.base.libpaddle.pir import ( # noqa: F401 translate_to_new_ir, set_global_program, set_insertion_point, @@ -32,7 +32,7 @@ check_unregistered_ops, register_paddle_dialect, PassManager, -) # noqa: F401 +) from . import core diff --git a/test/dygraph_to_static/test_origin_info.py b/test/dygraph_to_static/test_origin_info.py index c6415dff1ba1c..e2925d4fa1a4b 100644 --- a/test/dygraph_to_static/test_origin_info.py +++ b/test/dygraph_to_static/test_origin_info.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import inspect import sys import unittest @@ -23,11 +24,10 @@ OriginInfo, attach_origin_info, create_and_update_origin_info_map, - gast, - inspect, unwrap, ) from paddle.jit.dy2static.utils import ast_to_func +from paddle.utils import gast def simple_func(x): From aa277a509552cb687de903df7ef438e63fd0e89b Mon Sep 17 00:00:00 2001 From: Kaedeharai <127716752+Kaedeharai@users.noreply.github.com> Date: Mon, 9 Oct 2023 18:23:37 +0800 Subject: [PATCH 33/62] [CodeStyle][task 8] enable Ruff C408 rule in python/paddle/base (#57864) * [CodeStyle][task 8] enable Ruff C408 rule in python/paddle/base * [CodeStyle][task 8] enable Ruff C408 rule in python/paddle/base * [CodeStyle][task 8] enable Ruff C408 rule in python/paddle/base * [CodeStyle][task 8] enable Ruff C408 rule in python/paddle/base --- pyproject.toml | 1 - python/paddle/base/backward.py | 18 +++++++++--------- python/paddle/base/default_scope_funcs.py | 2 +- python/paddle/base/executor.py | 16 ++++++++-------- python/paddle/base/framework.py | 16 ++++++++-------- .../base/layers/layer_function_generator.py | 4 ++-- 6 files changed, 28 insertions(+), 29 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ef651db3d1849..4ef1904e0c70f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -103,7 +103,6 @@ ignore = [ # Temporarily ignored "python/paddle/base/**" = [ - "C408", "UP030", "C405", "B019", # Confirmation required diff --git a/python/paddle/base/backward.py b/python/paddle/base/backward.py index b3a675882a3a3..82df59271bef1 100755 --- a/python/paddle/base/backward.py +++ b/python/paddle/base/backward.py @@ -812,7 +812,7 @@ def insert_output(self, var): assert isinstance(var, Var) self.outputs.append(var) - var_versions = dict() + var_versions = {} def _create_node(name): if name not in var_versions.keys(): @@ -1808,7 +1808,7 @@ def _rename_grad_( def _get_stop_gradients_(program): - no_grad_dict = dict() + no_grad_dict = {} assert isinstance(program, framework.Program) for block in program.blocks: assert isinstance(block, framework.Block) @@ -2032,7 +2032,7 @@ def append_backward( for idx in son_parent_block_idx_dict: block_fwd_op_num_dict[idx] = program.block(idx).desc.op_size() - grad_to_var = dict() + grad_to_var = {} # pass the cuda_graph_attr to the fill_constant which generates the loss_grad op_desc = _create_loss_op_desc_(loss) @@ -2046,7 +2046,7 @@ def append_backward( map(_strip_grad_suffix_, no_grad_dict[block_idx]) ) - op_path_dict = dict() + op_path_dict = {} op_path = _find_op_path_( block, [loss], [], block_no_grad_set, op_path_dict ) @@ -2109,7 +2109,7 @@ def append_backward( grad_op_id_to_fwd_op=grad_op_id_to_fwd_op, ) - grad_info_map = dict() + grad_info_map = {} # if in control flow, target_grad_block is a created new block which only contains grad ops, # so fwd_op_num is set to 0. 
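The `dict()`-to-`{}` rewrites in this patch are what Ruff's C408 rule enforces: a literal is assembled by a single bytecode instruction, while the constructor spelling pays for a global name lookup plus a call on every execution. A quick way to see the difference on CPython (exact opcode names vary by version):

    import dis

    def with_call():
        return dict()

    def with_literal():
        return {}

    dis.dis(with_call)     # LOAD_GLOBAL dict, then a CALL
    dis.dis(with_literal)  # a single BUILD_MAP 0

    assert with_call() == with_literal() == {}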
@@ -2310,7 +2310,7 @@ def _find_op_path_( input_names = {inp.name for inp in inputs} output_names = _get_output_names(block, targets) if op_path_dict is None: - op_path_dict = dict() + op_path_dict = {} relevant_op_flags = [True] * len(block.ops) @@ -2456,7 +2456,7 @@ def calc_gradient_helper( raise ValueError("input must be in the same program as targets") block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0])) - op_path_dict = dict() + op_path_dict = {} op_path = _find_op_path_( block, targets, inputs, block_no_grad_set, op_path_dict ) @@ -2507,8 +2507,8 @@ def calc_gradient_helper( block_no_grad_set.update(no_grad_vars) no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set))) - grad_to_var = dict() - grad_info_map = dict() + grad_to_var = {} + grad_info_map = {} _append_backward_ops_( block, op_path, diff --git a/python/paddle/base/default_scope_funcs.py b/python/paddle/base/default_scope_funcs.py index dd820572e5edc..225da00088d9a 100644 --- a/python/paddle/base/default_scope_funcs.py +++ b/python/paddle/base/default_scope_funcs.py @@ -42,7 +42,7 @@ def get_cur_scope(): """ cur_scope_stack = getattr(__tl_scope__, 'cur_scope', None) if cur_scope_stack is None: - __tl_scope__.cur_scope = list() + __tl_scope__.cur_scope = [] if len(__tl_scope__.cur_scope) == 0: __tl_scope__.cur_scope.append(paddle.base.core.Scope()) return __tl_scope__.cur_scope[-1] diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py index c2db2f04f663d..e5b513831afb8 100755 --- a/python/paddle/base/executor.py +++ b/python/paddle/base/executor.py @@ -1091,18 +1091,18 @@ def __init__(self, place=None): self.place = expected_place else: self.place = framework._get_paddle_place(place) - self.program_caches = dict() - self.ctx_caches = dict() - self.trainer_caches = dict() - self.scope_caches = dict() - self.micro_scope_cache = dict() - self.var_caches = dict() - self.pruned_program_caches = dict() + self.program_caches = {} + self.ctx_caches = {} + self.trainer_caches = {} + self.scope_caches = {} + self.micro_scope_cache = {} + self.var_caches = {} + self.pruned_program_caches = {} p = core.Place() p.set_place(self.place) self._default_executor = core.Executor(p) self._closed = False - self.pruned_program_scope_caches = dict() + self.pruned_program_scope_caches = {} self._prepare_to_run_called = False self._auto_checkpoint_name = unique_name.generate( diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py index 5c86638a76627..3aea7e6a85a8a 100644 --- a/python/paddle/base/framework.py +++ b/python/paddle/base/framework.py @@ -1022,7 +1022,7 @@ def cuda_pinned_places(device_count=None): class NameScope: def __init__(self, name="", parent=None): - self._children = dict() + self._children = {} self._name = name self._parent = parent @@ -1218,7 +1218,7 @@ def _debug_string_(proto, throw_on_error=True): Returns(str): The debug string of the protobuf message """ - error_fields = list() + error_fields = [] if not proto.IsInitialized(error_fields) and throw_on_error: raise ValueError( f"{error_fields} are not initialized.\nThe message is {proto}:\n" @@ -2931,7 +2931,7 @@ def __init__( # https://github.com/PaddlePaddle/Paddle/pull/12583#pullrequestreview-145093173 op_attrs = attrs if op_attrs is None: - op_attrs = dict() + op_attrs = {} del attrs # attr for static graph mode cuda graph @@ -3955,7 +3955,7 @@ class Block: def __init__(self, program, idx): self.desc = program.desc.block(idx) self.vars = collections.OrderedDict() # var_name --> var - self.ops 
= list() # operator list + self.ops = [] # operator list self.program = program def __str__(self): @@ -4113,7 +4113,7 @@ def _find_var_recursive(self, name): Returns: Variable: the Variable with the giving name. Or None if not found. """ - frontier = list() + frontier = [] visited = set() frontier.append(self) @@ -5426,7 +5426,7 @@ def safe_remove_nodes(self, remove_nodes): def resolve_hazard(self): ordered_nodes = core.topology_sort(self.graph) - var_nodes = dict() + var_nodes = {} for node in ordered_nodes: if node.is_op() and node.op() is not None: for each_var_name in node.op().input_arg_names(): @@ -5483,7 +5483,7 @@ def build_adjacency_list(self): dict{IrNode: set(IrNode)}: the adjacency list. """ adj_list = core.build_adjacency_list(self.graph) - wrapped_adj_list = dict() + wrapped_adj_list = {} for k, v in adj_list.items(): wrapped_adj_list[IrNode(k)] = {IrNode(n) for n in v} return wrapped_adj_list @@ -7121,7 +7121,7 @@ def condition(var): var_list = filter(condition, self.list_vars()) - state_dict = dict() + state_dict = {} for var in var_list: var_temp = scope.find_var(var.name) if var_temp is None: diff --git a/python/paddle/base/layers/layer_function_generator.py b/python/paddle/base/layers/layer_function_generator.py index b0f35af4fefed..f77d26ac50a5f 100644 --- a/python/paddle/base/layers/layer_function_generator.py +++ b/python/paddle/base/layers/layer_function_generator.py @@ -214,7 +214,7 @@ def func(*args, **kwargs): dtype = infer_and_check_dtype(op_proto, *args, **kwargs) - inputs = dict() + inputs = {} for ipt in op_proto.inputs: name = _convert_(ipt.name) val = kwargs.pop(name, []) @@ -225,7 +225,7 @@ def func(*args, **kwargs): args = args[1:] inputs[ipt.name] = val - outputs = dict() + outputs = {} out = kwargs.pop(_convert_(o_name), []) if out: out_var = out[0] if (isinstance(out, (list, tuple))) else out From f333b614d13182a4252d86d86e798a0d1c46cbf9 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Mon, 9 Oct 2023 18:31:13 +0800 Subject: [PATCH 34/62] del_pad_constant_like_op (#57928) --- .../fluid/operators/pad_constant_like_op.cc | 280 ------------------ paddle/fluid/operators/pad_constant_like_op.h | 101 ------- test/legacy_test/test_pad_constant_like.py | 70 ----- 3 files changed, 451 deletions(-) delete mode 100644 paddle/fluid/operators/pad_constant_like_op.cc delete mode 100644 paddle/fluid/operators/pad_constant_like_op.h delete mode 100644 test/legacy_test/test_pad_constant_like.py diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc deleted file mode 100644 index d00cefab45045..0000000000000 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ /dev/null @@ -1,280 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/pad_constant_like_op.h" - -#include - -namespace paddle { -namespace operators { - -class PadConstantLikeOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "PadConstantLike"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "PadConstantLike"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "PadConstantLike"); - - auto x_dim = ctx->GetInputDim("X"); - auto y_dim = ctx->GetInputDim("Y"); - - PADDLE_ENFORCE_EQ(x_dim.size(), - y_dim.size(), - platform::errors::InvalidArgument( - "The size of Input(X)'s dimension and the size of " - "Input(Y)'s dimension should be the same, but " - "received %d for Input(X) vs %d for Input(Y).", - x_dim.size(), - y_dim.size())); - - for (int i = 0; i < x_dim.size(); ++i) { - if ((!ctx->IsRuntime()) && ((x_dim[i] == -1) || (y_dim[i] == -1))) { - continue; - } else { - PADDLE_ENFORCE_GE( - x_dim[i], - y_dim[i], - platform::errors::InvalidArgument( - "The size of each dimension of Input(X) expected to be greater " - "than or equal to size of corresponding dimension of Input(Y) " - "(X_dim[i] >= Y_dim[i]), but received %d < %d for dimension %d", - x_dim[i], - y_dim[i], - i)); - } - } - - ctx->SetOutputDim("Out", x_dim); - ctx->ShareLoD("X", /*->*/ "Out"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "Y"), - ctx.device_context().GetPlace()); - } -}; - -class PadConstantLikeOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "The input of pad_constant_like op. " - "The input should be a k-D tensor(k > 0 and k < 7)"); - AddInput("Y", - "The input of pad_constant_like op. " - "The input should be a k-D tensor(k > 0 and k < 7)"); - AddOutput("Out", - "The output of pad_constant_like op. " - "A tensor with the same shape as X."); - AddAttr("pad_value", - "(float, default 0.0) " - "The value to fill the padded areas.") - .SetDefault(0.0f); - AddComment(R"DOC( -PadConstantLikeOp Operator. - -Pad input(Y) with a pad_value, the number of values padded to the edges of each -axis is specified by the difference of the shape of X and Y. -((0, shape_x_0 - shape_y_0), ... (0, shape_x_n - shape_y_n)) unique pad widths for -each axis. -The input should be a k-D tensor(k > 0 and k < 7). 
As an example: - -case1: - Given: - X = [[1, 2], - [3, 4], - [1, 2], - [3, 4]]], - X.shape = (4, 2) - - Y = [[5, 6], - [7, 8]], - Y.shape = (2, 2) - - And - pad_value = 0, - - Return: - Out = [[5, 6], - [7, 8], - [0, 0], - [0, 0]] - Out.shape = (4, 2) - -case2: - Given: - X = [[[[ 0, 1, 2], - [ 3, 4, 5]], - [[ 6, 7, 8], - [ 9, 10, 11]], - [[12, 13, 14], - [15, 16, 17]]], - [[[18, 19, 20], - [21, 22, 23]], - [[24, 25, 26], - [27, 28, 29]], - [[30, 31, 32], - [33, 34, 35]]]] - X.shape = (2, 3, 2, 3) - - Y = [[[[35, 36, 37]], - [[38, 39, 40]], - [[41, 42, 43]]]] - Y.shape = (1, 3, 1, 3) - - And - pad_value = -1, - - Return: - - Out = [[[[35, 36, 37], - [-1, -1, -1]], - [[38, 39, 40], - [-1, -1, -1]], - [[41, 42, 43], - [-1, -1, -1]]], - [[[-1, -1, -1], - [-1, -1, -1]], - [[-1, -1, -1], - [-1, -1, -1]], - [[-1, -1, -1], - [-1, -1, -1]]]] - Out.shape = (2, 3, 2, 3) -)DOC"); - } -}; - -class PadConstantLikeOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "PadConstantLike@Grad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "PadConstantLike@Grad"); - - auto y_dim = ctx->GetInputDim("Y"); - auto dout_dim = ctx->GetInputDim(framework::GradVarName("Out")); - - PADDLE_ENFORCE_EQ( - dout_dim.size(), - y_dim.size(), - platform::errors::InvalidArgument( - "Op(PadConstantLike@Grad) the size of Input(Out@Grad)'s dimension " - "and the size of Input(Y)'s dimension should be the same, but " - "received %d for Input(Out@Grad) vs %d for Input(Y).", - dout_dim.size(), - y_dim.size())); - - auto y_grad_name = framework::GradVarName("Y"); - if (ctx->HasOutput(y_grad_name)) { - ctx->SetOutputDim(y_grad_name, y_dim); - ctx->ShareLoD("Y", /*->*/ y_grad_name); - - for (int i = 0; i < y_dim.size(); ++i) { - if ((!ctx->IsRuntime()) && ((dout_dim[i] == -1) || (y_dim[i] == -1))) { - continue; - } else { - PADDLE_ENFORCE_GE( - dout_dim[i], - y_dim[i], - platform::errors::InvalidArgument( - "The size of each dimension of Input(Out@Grad) expected to " - "be greater than or equal to size of corresponding dimension " - "of Input(Y) (Out_dim[i] >= Y_dim[i]), but received %d < %d " - "for dimension %d", - dout_dim[i], - y_dim[i], - i)); - } - } - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "Y"), - ctx.device_context().GetPlace()); - } -}; - -template -class PadConstantLikeOpGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr bind) const override { - bind->SetType("pad_constant_like_grad"); - bind->SetInput("Y", this->Input("Y")); - bind->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - bind->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y")); - bind->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(pad_constant_like, - ops::PadConstantLikeOp, - ops::PadConstantLikeOpMaker, - ops::PadConstantLikeOpGradMaker, - ops::PadConstantLikeOpGradMaker); -REGISTER_OPERATOR(pad_constant_like_grad, ops::PadConstantLikeOpGrad); - -PD_REGISTER_STRUCT_KERNEL(pad_constant_like, - CPU, - ALL_LAYOUT, - ops::PadConstantLikeKernel, 
- float, - double, - int, - int64_t) {} -PD_REGISTER_STRUCT_KERNEL(pad_constant_like_grad, - CPU, - ALL_LAYOUT, - ops::PadConstantLikeGradKernel, - float, - double, - int, - int64_t) {} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_STRUCT_KERNEL(pad_constant_like, - GPU, - ALL_LAYOUT, - ops::PadConstantLikeKernel, - float, - double, - int, - int64_t) {} -PD_REGISTER_STRUCT_KERNEL(pad_constant_like_grad, - GPU, - ALL_LAYOUT, - ops::PadConstantLikeGradKernel, - float, - double, - int, - int64_t) {} -#endif diff --git a/paddle/fluid/operators/pad_constant_like_op.h b/paddle/fluid/operators/pad_constant_like_op.h deleted file mode 100644 index f6162037fbd56..0000000000000 --- a/paddle/fluid/operators/pad_constant_like_op.h +++ /dev/null @@ -1,101 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/phi/kernels/funcs/padding.h" - -namespace paddle { -namespace operators { - -template -class PadConstantLikeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto in_x = context.Input("X"); - auto in_y = context.Input("Y"); - auto* out = context.Output("Out"); - - if (in_x->dims() == in_y->dims()) { - framework::TensorCopy(*in_y, context.GetPlace(), out); - return; - } - - T pad_value = static_cast(context.Attr("pad_value")); - out->mutable_data(context.GetPlace()); - - int rank = context.Input("X")->dims().size(); - - std::vector pads(rank * 2, 0); - - for (int j = 0; j < rank; ++j) { - pads[j * 2] = 0; - pads[j * 2 + 1] = static_cast(in_x->dims()[j] - in_y->dims()[j]); - } - - phi::funcs::PaddingFunctor( - rank, - context.template device_context(), - pads, - pad_value, - *in_y, - out); - } -}; - -template -class PadConstantLikeGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto in_y = context.Input("Y"); - auto in_dout = - context.Input(framework::GradVarName("Out")); - auto* d_y = context.Output(framework::GradVarName("Y")); - - if (d_y == nullptr) { - return; - } - - if (in_dout->dims() == in_y->dims()) { - framework::TensorCopy(*in_dout, context.GetPlace(), d_y); - return; - } - - d_y->mutable_data(context.GetPlace()); - int rank = in_dout->dims().size(); - - std::vector pads(static_cast(rank) * 2, 0); - for (int j = 0; j < rank; ++j) { - pads[j * 2] = 0; - pads[j * 2 + 1] = static_cast(in_dout->dims()[j] - in_y->dims()[j]); - } - - phi::funcs::PaddingGradFunctor( - rank, - context.template device_context(), - pads, - *in_dout, - d_y); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/test/legacy_test/test_pad_constant_like.py b/test/legacy_test/test_pad_constant_like.py deleted file mode 100644 index 
e304bdf29e4c2..0000000000000 --- a/test/legacy_test/test_pad_constant_like.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest - - -class TestPadConstantLikeOp(OpTest): - def setUp(self): - self.initTestCase() - self.op_type = "pad_constant_like" - self.inputs = { - 'X': np.random.random(self.x_shape).astype("float64"), - 'Y': np.random.random(self.y_shape).astype("float64"), - } - self.attrs = {} - self.attrs['pad_value'] = self.pad_value - self.outputs = { - 'Out': np.pad( - self.inputs['Y'], - self.paddings, - mode='constant', - constant_values=self.pad_value, - ) - } - - def test_check_output(self): - self.check_output() - - def test_check_grad_normal(self): - self.check_grad(['Y'], 'Out') - - def initTestCase(self): - self.x_shape = (16, 40) - self.y_shape = (3, 40) - self.pad_value = 0.1 - self.paddings = [(0, 13), (0, 0)] - - -class TestCase1(TestPadConstantLikeOp): - def initTestCase(self): - self.x_shape = (4, 3, 4, 5) - self.y_shape = (2, 3, 4, 5) - self.paddings = [(0, 2), (0, 0), (0, 0), (0, 0)] - self.pad_value = 0.5 - - -class TestCase2(TestPadConstantLikeOp): - def initTestCase(self): - self.x_shape = (4, 3, 4, 10) - self.y_shape = (2, 3, 2, 10) - self.paddings = [(0, 2), (0, 0), (0, 2), (0, 0)] - self.pad_value = 0.5 - - -if __name__ == '__main__': - unittest.main() From 632fa8ca6c2f75bf2a85ffb0880fe115ac39d394 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Mon, 9 Oct 2023 18:32:10 +0800 Subject: [PATCH 35/62] [CleanOps]del one hot op (#57894) * del one hot op --- paddle/fluid/operators/one_hot_op.cc | 140 --------------------------- paddle/fluid/operators/one_hot_op.cu | 103 -------------------- paddle/fluid/operators/one_hot_op.h | 110 --------------------- test/xpu/test_one_hot_op_xpu.py | 131 ------------------------- 4 files changed, 484 deletions(-) delete mode 100644 paddle/fluid/operators/one_hot_op.cc delete mode 100644 paddle/fluid/operators/one_hot_op.cu delete mode 100644 paddle/fluid/operators/one_hot_op.h delete mode 100644 test/xpu/test_one_hot_op_xpu.py diff --git a/paddle/fluid/operators/one_hot_op.cc b/paddle/fluid/operators/one_hot_op.cc deleted file mode 100644 index ffb3081ca0ba9..0000000000000 --- a/paddle/fluid/operators/one_hot_op.cc +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
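For context on the deleted sources that follow: `one_hot` expanded each integer index in the input's last dimension into a one-hot vector of length `depth`, and `allow_out_of_range` traded an error on bad indices for an all-zero row. A NumPy sketch of those semantics (illustrative only, not the removed kernel):

    import numpy as np

    def one_hot(x, depth, allow_out_of_range=False):
        # x: integer array of shape (..., 1); result has shape (..., depth)
        xs = np.asarray(x)
        flat = xs.reshape(-1)
        out = np.zeros((flat.size, depth), dtype=np.float32)
        for i, v in enumerate(flat):
            if 0 <= v < depth:
                out[i, v] = 1.0
            elif not allow_out_of_range:
                raise ValueError(f"index {v} out of range [0, {depth})")
        return out.reshape(xs.shape[:-1] + (depth,))

    # Reproduces the DOC example below: indices [1, 1, 3, 0], depth = 4.
    print(one_hot([[1], [1], [3], [0]], depth=4))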
- -#include "paddle/fluid/operators/one_hot_op.h" - -#include -#include - -namespace paddle { -namespace operators { - -class OneHotOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "OneHot"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "OneHot"); - - auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_GE(x_dims.size(), - 2, - platform::errors::InvalidArgument( - "Input(input) rank should be at least 2, " - "but received input rank (%d) less than 2", - x_dims.size())); - - if (ctx->IsRuntime() || x_dims[x_dims.size() - 1] > 0) { - PADDLE_ENFORCE_GE(x_dims[x_dims.size() - 1], - 1U, - platform::errors::InvalidArgument( - "Last dimension of Input(input) should be 1, " - "but received input Last dimension(%d) != 1", - x_dims[x_dims.size() - 1])); - } - - framework::DDim out_dims(x_dims); - int depth = ctx->Attrs().Get("depth"); - if (ctx->HasInput("depth_tensor")) { - depth = -1; - } - - out_dims[out_dims.size() - 1] = depth; - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /* --> */ "Out"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } - - phi::KernelKey GetKernelTypeForVar( - const std::string& var_name, - const phi::DenseTensor& tensor, - const phi::KernelKey& expected_kernel_type) const override { - if (var_name == "depth_tensor") { - return phi::KernelKey(phi::Backend::ALL_BACKEND, - expected_kernel_type.layout(), - expected_kernel_type.dtype()); - } - return phi::KernelKey( - tensor.place(), tensor.layout(), expected_kernel_type.dtype()); - } -}; - -class OneHotOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(phi::DenseTensor, phi::DenseTensor) Input variable with " - "rank at least 2. " - "The last dimension of X should be 1. Each value of X is an index " - "to indicate the position."); - AddInput("depth_tensor", "(Tensor, Tensor), Length of one-hot vector") - .AsDispensable(); - AddOutput("Out", - "(Tensor, Tensor) Output tensor with same rank as X. " - "The tensor consists of one-hot representations of values in X."); - - AddAttr("depth", - "A positive integer to specify the length of one-hot vector.") - .SetDefault(-1); - AddAttr("dtype", - "An integer to specify the data type of one-hot " - "vector. The default value is FP32.") - .SetDefault(paddle::framework::proto::VarType::FP32); - AddAttr("allow_out_of_range", - "If it is set true and the input data is out of range, " - "the output tensor will be filled zeros. The default value " - "is false.") - .SetDefault(false); - AddComment(R"DOC( -One Hot Operator. This operator creates the one-hot representations for input -index values. 
The following example will help to explain the function of this -operator: - -X is a LoDTensor: - X.lod = [[0, 1, 4]] - X.shape = [4, 1] - X.data = [[1], [1], [3], [0]] - -set depth = 4 - -Out is a LoDTensor: - Out.lod = [[0, 1, 4]] - Out.shape = [4, 4] - Out.data = [[0., 1., 0., 0.], - [0., 1., 0., 0.], - [0., 0., 0., 1.], - [1., 0., 0., 0.]] -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - one_hot, - ops::OneHotOp, - ops::OneHotOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(one_hot, - ops::OneHotKernel, - ops::OneHotKernel); diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu deleted file mode 100644 index 917fa857e0778..0000000000000 --- a/paddle/fluid/operators/one_hot_op.cu +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/one_hot_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/phi/backends/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { -using phi::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void FillOutputKernel(const InT* p_in_data, - OutT* p_out_data, - const int64_t numel, - const int depth) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < numel && p_in_data[idx] >= 0 && p_in_data[idx] < depth) { - *(p_out_data + (idx * depth) + p_in_data[idx]) = 1.0; - } -} - -template -struct OneHotOpCUDAFunctor { - const phi::DenseTensor* in_; - phi::DenseTensor* out_; - const DeviceContext& ctx_; - int depth_; - - OneHotOpCUDAFunctor(const phi::DenseTensor* in, - phi::DenseTensor* out, - int depth, - const DeviceContext& ctx) - : in_(in), out_(out), depth_(depth), ctx_(ctx) {} - - template - void apply() const { - auto* p_in_data = in_->data(); - auto numel = in_->numel(); - auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); - auto stream = ctx_.stream(); - phi::funcs::set_constant(ctx_, out_, 0.0); - - FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, - 0, - stream>>>(p_in_data, p_out_data, numel, depth_); - } -}; - -template -class OneHotCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - - int depth = -1; - if (context.HasInput("depth_tensor")) { - auto* depth_tensor = context.Input("depth_tensor"); - if (platform::is_gpu_place(depth_tensor->place())) { - phi::DenseTensor temp; - paddle::framework::TensorCopySync( - *depth_tensor, platform::CPUPlace(), &temp); - depth = *temp.data(); - } else { - depth = *depth_tensor->data(); - } - - auto in_dims = in->dims(); - framework::DDim out_dims(in_dims); - out_dims[out_dims.size() - 1] = depth; - out->Resize(out_dims); - } else { - depth = 
context.Attr("depth"); - } - framework::VisitDataType( - static_cast( - context.Attr("dtype")), - OneHotOpCUDAFunctor( - in, out, depth, context.template device_context())); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(one_hot, - ops::OneHotCUDAKernel, - ops::OneHotCUDAKernel); diff --git a/paddle/fluid/operators/one_hot_op.h b/paddle/fluid/operators/one_hot_op.h deleted file mode 100644 index 41ec3eb9a135f..0000000000000 --- a/paddle/fluid/operators/one_hot_op.h +++ /dev/null @@ -1,110 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -struct OneHotOpFunctor { - const phi::DenseTensor* in_; - phi::DenseTensor* out_; - int depth_; - const DeviceContext& ctx_; - bool allow_out_of_range_; - - OneHotOpFunctor(const phi::DenseTensor* in, - phi::DenseTensor* out, - int depth, - const DeviceContext& ctx, - bool allow_out_of_range = false) - : in_(in), - out_(out), - depth_(depth), - ctx_(ctx), - allow_out_of_range_(allow_out_of_range) {} - - template - void apply() const { - auto* p_in_data = in_->data(); - auto numel = in_->numel(); - auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); - phi::funcs::set_constant(ctx_, out_, 0.0); - - if (allow_out_of_range_) { - for (int i = 0; i < numel; ++i) { - if (p_in_data[i] >= 0 && p_in_data[i] < depth_) { - *(p_out_data + i * depth_ + p_in_data[i]) = 1.0; - } - } - } else { - for (int i = 0; i < numel; ++i) { - PADDLE_ENFORCE_GE( - p_in_data[i], - 0, - platform::errors::InvalidArgument( - "Illegal index value, Input(input) value should be at least 0, " - "but received input (%d) less than 0", - p_in_data[i])); - PADDLE_ENFORCE_LT( - p_in_data[i], - depth_, - platform::errors::InvalidArgument( - "Illegal index value, Input(input) value should be less than " - "Input(depth), " - "but received input (%d) not less than depth (%d)", - p_in_data[i], - depth_)); - - *(p_out_data + i * depth_ + p_in_data[i]) = 1.0; - } - } - } -}; - -template -class OneHotKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - int depth = context.Attr("depth"); - bool allow_out_of_range = context.Attr("allow_out_of_range"); - if (context.HasInput("depth_tensor")) { - auto* depth_tensor = context.Input("depth_tensor"); - auto* depth_data = depth_tensor->data(); - depth = depth_data[0]; - auto in_dims = in->dims(); - framework::DDim out_dims(in_dims); - out_dims[out_dims.size() - 1] = depth; - out->Resize(out_dims); - } - - framework::VisitDataType( - static_cast( - context.Attr("dtype")), - OneHotOpFunctor( - in, - out, - depth, - context.template device_context(), - allow_out_of_range)); - } -}; - -} // 
namespace operators -} // namespace paddle diff --git a/test/xpu/test_one_hot_op_xpu.py b/test/xpu/test_one_hot_op_xpu.py deleted file mode 100644 index 9536a8202919b..0000000000000 --- a/test/xpu/test_one_hot_op_xpu.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from get_test_cover_info import ( - XPUOpTestWrapper, - create_test_class, - get_xpu_op_support_types, -) -from op_test_xpu import XPUOpTest - -import paddle -from paddle.base import core - -paddle.enable_static() - - -class XPUTestOneHotOP(XPUOpTestWrapper): - def __init__(self): - self.op_name = 'one_hot' - self.use_dynamic_create_class = False - - class TestXPUOneHotOP(XPUOpTest): - def setUp(self): - self.place = paddle.XPUPlace(0) - self.init_dtype() - self.op_type = 'one_hot' - - self.set_data() - self.set_input() - - def set_data(self): - self.depth = 10 - self.depth_np = np.array(10).astype('int32') - self.x_lod = [[4, 1, 3, 3]] - self.x = [ - np.random.randint(0, self.depth - 1) - for i in range(sum(self.x_lod[0])) - ] - self.x = ( - np.array(self.x) - .astype(self.dtype) - .reshape([sum(self.x_lod[0]), 1]) - ) - - self.out = np.zeros( - shape=(np.prod(self.x.shape[:-1]), self.depth) - ).astype('float32') - for i in range(np.prod(self.x.shape)): - self.out[i, self.x[i]] = 1.0 - - self.outputs = {'Out': (self.out, self.x_lod)} - - def set_input(self): - self.inputs = { - 'X': (self.x, self.x_lod), - 'depth_tensor': self.depth_np, - } - self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)} - - def test_check_output(self): - self.check_output(check_dygraph=False) - - def init_dtype(self): - self.dtype = self.in_type - - class TestXPUOneHotOP_attr(TestXPUOneHotOP): - def set_input(self): - self.inputs = {'X': (self.x, self.x_lod)} - self.attrs = { - 'dtype': int(core.VarDesc.VarType.FP32), - 'depth': self.depth, - } - - class TestXPUOneHotOP_default_dtype(TestXPUOneHotOP): - def set_input(self): - self.inputs = { - 'X': (self.x, self.x_lod), - 'depth_tensor': self.depth_np, - } - self.attrs = {} - - class TestXPUOneHotOP_default_dtype_attr(TestXPUOneHotOP): - def set_input(self): - self.inputs = {'X': (self.x, self.x_lod)} - self.attrs = {'depth': self.depth} - - class TestXPUOneHotOP_out_of_range(TestXPUOneHotOP): - def set_data(self): - self.depth = 10 - self.x_lod = [[4, 1, 3, 3]] - self.x = [ - np.random.choice([-1, self.depth]) - for i in range(sum(self.x_lod[0])) - ] - self.x = ( - np.array(self.x) - .astype(self.dtype) - .reshape([sum(self.x_lod[0]), 1]) - ) - - self.out = np.zeros( - shape=(np.prod(self.x.shape[:-1]), self.depth) - ).astype('float32') - - self.outputs = {'Out': (self.out, self.x_lod)} - - def set_input(self): - self.inputs = {'X': (self.x, self.x_lod)} - self.attrs = {'depth': self.depth, 'allow_out_of_range': True} - - -support_types = get_xpu_op_support_types('one_hot') -for stype in support_types: - create_test_class(globals(), XPUTestOneHotOP, stype) - -if 
__name__ == "__main__": - unittest.main() From efaadbbfe767ae001edc0af6c5ae307a96e4f3cd Mon Sep 17 00:00:00 2001 From: RedContritio Date: Mon, 9 Oct 2023 18:33:31 +0800 Subject: [PATCH 36/62] [CodeStyle][task 33] enable Ruff PLC0414 rule in python/paddle/base (#57876) --- pyproject.toml | 1 - python/paddle/base/backward.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4ef1904e0c70f..4ed2255b36ca8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -108,7 +108,6 @@ ignore = [ "B019", # Confirmation required "C416", "F821", - "PLC0414", ] # B017 diff --git a/python/paddle/base/backward.py b/python/paddle/base/backward.py index 82df59271bef1..debea42cdc562 100755 --- a/python/paddle/base/backward.py +++ b/python/paddle/base/backward.py @@ -20,8 +20,7 @@ from collections.abc import Sequence import paddle.base -from paddle.base import framework as framework -from paddle.base import program_guard +from paddle.base import framework, program_guard from . import core, log_helper, unique_name from .data_feeder import check_type From 89b3863a9c23fef3a0d27ffbca814e52918f9c18 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Mon, 9 Oct 2023 18:35:02 +0800 Subject: [PATCH 37/62] fix full_with_tensor choose kernel (#57919) --- paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc index c322f71893ff7..2eaed09881907 100644 --- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc +++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc @@ -515,6 +515,16 @@ phi::KernelKey GetKernelKey( op->result(0).type().dyn_cast().dtype())}; } + if (op->isa()) { + auto backend = paddle::experimental::ParseBackend(place); + auto dtype = op->attributes() + .at("dtype") + .dyn_cast() + .data(); + + return {backend, phi::DataLayout::ANY, dtype}; + } + phi::Backend kernel_backend = phi::Backend::UNDEFINED; phi::DataLayout kernel_layout = phi::DataLayout::UNDEFINED; phi::DataType kernel_data_type = phi::DataType::UNDEFINED; From d0fd7ffca34a4581e27ee5aba82440659a39fdb5 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Mon, 9 Oct 2023 18:41:43 +0800 Subject: [PATCH 38/62] del_gaussian_random_batch_size_like_op (#57925) --- .../gaussian_random_batch_size_like_op.cc | 108 ------------------ .../gaussian_random_batch_size_like_op.cu | 90 --------------- python/paddle/static/amp/fp16_lists.py | 1 - 3 files changed, 199 deletions(-) delete mode 100644 paddle/fluid/operators/gaussian_random_batch_size_like_op.cc delete mode 100644 paddle/fluid/operators/gaussian_random_batch_size_like_op.cu diff --git a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc deleted file mode 100644 index c792532e58f79..0000000000000 --- a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc +++ /dev/null @@ -1,108 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/batch_size_like.h" - -namespace paddle { -namespace operators { - -template -class CPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - float mean = context.Attr("mean"); - float std = context.Attr("std"); - auto* tensor = context.Output("Out"); - T* data = tensor->mutable_data(context.GetPlace()); - - unsigned int seed = static_cast(context.Attr("seed")); - std::minstd_rand engine; - if (seed == 0) { - seed = std::random_device()(); - } - engine.seed(seed); - std::normal_distribution dist(mean, std); - int64_t size = tensor->numel(); - for (int64_t i = 0; i < size; ++i) { - data[i] = dist(engine); - } - } -}; - -class GaussianRandomBatchSizeLikeOp : public BatchSizeLikeOp { - protected: - using BatchSizeLikeOp::BatchSizeLikeOp; - - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey( - static_cast(ctx.Attr("dtype")), - ctx.GetPlace()); - } -}; - -class GaussianRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker { - protected: - void Apply() override { - AddAttr("mean", - "(float, default 0.0) " - "The mean (or center) of the gaussian distribution.") - .SetDefault(.0f); - AddAttr("std", - "(float, default 1.0) " - "The standard deviation (std, or spread) of the " - "gaussian distribution.") - .SetDefault(1.0f); - AddAttr("seed", - "(int, default 0) " - "Random seed of generator." - "0 means don't specify random seed." - "Note that if seed is not 0, this operator will always " - "generate the same random numbers every time.") - .SetDefault(0); - AddAttr("dtype", - "(int, default 5(FP32)) " - "Output data type.") - .SetDefault(framework::proto::VarType::FP32); - - AddComment(R"DOC( - -Used to initialize tensors with gaussian random generator. -The default mean of the distribution is 0, and default standard -deviation (std) of the distribution is 1.0. Uers can set mean and std -via input arguments. -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR( - gaussian_random_batch_size_like, - paddle::operators::GaussianRandomBatchSizeLikeOp, - paddle::operators::GaussianRandomBatchSizeLikeOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::operators::BatchSizeLikeNoNeedBufferVarsInferer); - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL(gaussian_random_batch_size_like, - CPU, - ALL_LAYOUT, - ops::CPUGaussianRandomBatchSizeLikeKernel, - float, - double) {} diff --git a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cu b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cu deleted file mode 100644 index 9c5244976fc9d..0000000000000 --- a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cu +++ /dev/null @@ -1,90 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/core/generator.h" -#include "paddle/phi/kernels/funcs/index_impl.cu.h" - -namespace paddle { -namespace operators { - -template -struct GaussianGenerator { - T mean_, std_; - unsigned int seed_; - unsigned int offset_ = 0; - - __host__ __device__ GaussianGenerator(T mean, T std, int seed) - : mean_(mean), std_(std), seed_(seed) {} - - __host__ __device__ GaussianGenerator(T mean, T std, int seed, int offset) - : mean_(mean), std_(std), seed_(seed), offset_(offset) {} - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - using MT = typename phi::dtype::MPTypeTrait::Type; - thrust::normal_distribution dist(static_cast(mean_), - static_cast(std_)); - unsigned int new_n = n + offset_; - rng.discard(new_n); - MT out = dist(rng); - return static_cast(out); - } -}; - -template -class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* tensor = context.Output("Out"); - T* data = tensor->mutable_data(context.GetPlace()); - unsigned int seed = static_cast(context.Attr("seed")); - T mean = static_cast(context.Attr("mean")); - T std = static_cast(context.Attr("std")); - int64_t size = tensor->numel(); - - int device_id = context.GetPlace().GetDeviceId(); - auto gen_cuda = phi::DefaultCUDAGenerator(device_id); - auto& dev_cxt = context.template device_context(); - - if (seed == 0) { - // use global Generator seed - auto seed_offset = gen_cuda->IncrementOffset(1); - uint64_t seed = seed_offset.first; - uint64_t offset = seed_offset.second; - auto func = GaussianGenerator(mean, std, seed, size * offset); - phi::IndexKernel>(dev_cxt, tensor, func); - } else { - auto func = GaussianGenerator(mean, std, seed); - phi::IndexKernel>(dev_cxt, tensor, func); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -PD_REGISTER_STRUCT_KERNEL(gaussian_random_batch_size_like, - GPU, - ALL_LAYOUT, - ops::GPUGaussianRandomBatchSizeLikeKernel, - float, - double, - plat::float16) {} diff --git a/python/paddle/static/amp/fp16_lists.py b/python/paddle/static/amp/fp16_lists.py index 3023628e9a389..06630039ca877 100644 --- a/python/paddle/static/amp/fp16_lists.py +++ b/python/paddle/static/amp/fp16_lists.py @@ -253,7 +253,6 @@ def _update_list(self): 'uniform_random', 'uniform_random_batch_size_like', 'gaussian_random', - 'gaussian_random_batch_size_like', 'slice', 'rank', 'scale', From 34628322af26080e1537fbded59b19b229f9b37f Mon Sep 17 00:00:00 2001 From: gouzil <66515297+gouzil@users.noreply.github.com> Date: Mon, 9 Oct 2023 18:43:10 +0800 Subject: [PATCH 39/62] [codestyle][ruff] enable `PGH004` (#57941) * [codestyle] enable PGH004 * Update pyproject.toml Co-authored-by: Nyakku Shigure --------- Co-authored-by: Nyakku Shigure --- pyproject.toml | 3 ++ .../distributed/fleet/utils/__init__.py | 2 +- 
python/paddle/distribution/__init__.py | 2 +- python/paddle/distribution/transform.py | 2 +- python/paddle/incubate/asp/__init__.py | 2 +- python/paddle/incubate/autograd/__init__.py | 2 +- python/paddle/incubate/nn/__init__.py | 2 +- python/paddle/inference/__init__.py | 2 +- python/paddle/io/__init__.py | 2 +- python/paddle/jit/__init__.py | 2 +- python/paddle/metric/__init__.py | 2 +- python/paddle/nn/__init__.py | 2 +- python/paddle/nn/functional/__init__.py | 2 +- python/paddle/nn/initializer/__init__.py | 2 +- python/paddle/nn/layer/layers.py | 2 +- python/paddle/nn/utils/__init__.py | 2 +- python/paddle/optimizer/__init__.py | 2 +- python/paddle/optimizer/lr.py | 2 +- python/paddle/static/__init__.py | 2 +- python/paddle/static/nn/__init__.py | 2 +- python/paddle/tensor/__init__.py | 2 +- python/paddle/text/__init__.py | 2 +- python/paddle/utils/__init__.py | 2 +- python/paddle/utils/cpp_extension/__init__.py | 2 +- python/paddle/utils/unique_name.py | 2 +- python/paddle/vision/__init__.py | 2 +- python/paddle/vision/datasets/__init__.py | 2 +- python/paddle/vision/models/__init__.py | 2 +- python/paddle/vision/ops.py | 2 +- python/paddle/vision/transforms/__init__.py | 2 +- .../test_autograd_functional_dynamic.py | 46 +++++++++---------- 31 files changed, 55 insertions(+), 52 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4ed2255b36ca8..f50f5a363b2c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,6 +72,9 @@ select = [ "PLR1711", "PLR1722", "PLW3301", + + # Pygrep-hooks + "PGH004", ] unfixable = [ "NPY001" diff --git a/python/paddle/distributed/fleet/utils/__init__.py b/python/paddle/distributed/fleet/utils/__init__.py index 0ad0d6256ab88..67665dc10c398 100644 --- a/python/paddle/distributed/fleet/utils/__init__.py +++ b/python/paddle/distributed/fleet/utils/__init__.py @@ -26,7 +26,7 @@ from . 
import sequence_parallel_utils -__all__ = ["LocalFS", "recompute", "DistributedInfer", "HDFSClient"] # noqa +__all__ = ["LocalFS", "recompute", "DistributedInfer", "HDFSClient"] def recompute(function, *args, **kwargs): diff --git a/python/paddle/distribution/__init__.py b/python/paddle/distribution/__init__.py index 82a2e6ce87a05..68f4820da994d 100644 --- a/python/paddle/distribution/__init__.py +++ b/python/paddle/distribution/__init__.py @@ -32,7 +32,7 @@ from paddle.distribution.laplace import Laplace from paddle.distribution.geometric import Geometric -__all__ = [ # noqa +__all__ = [ 'Bernoulli', 'Beta', 'Categorical', diff --git a/python/paddle/distribution/transform.py b/python/paddle/distribution/transform.py index 92313c9bec58a..39e98a910499b 100644 --- a/python/paddle/distribution/transform.py +++ b/python/paddle/distribution/transform.py @@ -25,7 +25,7 @@ variable, ) -__all__ = [ # noqa +__all__ = [ 'Transform', 'AbsTransform', 'AffineTransform', diff --git a/python/paddle/incubate/asp/__init__.py b/python/paddle/incubate/asp/__init__.py index 9e6af7e94c139..e69d13afa2739 100644 --- a/python/paddle/incubate/asp/__init__.py +++ b/python/paddle/incubate/asp/__init__.py @@ -34,7 +34,7 @@ from .supported_layer_list import add_supported_layer # noqa: F401 -__all__ = [ # noqa +__all__ = [ 'calculate_density', 'decorate', 'prune_model', diff --git a/python/paddle/incubate/autograd/__init__.py b/python/paddle/incubate/autograd/__init__.py index d9b9e41781917..41f5387864112 100644 --- a/python/paddle/incubate/autograd/__init__.py +++ b/python/paddle/incubate/autograd/__init__.py @@ -16,7 +16,7 @@ from .primx import prim2orig from .utils import disable_prim, enable_prim, prim_enabled -__all__ = [ # noqa +__all__ = [ 'vjp', 'jvp', 'Jacobian', diff --git a/python/paddle/incubate/nn/__init__.py b/python/paddle/incubate/nn/__init__.py index c663d6248feb0..d8dcbf99c4ad4 100644 --- a/python/paddle/incubate/nn/__init__.py +++ b/python/paddle/incubate/nn/__init__.py @@ -24,7 +24,7 @@ from .layer.fused_dropout_add import FusedDropoutAdd # noqa: F401 from .layer.fused_dropout_nd import FusedDropout # noqa: F401 -__all__ = [ # noqa +__all__ = [ 'FusedMultiHeadAttention', 'FusedFeedForward', 'FusedTransformerEncoderLayer', diff --git a/python/paddle/inference/__init__.py b/python/paddle/inference/__init__.py index f59c5990573db..7d0f3e4ff4af0 100644 --- a/python/paddle/inference/__init__.py +++ b/python/paddle/inference/__init__.py @@ -33,7 +33,7 @@ XpuConfig, ) -__all__ = [ # noqa +__all__ = [ 'Config', 'DataType', 'PlaceType', diff --git a/python/paddle/io/__init__.py b/python/paddle/io/__init__.py index 6c2e0dae67834..bffa1f957c297 100755 --- a/python/paddle/io/__init__.py +++ b/python/paddle/io/__init__.py @@ -30,7 +30,7 @@ from .dataloader import Subset # noqa: F401 from .dataloader import random_split # noqa: F401 -__all__ = [ # noqa +__all__ = [ 'Dataset', 'IterableDataset', 'TensorDataset', diff --git a/python/paddle/jit/__init__.py b/python/paddle/jit/__init__.py index f508f72478b00..37b1203e68e63 100644 --- a/python/paddle/jit/__init__.py +++ b/python/paddle/jit/__init__.py @@ -23,7 +23,7 @@ from .dy2static.logging_utils import set_code_level, set_verbosity from .translated_layer import TranslatedLayer -__all__ = [ # noqa +__all__ = [ 'save', 'load', 'to_static', diff --git a/python/paddle/metric/__init__.py b/python/paddle/metric/__init__.py index 60dff58ec48bf..c18c2bee4f5ba 100644 --- a/python/paddle/metric/__init__.py +++ b/python/paddle/metric/__init__.py @@ -19,7 +19,7 @@ from 
.metrics import Auc # noqa: F401 from .metrics import accuracy # noqa: F401 -__all__ = [ # noqa +__all__ = [ 'Metric', 'Accuracy', 'Precision', diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index dbef7079c1bf3..1ef27639abd13 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -207,7 +207,7 @@ def weight_norm(*args): return utils.weight_norm(*args) -__all__ = [ # noqa +__all__ = [ 'BatchNorm', 'CELU', 'GroupNorm', diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 87f2eabba1f59..608587becd952 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -140,7 +140,7 @@ from .flash_attention import scaled_dot_product_attention from .flash_attention import sdp_kernel -__all__ = [ # noqa +__all__ = [ 'celu', 'conv1d', 'conv1d_transpose', diff --git a/python/paddle/nn/initializer/__init__.py b/python/paddle/nn/initializer/__init__.py index adc81e5bbfd5d..c1e0866ad8f06 100644 --- a/python/paddle/nn/initializer/__init__.py +++ b/python/paddle/nn/initializer/__init__.py @@ -45,7 +45,7 @@ from .kaiming import MSRAInitializer # noqa: F401 from .assign import NumpyArrayInitializer # noqa: F401 -__all__ = [ # noqa +__all__ = [ 'Bilinear', 'Constant', 'KaimingUniform', diff --git a/python/paddle/nn/layer/layers.py b/python/paddle/nn/layer/layers.py index 204023378b5d3..791b5549ee7a2 100644 --- a/python/paddle/nn/layer/layers.py +++ b/python/paddle/nn/layer/layers.py @@ -26,7 +26,7 @@ from paddle.base import core, framework, unique_name from paddle.base.core import VarDesc from paddle.base.dygraph import no_grad -from paddle.base.dygraph.base import in_declarative_mode # noqa F401 +from paddle.base.dygraph.base import in_declarative_mode # noqa: F401 from paddle.base.dygraph.base import ( _convert_into_variable, in_to_static_mode, diff --git a/python/paddle/nn/utils/__init__.py b/python/paddle/nn/utils/__init__.py index d1645deb905a9..2d255055d8cf5 100644 --- a/python/paddle/nn/utils/__init__.py +++ b/python/paddle/nn/utils/__init__.py @@ -22,7 +22,7 @@ from .clip_grad_norm_ import clip_grad_norm_ # noqa: F401 from .clip_grad_value_ import clip_grad_value_ # noqa: F401 -__all__ = [ # noqa +__all__ = [ 'weight_norm', 'remove_weight_norm', 'spectral_norm', diff --git a/python/paddle/optimizer/__init__.py b/python/paddle/optimizer/__init__.py index 7d9737dc7da1f..af86573905273 100644 --- a/python/paddle/optimizer/__init__.py +++ b/python/paddle/optimizer/__init__.py @@ -25,7 +25,7 @@ from .lbfgs import LBFGS # noqa: F401 from . 
import lr # noqa: F401 -__all__ = [ # noqa +__all__ = [ 'Optimizer', 'Adagrad', 'Adam', diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index 6fb777447f8a1..0d24286fb40cd 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -28,7 +28,7 @@ ) from paddle.base.layer_helper import LayerHelper -__all__ = [ # noqa +__all__ = [ 'LRScheduler', 'NoamDecay', 'PiecewiseDecay', diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py index 5cf44a3efc7c0..57c4abec6d8d0 100644 --- a/python/paddle/static/__init__.py +++ b/python/paddle/static/__init__.py @@ -79,7 +79,7 @@ from ..base.framework import program_guard # noqa: F401 from ..base.framework import Program # noqa: F401 -__all__ = [ # noqa +__all__ = [ 'append_backward', 'gradients', 'Executor', diff --git a/python/paddle/static/nn/__init__.py b/python/paddle/static/nn/__init__.py index d144f87ec32cb..f3693e1501c40 100755 --- a/python/paddle/static/nn/__init__.py +++ b/python/paddle/static/nn/__init__.py @@ -60,7 +60,7 @@ from .control_flow import cond from .static_pylayer import static_pylayer -__all__ = [ # noqa +__all__ = [ 'fc', 'batch_norm', 'bilinear_tensor_product', diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index c61bd9a361cdc..e16ef89ce8a47 100644 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -381,7 +381,7 @@ from ..signal import stft # noqa: F401 # this list used in math_op_patch.py for _binary_creator_ -tensor_method_func = [ # noqa +tensor_method_func = [ 'create_parameter', 'create_tensor', 'matmul', diff --git a/python/paddle/text/__init__.py b/python/paddle/text/__init__.py index fbfa0c3fe2e02..378ed13431d86 100644 --- a/python/paddle/text/__init__.py +++ b/python/paddle/text/__init__.py @@ -21,7 +21,7 @@ from .datasets import WMT14 # noqa: F401 from .datasets import WMT16 # noqa: F401 -__all__ = [ # noqa +__all__ = [ 'Conll05st', 'Imdb', 'Imikolov', diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py index 630af97f280f5..75057ed8accdb 100644 --- a/python/paddle/utils/__init__.py +++ b/python/paddle/utils/__init__.py @@ -53,4 +53,4 @@ from .layers_utils import _contain_var # noqa: F401 from .layers_utils import _convert_to_tensor_list # noqa: F401 -__all__ = ['deprecated', 'run_check', 'require_version', 'try_import'] # noqa +__all__ = ['deprecated', 'run_check', 'require_version', 'try_import'] diff --git a/python/paddle/utils/cpp_extension/__init__.py b/python/paddle/utils/cpp_extension/__init__.py index 9ad431f00a65f..96d55bea663c5 100644 --- a/python/paddle/utils/cpp_extension/__init__.py +++ b/python/paddle/utils/cpp_extension/__init__.py @@ -22,7 +22,7 @@ from .extension_utils import get_build_directory # noqa: F401 from .extension_utils import load_op_meta_info_and_register_op # noqa: F401 -__all__ = [ # noqa +__all__ = [ 'CppExtension', 'CUDAExtension', 'load', diff --git a/python/paddle/utils/unique_name.py b/python/paddle/utils/unique_name.py index bfd26da255fa1..f34109b18ec0b 100644 --- a/python/paddle/utils/unique_name.py +++ b/python/paddle/utils/unique_name.py @@ -17,4 +17,4 @@ from ..base.unique_name import guard # noqa: F401 from ..base.unique_name import switch # noqa: F401 -__all__ = ['generate', 'switch', 'guard'] # noqa +__all__ = ['generate', 'switch', 'guard'] diff --git a/python/paddle/vision/__init__.py b/python/paddle/vision/__init__.py index a2a782c03599b..cc70de710bf90 100644 --- a/python/paddle/vision/__init__.py +++ 
b/python/paddle/vision/__init__.py @@ -112,4 +112,4 @@ from .transforms import adjust_hue # noqa: F401 from .transforms import normalize # noqa: F401 -__all__ = ['set_image_backend', 'get_image_backend', 'image_load'] # noqa +__all__ = ['set_image_backend', 'get_image_backend', 'image_load'] diff --git a/python/paddle/vision/datasets/__init__.py b/python/paddle/vision/datasets/__init__.py index 970c8cfcae86a..a7464275eb671 100644 --- a/python/paddle/vision/datasets/__init__.py +++ b/python/paddle/vision/datasets/__init__.py @@ -21,7 +21,7 @@ from .cifar import Cifar100 # noqa: F401 from .voc2012 import VOC2012 # noqa: F401 -__all__ = [ # noqa +__all__ = [ 'DatasetFolder', 'ImageFolder', 'MNIST', diff --git a/python/paddle/vision/models/__init__.py b/python/paddle/vision/models/__init__.py index 08f559bd440c9..bf9fa0bec0288 100644 --- a/python/paddle/vision/models/__init__.py +++ b/python/paddle/vision/models/__init__.py @@ -64,7 +64,7 @@ from .shufflenetv2 import shufflenet_v2_x2_0 # noqa: F401 from .shufflenetv2 import shufflenet_v2_swish # noqa: F401 -__all__ = [ # noqa +__all__ = [ 'ResNet', 'resnet18', 'resnet34', diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index d38f81a57ede9..5a8b433cea52e 100755 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -26,7 +26,7 @@ from ..nn import BatchNorm2D, Conv2D, Layer, ReLU, Sequential from ..nn.initializer import Normal -__all__ = [ # noqa +__all__ = [ 'yolo_loss', 'yolo_box', 'prior_box', diff --git a/python/paddle/vision/transforms/__init__.py b/python/paddle/vision/transforms/__init__.py index 890e4b8982714..3e2d39c5a88f5 100644 --- a/python/paddle/vision/transforms/__init__.py +++ b/python/paddle/vision/transforms/__init__.py @@ -51,7 +51,7 @@ from .functional import normalize # noqa: F401 from .functional import erase # noqa: F401 -__all__ = [ # noqa +__all__ = [ 'BaseTransform', 'Compose', 'Resize', diff --git a/test/autograd/test_autograd_functional_dynamic.py b/test/autograd/test_autograd_functional_dynamic.py index 02c4e61748d0a..f46e7a35c10d8 100644 --- a/test/autograd/test_autograd_functional_dynamic.py +++ b/test/autograd/test_autograd_functional_dynamic.py @@ -145,9 +145,9 @@ def check_results(self, ref, res): class TestVJP(TestAutogradFunctional): def func_vjp_i1o1(self): test_cases = [ - [reduce, 'A'], # noqa - [reduce_dim, 'A'], # noqa - ] # noqa + [reduce, 'A'], + [reduce_dim, 'A'], + ] for f, inputs in test_cases: vjp, grad = self.gen_test_pairs(f, inputs) vjp_result, grad_result = vjp(), grad() @@ -155,9 +155,9 @@ def func_vjp_i1o1(self): def func_vjp_i2o1(self): test_cases = [ - [matmul, ['A', 'B']], # noqa - [mul, ['b', 'c']], # noqa - ] # noqa + [matmul, ['A', 'B']], + [mul, ['b', 'c']], + ] for f, inputs in test_cases: vjp, grad = self.gen_test_pairs(f, inputs) vjp_result, grad_result = vjp(), grad() @@ -165,8 +165,8 @@ def func_vjp_i2o1(self): def func_vjp_i2o2(self): test_cases = [ - [o2, ['A', 'A']], # noqa - ] # noqa + [o2, ['A', 'A']], + ] for f, inputs in test_cases: inputs = self.gen_inputs(inputs) v = make_v(f, inputs) @@ -176,8 +176,8 @@ def func_vjp_i2o2(self): def func_vjp_i2o2_omitting_v(self): test_cases = [ - [o2, ['A', 'A']], # noqa - ] # noqa + [o2, ['A', 'A']], + ] for f, inputs in test_cases: inputs = self.gen_inputs(inputs) vjp, grad = self.gen_test_pairs(f, inputs) @@ -187,7 +187,7 @@ def func_vjp_i2o2_omitting_v(self): def func_vjp_nested(self): x = self.gen_input('a') test_cases = [ - [nested(x), 'a'], # noqa + [nested(x), 'a'], ] for f, inputs in 
test_cases: vjp, grad = self.gen_test_pairs(f, inputs) @@ -274,9 +274,9 @@ def jac(grad_fn, f, inputs): class TestJVP(TestAutogradFunctional): def func_jvp_i1o1(self): test_cases = [ - [reduce, 'A'], # noqa - [reduce_dim, 'A'], # noqa - ] # noqa + [reduce, 'A'], + [reduce_dim, 'A'], + ] for f, inputs in test_cases: inputs = self.gen_inputs(inputs) forward_jac = jac(paddle.incubate.autograd.jvp, f, inputs) @@ -284,9 +284,9 @@ def func_jvp_i1o1(self): self.check_results(forward_jac, reverse_jac) def func_jvp_i2o1(self): - test_cases = [ # noqa - [matmul, ['A', 'B']], # noqa - ] # noqa + test_cases = [ + [matmul, ['A', 'B']], + ] for f, inputs in test_cases: inputs = self.gen_inputs(inputs) forward_jac = jac(paddle.incubate.autograd.jvp, f, inputs) @@ -294,9 +294,9 @@ def func_jvp_i2o1(self): self.check_results(forward_jac, reverse_jac) def func_jvp_i2o2(self): - test_cases = [ # noqa - [o2, ['A', 'A']], # noqa - ] # noqa + test_cases = [ + [o2, ['A', 'A']], + ] for f, inputs in test_cases: inputs = self.gen_inputs(inputs) forward_jac = jac(paddle.incubate.autograd.jvp, f, inputs) @@ -304,9 +304,9 @@ def func_jvp_i2o2(self): self.check_results(forward_jac, reverse_jac) def func_jvp_i2o2_omitting_v(self): - test_cases = [ # noqa - [o2, ['A', 'A']], # noqa - ] # noqa + test_cases = [ + [o2, ['A', 'A']], + ] for f, inputs in test_cases: inputs = self.gen_inputs(inputs) results_omitting_v = paddle.incubate.autograd.jvp(f, inputs) From 86a31bd4b995ad8e94fedfc82ad1cab5d5521f09 Mon Sep 17 00:00:00 2001 From: 6clc Date: Mon, 9 Oct 2023 19:12:20 +0800 Subject: [PATCH 40/62] cinn(py-dsl): parse compute of python dsl (#57731) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Split out a new feature: CINN Python DSL; the main PR and unit tests are in #56393. This PR is only responsible for parsing the compute definitions in the Python DSL. 1. The decorator @to_cinn_llir wraps a CINN function kernel: CinnLowerLevelIrJit can obtain the data types, the target type, and the Python AST from the JIT runtime, and all subsequent compute parsing obtains its information from this CinnLowerLevelIrJit class. CinnLowerLevelIrJit can also obtain the above information statically, filled in through Python annotations. 2. Parsing compute semantics. The whole AST is divided into three kinds of nodes: stmts: Function, For, If, With, corresponding to the context-IR wrapper PR #57515. Assign: nodes of the form "lhs = rhs"; Assign nodes make up stmts. The exec_expr method in python/cinn/compiler/expr_executor.py parses the rhs into a CINN ir Expr, and the exec_assign method in python/cinn/compiler/expr_executor.py stores the assign semantics expressed by lhs = rhs into a local variable table. Expr: the nodes that make up the rhs of an Assign. 3. Variable management. The class VariableTable in python/cinn/compiler/utils.py manages the variables defined in the Python DSL, mainly through two behaviors: each time a new Context is entered, the current variable table is copied; each time a Context is exited, the variables added by that Context are removed, restoring the previous Context's variable table. (A hedged usage sketch of this DSL is included after the cinn_jit.py hunk below.)
--- paddle/cinn/pybind/ir/ir_api.cc | 2 + python/cinn/__init__.py | 3 +- python/cinn/compiler/__init__.py | 17 ++ python/cinn/compiler/compiler.py | 38 +++ .../cinn/compiler/compute_code_generator.py | 245 ++++++++++++++++++ python/cinn/compiler/expr_executor.py | 159 ++++++++++++ python/cinn/compiler/utils.py | 76 ++++++ python/cinn/ir/ir.py | 2 +- python/cinn/runtime/__init__.py | 4 + python/cinn/runtime/cinn_jit.py | 115 ++++++++ python/cinn/runtime/utils.py | 35 +++ 11 files changed, 694 insertions(+), 2 deletions(-) create mode 100644 python/cinn/compiler/__init__.py create mode 100644 python/cinn/compiler/compiler.py create mode 100644 python/cinn/compiler/compute_code_generator.py create mode 100644 python/cinn/compiler/expr_executor.py create mode 100644 python/cinn/compiler/utils.py create mode 100644 python/cinn/runtime/cinn_jit.py create mode 100644 python/cinn/runtime/utils.py diff --git a/paddle/cinn/pybind/ir/ir_api.cc b/paddle/cinn/pybind/ir/ir_api.cc index ffbfd3375bf75..2170f360f5062 100644 --- a/paddle/cinn/pybind/ir/ir_api.cc +++ b/paddle/cinn/pybind/ir/ir_api.cc @@ -843,6 +843,8 @@ void BindIrContext(py::module *m) { .def_static("MakeThenContext", []() { return IRContext(new ThenContextNode()); }); + m->def("link_to_parent_context", &pybind::LinkToParentContext); + py::class_<IRBuilder> ir_builder(*m, "IRBuilder"); ir_builder.def(py::init<>()) .def("EnterWithContext", &IRBuilder::EnterWithContext) diff --git a/python/cinn/__init__.py b/python/cinn/__init__.py index 9411b774e3836..55ab35e7e5624 100644 --- a/python/cinn/__init__.py +++ b/python/cinn/__init__.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from .version import full_version as __version__ +from .runtime.cinn_jit import to_cinn_llir import os cinndir = os.path.dirname(os.path.abspath(__file__)) @@ -189,4 +191,3 @@ reduce_mul, reduce_sum, ) -from .version import full_version as __version__ diff --git a/python/cinn/compiler/__init__.py b/python/cinn/compiler/__init__.py new file mode 100644 index 0000000000000..644bf2d949ca4 --- /dev/null +++ b/python/cinn/compiler/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .compiler import compile + +__all__ = ["compile"] diff --git a/python/cinn/compiler/compiler.py b/python/cinn/compiler/compiler.py new file mode 100644 index 0000000000000..330d34962641d --- /dev/null +++ b/python/cinn/compiler/compiler.py @@ -0,0 +1,38 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from ..runtime import CinnLowerLevelIrJit +from .compute_code_generator import ComputeCodeGenerator + + +def ast_to_llir(fn, inputs_signature): + function_name = fn.__name__ + # 1. Parse CINN Compute + llir_compute_generator = ComputeCodeGenerator( + fn, function_name, inputs_signature + ) + cinn_llir_func = llir_compute_generator.parse() + return cinn_llir_func + + +def compile(fn, just_convert=False, jit_inputs_signature=[], **kwargs): + if isinstance(fn, CinnLowerLevelIrJit): + llir_func = ast_to_llir(fn, jit_inputs_signature) + else: + raise Exception("Currently only supports compiling from a CinnLowerLevelIrJit") + + if just_convert: + return llir_func + return llir_func diff --git a/python/cinn/compiler/compute_code_generator.py b/python/cinn/compiler/compute_code_generator.py new file mode 100644 index 0000000000000..9a54c504306f3 --- /dev/null +++ b/python/cinn/compiler/compute_code_generator.py @@ -0,0 +1,245 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ast +import contextlib + +from cinn import ir + +from .expr_executor import ExprExecutor, exec_assign +from .utils import VariableTable, is_node_parsed_in_schedule + + +class ComputeCodeGenerator(ast.NodeVisitor): + """ + Convert python ast to CINN Lower Level IR, + containing only the semantics of the compute part + """ + + def __init__(self, fn, function_name, inputs_signature): + self.fn = fn + self.function_name = function_name + self.inputs_signature = inputs_signature + self.cinn_llir_func = None + self.variables_table = VariableTable() + self.extra_scope = {"range": ir.sequential} + + def parse(self): + ast_node = self.fn.parse() + with ir.IRBuilder() as builder, self.variables_table: + for k, v in self.fn.scope.items(): + self.variables_table.add(k, v) + for k, v in self.extra_scope.items(): + self.variables_table.add(k, v) + self.visit(ast_node) + return builder.get() + + def visit_FunctionDef(self, node) -> None: + """ + Parse CINN Low Level IR FunctionDef. + + Args: + node(ast.FunctionDef): The ast FunctionDef Node + """ + with ir.LowerFuncContext(self.function_name) as func_ctx: + arg_names = self.visit(node.args) + + assert len(node.args.defaults) == 0, "Default args are not supported" + + # 1.
Construct args of function + for i, arg_name in enumerate(arg_names): + # Obj of Argument is ir::Buffer + if hasattr(self.inputs_signature[i], "dtype"): + tensor_shape = [ + ir.Expr(dim) for dim in self.inputs_signature[i].shape + ] + llir_value = ir._Buffer_.make( + arg_name, self.inputs_signature[i].dtype + ) + ir.Arg(arg_name, llir_value) + llir_value = ir._Tensor_.make( + arg_name, + self.inputs_signature[i].dtype, + tensor_shape, + tensor_shape, + ) + self.variables_table.add(arg_name, llir_value) + # Obj of Argument is ir::Var + else: + llir_value = ir.Var(arg_name) + ir.Arg(arg_name, llir_value) + llir_value = ir.Expr(llir_value) + self.variables_table.add(arg_name, llir_value) + + # 2. Construct body of function + self.visit_compound_statement(node.body) + + def visit_compound_statement(self, stmts): + for stmt in stmts: + self.visit(stmt) + + def visit_arguments(self, node): + """ + Parse CINN Low Level IR Argument. + If it is not jit mode, it will get the information from arg.annotation. + + Args: + node(ast.arguments): The ast argument Node + + Returns: + list[string]: A list of parameter names + """ + arg_names = [arg.arg for arg in node.args] + + if len(self.inputs_signature) != len(arg_names): + self.inputs_signature = [] + for arg in node.args: + arg_annotation = arg.annotation + if isinstance(arg_annotation, ast.Call): + self.inputs_signature.append( + ExprExecutor(self.variables_table.get()).exec( + arg_annotation + ) + ) + elif isinstance(arg_annotation, int): + if ( + -(2**31) <= arg_annotation + and arg_annotation <= 2**31 - 1 + ): + self.inputs_signature.append("i32") + elif ( + 2**63 <= arg_annotation + and arg_annotation <= 2**64 - 1 + ): + self.inputs_signature.append("u64") + else: + self.inputs_signature.append("i64") + elif isinstance(arg_annotation, float): + self.inputs_signature.append("fp32") + else: + raise TypeError( + f'Unsupported type {type(arg_annotation)} for {arg_annotation}' + ) + + return arg_names + + def visit_For(self, node) -> ir.Expr: + """ + Parse CINN Low Level IR For. + + Args: + node(ast.For): The ast For node + """ + for_ctx = ExprExecutor(self.variables_table.get()).exec(node.iter) + with self.variables_table: + with for_ctx as loop_var: + local_var_table = exec_assign( + target=node.target, source=loop_var + ) + for k, v in local_var_table.items(): + loop_var.rename(k) + self.variables_table.add(k, ir.Expr(v)) + self.visit_compound_statement(node.body) + + def visit_Assign(self, node): + """ + Parse CINN Low Level IR Store.
+ + Args: + node(ast.Assign): The ast Assign node + + Returns: + ir.Expr, Points to the Expr of ir::ExprNode + """ + + if isinstance(node.value, ast.Call) and is_node_parsed_in_schedule( + node.value + ): + return "no compute" + + assert ( + len(node.targets) == 1 + ), "Unsupported: multiple assignment targets, like 'a = b = c'" + lhs = node.targets[0] + + # 1 parse RHS + rhs_expr = ExprExecutor(self.variables_table.get()).exec(node.value) + + # 2 parse LHS + # 2.1 Type of arg is Tensor + if isinstance(lhs, ast.Subscript): + expr_tensor = ExprExecutor(self.variables_table.get()).exec( + lhs.value + ) + if isinstance(lhs.slice, ast.Tuple): + expr_indices = [] + for idx in lhs.slice.elts: + expr_indices.append( + ExprExecutor(self.variables_table.get()).exec(idx) + ) + else: + expr_indices = [ + ExprExecutor(self.variables_table.get()).exec(lhs.slice) + ] + if not isinstance(rhs_expr, ir.Expr): + rhs_expr = ir.Expr(rhs_expr) + ir.TensorStore(expr_tensor.Expr(), rhs_expr, expr_indices) + # 2.2 Type of arg is Var + else: + local_var_table = exec_assign(target=lhs, source=rhs_expr) + if isinstance(lhs, ast.Tuple): + for k, v in local_var_table.items(): + v.as_var_ref().rename(k) + self.variables_table.add(k, v) + else: + for k, v in local_var_table.items(): + v[0].as_var_ref().rename(k) + self.variables_table.add(k, v[0]) + + def visit_If(self, node): + with self.variables_table: + with ir.IfContext( + ExprExecutor(self.variables_table.get()).exec(node.test) + ): + with ir.ThenContext(): + with self.variables_table: + self.visit_compound_statement(node.body) + if node.orelse: + with ir.ElseContext(): + with self.variables_table: + # visit the else branch, not the then body again + self.visit_compound_statement(node.orelse) + + def visit_With(self, node): + with self.variables_table: + with contextlib.ExitStack() as context_stack: + for item in node.items: + cur_ctx = ExprExecutor(self.variables_table.get()).exec( + item.context_expr + ) + cur_ctx = context_stack.enter_context(cur_ctx) + if item.optional_vars is not None: + local_var_table = exec_assign( + target=item.optional_vars, source=cur_ctx + ) + for k, v in local_var_table.items(): + self.variables_table.add(k, v) + self.visit_compound_statement(node.body) + + def visit_Expr(self, node): + if is_node_parsed_in_schedule(node.value): + return + res = ExprExecutor(self.variables_table.get()).exec(node.value) + if isinstance(res, ir.Expr): + ir.link_to_parent_context(res) diff --git a/python/cinn/compiler/expr_executor.py b/python/cinn/compiler/expr_executor.py new file mode 100644 index 0000000000000..cff9a9d62d7c4 --- /dev/null +++ b/python/cinn/compiler/expr_executor.py @@ -0,0 +1,159 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
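For reference, a minimal sketch of how the two helpers added below are driven by the compute generator above (illustrative only, not part of this patch; it assumes a built cinn package and that x_expr is an existing CINN Expr, e.g. a loop var produced by visit_For):

import ast
from cinn.compiler.expr_executor import ExprExecutor, exec_assign

var_table = {"x": x_expr}  # assumption: x_expr is a cinn ir Expr created elsewhere
rhs_node = ast.parse("x + 1", mode="eval").body  # an ast.BinOp node
rhs_expr = ExprExecutor(var_table).exec(rhs_node)  # lowered through AST2CINN[ast.Add].make
lhs_node = ast.parse("y = 0").body[0].targets[0]  # an ast.Name assignment target
new_vars = exec_assign(target=lhs_node, source=rhs_expr)  # {"y": <cinn ir Expr>}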
+ +import ast + +from cinn import ir + +# The native Python AST nodes that cinn ir supports +AST2CINN = { + ast.Add: ir.Add, + ast.Sub: ir.Sub, + ast.Mult: ir.Mul, + ast.Div: ir.Div, + ast.Mod: ir.Mod, + ast.And: ir.And, + ast.Or: ir.Or, + ast.USub: ir.Minus, + ast.Not: ir.Not, + ast.Eq: ir.EQ, + ast.NotEq: ir.NE, + ast.Lt: ir.LT, + ast.LtE: ir.LE, + ast.Gt: ir.GT, + ast.GtE: ir.GE, +} + + +class ExprExecutor: + def __init__(self, var_table): + self.var_table = var_table + self.tmp_value_count = 1 + + def exec(self, node): + ret = self.visit(node) + if isinstance(ret, ast.Name): + return self.var_table[ret.id] + if isinstance(ret, ast.Constant): + return ret.value + raise Exception(f"Error result type: {type(ret)}") + + def visit(self, node): + if isinstance(node, list): + return [self.visit(item) for item in node] + if isinstance(node, tuple): + return (self.visit(item) for item in node) + assert isinstance(node, ast.AST) + if isinstance(node, ast.Name): + return node + + if isinstance(node, ast.Constant): + return node + + if not isinstance(node, (ast.expr, ast.slice)): + # some nodes don't need to be parsed, such as ast.Load + return node + if isinstance(node, (ast.Lambda, ast.Starred)): + raise Exception("Currently not supported: Lambda, Starred") + + cls_fields = {} + for field in node.__class__._fields: + attr = getattr(node, field) + if isinstance(attr, (ast.AST, tuple, list)): + cls_fields[field] = self.visit(attr) + else: + cls_fields[field] = attr + + node_type_name = f'eval_{type(node).__name__}' + if hasattr(self, node_type_name): + exec_func = getattr(self, node_type_name) + value = exec_func(cls_fields) + else: + new_node = node.__class__(**cls_fields) + ast.copy_location(new_node, node) + new_node = ast.Expression(new_node) + value = self.exec_expr(new_node) + return self.save_temp_value(value) + + def exec_expr(self, node): + if isinstance(node, ast.expr): + node = ast.Expression(body=node) + node = ast.fix_missing_locations(node) + code = compile(node, filename="<string>", mode="eval") # avoid shadowing the exec builtin + return eval(code, self.var_table) + + def eval_BinOp(self, fields): + args = [self.exec_expr(fields["left"]), self.exec_expr(fields["right"])] + args = [ + ir.Expr(item) if not isinstance(item, ir.Expr) else item + for item in args + ] + return AST2CINN[type(fields["op"])].make(*args) + + def eval_UnaryOp(self, fields): + args = [self.exec_expr(fields["operand"])] + args = [ + ir.Expr(item) if not isinstance(item, ir.Expr) else item + for item in args + ] + return AST2CINN[type(fields["op"])].make(*args) + + def eval_Compare(self, fields): + assert ( + len(fields["ops"]) == 1 + ), "Only binary comparison symbols are supported. Expressions such as '1 <= a < 10' are not supported."
+ args = [ + self.exec_expr(fields["left"]), + self.exec_expr(fields["comparators"][0]), + ] + args = [ + ir.Expr(item) if not isinstance(item, ir.Expr) else item + for item in args + ] + return AST2CINN[type(fields["ops"][0])].make(*args) + + def save_temp_value(self, value): + name = f"__cinn_python_script_tmp_value_{self.tmp_value_count}" + self.tmp_value_count += 1 + self.var_table[name] = value + return ast.Name( + id=name, + ctx=ast.Load( + lineno=0, col_offset=0, end_lineno=None, end_col_offset=None + ), + lineno=0, + col_offset=0, + end_lineno=None, + end_col_offset=None, + ) + + +def exec_assign(target, source): + right_value_var_name = "__CINN_RIGHT_VALUE_VAR_NAME__" + local_var_table = {right_value_var_name: source} + mod = ast.fix_missing_locations( + ast.Module( + body=[ + ast.Assign( + targets=[target], + value=ast.Name(id=right_value_var_name, ctx=ast.Load()), + ) + ], + type_ignores=[], + ) + ) + exe = compile(mod, filename="<string>", mode="exec") + exec(exe, {}, local_var_table) + del local_var_table[right_value_var_name] + return local_var_table diff --git a/python/cinn/compiler/utils.py b/python/cinn/compiler/utils.py new file mode 100644 index 0000000000000..6f78446245fb4 --- /dev/null +++ b/python/cinn/compiler/utils.py @@ -0,0 +1,76 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import ast + +try: + from _collections import defaultdict +except ImportError: + from collections import defaultdict # fall back to the pure-Python implementation + + +from cinn.schedule import IRSchedule + + +def is_node_parsed_in_schedule(node: ast.Call): + func_name = "" + if isinstance(node.func, ast.Name): + func_name = node.func.id + elif isinstance(node.func, ast.Attribute): + func_name = node.func.attr + if func_name == "make": + return False + if func_name == "print": + return True + + return getattr(IRSchedule, func_name, None) + + +def node_is_schedule_block_context(node: ast.Call): + if isinstance(node.func, ast.Name): + return node.func.id == "ScheduleBlockContext" + if isinstance(node.func, ast.Attribute): + return node.func.attr == "ScheduleBlockContext" + return False + + +class VariableTable: + def __init__(self): + # var name added by current context + self.var_name_list = [] + # var name to var.
Dtype is {string:list} + # list records the value assigned to each layer of context + self.name2value = defaultdict(list) + + def __enter__(self): + self.var_name_list.append([]) + return self + + def __exit__(self, ptype, value, trace) -> None: + # clear var assign in current context + if ptype is None and value is None: + var_names = self.var_name_list.pop() + for var_name in var_names: + self.name2value[var_name].pop() + if len(self.name2value[var_name]) == 0: + self.name2value.pop(var_name) + + def add(self, name, value, cover=False): + if cover and name in self.var_name_list[-1]: + self.name2value[name][-1] = value + else: + self.var_name_list[-1].append(name) + self.name2value[name].append(value) + + def get(self): + return {k: v[-1] for k, v in self.name2value.items()} diff --git a/python/cinn/ir/ir.py b/python/cinn/ir/ir.py index 5c683de04e705..7d51a302a3dfb 100644 --- a/python/cinn/ir/ir.py +++ b/python/cinn/ir/ir.py @@ -17,7 +17,7 @@ from .ir_context import ForContext -# Python's rang() function calls the sequential() +# Python's range() function calls the sequential() def sequential(min, extent=None): if extent is None: extent = min diff --git a/python/cinn/runtime/__init__.py b/python/cinn/runtime/__init__.py index a9f32b12d0e22..70753e812e6b6 100644 --- a/python/cinn/runtime/__init__.py +++ b/python/cinn/runtime/__init__.py @@ -66,3 +66,7 @@ seed, set_cinn_cudnn_deterministic, ) + +from .cinn_jit import CinnLowerLevelIrJit + +__all__ = ["CinnLowerLevelIrJit"] diff --git a/python/cinn/runtime/cinn_jit.py b/python/cinn/runtime/cinn_jit.py new file mode 100644 index 0000000000000..7b85808593d62 --- /dev/null +++ b/python/cinn/runtime/cinn_jit.py @@ -0,0 +1,115 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
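For orientation before the class below: _make_launcher assembles a Python wrapper from the lazy_compile f-string. For a kernel def add(a, b), the generated source would look roughly as follows (a reconstruction of the template, with self injected via the exec scope; this exact text does not appear in the patch):

import cinn
def add(a, b, target=cinn.common.DefaultHostTarget()):
    from cinn.compiler import compile
    jit_inputs = a, b
    jit_inputs_signature = {i: self._convert_arg_type(arg)
                            for i, arg in enumerate(jit_inputs)}
    module = compile(self, jit_inputs_signature=jit_inputs_signature,
                     arg_names=['a', 'b'], target=target)
    module(a, b)  # run the compiled kernel once with the JIT inputs
    return module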
+ + +import ast +import functools +import inspect +import textwrap +from typing import Callable, Generic, Optional, TypeVar, cast + +from .utils import inspect_function_scope + +T = TypeVar('T') + + +class CinnLowerLevelIrJit(Generic[T]): + def __init__(self, fn): + self.fn = fn + # function prototype + signature = inspect.signature(fn) + self.arg_names = [v.name for v in signature.parameters.values()] + + self.src = textwrap.dedent(inspect.getsource(fn)) + self.src = self.src[self.src.find("def") :] + self.scope = inspect_function_scope(fn) + + # docs of the wrapped function + self.__doc__ = fn.__doc__ + self.__name__ = fn.__name__ + self.__globals__ = fn.__globals__ + self.__module__ = fn.__module__ + + # Encapsulates the compile and run processes + self.run = self._make_launcher() + + def _make_launcher(self): + # Gets information about runtime input parameters + jit_input_args = ', '.join(arg_name for arg_name in self.arg_names) + lazy_compile = f""" +import cinn +def {self.fn.__name__}({jit_input_args}, target=cinn.common.DefaultHostTarget()): + from cinn.compiler import compile + jit_inputs = {', '.join([f'{arg}' for arg in self.arg_names])} + jit_inputs_signature = {{ i: self._convert_arg_type(arg) \ + for i, arg in enumerate(jit_inputs)}} + module = compile(self, jit_inputs_signature=jit_inputs_signature, arg_names={ self.arg_names}, target=target) + module({jit_input_args}) + + return module + """ + scope = { + "self": self, + } + exec(lazy_compile, scope) + return scope[self.fn.__name__] + + def convert_to_llir(self): + from cinn.compiler import compile + + return compile(self, just_convert=True) + + def parse(self): + tree = ast.parse(self.src) + assert isinstance(tree, ast.Module) + return tree + + def __getitem__(self, target): + return cast( + T, functools.partial(cast(Callable, self.run), target=target) + ) + + def _convert_arg_type(self, arg): + # arg is a Tensor + if hasattr(arg, "dtype"): + return arg + # arg is a Var + else: + if isinstance(arg, int): + if -(2**31) <= arg and arg <= 2**31 - 1: + return "i32" + elif 2**63 <= arg and arg <= 2**64 - 1: + return "u64" + else: + return "i64" + elif isinstance(arg, float): + return "fp32" + else: + raise TypeError(f'Unsupported type {type(arg)} for {arg}') + + def __str__(self): + return str(self.convert_to_llir()) + + +def to_cinn_llir( + fn: Optional[T] = None, +) -> CinnLowerLevelIrJit[T]: + def decorator(fn: T) -> CinnLowerLevelIrJit[T]: + return CinnLowerLevelIrJit(fn) + + if fn is not None: + return decorator(fn) + else: + return decorator diff --git a/python/cinn/runtime/utils.py b/python/cinn/runtime/utils.py new file mode 100644 index 0000000000000..8df8cccc772d1 --- /dev/null +++ b/python/cinn/runtime/utils.py @@ -0,0 +1,35 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
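Putting the pieces together, the decorator defined above yields the kernel-authoring surface described in this commit's message. A hedged end-to-end sketch (DataArray is the tensor annotation helper from the main PR #56393, not from this patch; the shapes and kernel body are illustrative):

from cinn import to_cinn_llir
from cinn.runtime.data_array import DataArray  # assumption: provided by #56393

@to_cinn_llir
def elementwise_add(A: DataArray((64,)), B: DataArray((64,)), C: DataArray((64,))):
    for i in range(64):  # range is remapped to ir.sequential by ComputeCodeGenerator
        C[i] = A[i] + B[i]  # a Subscript assign, lowered to ir.TensorStore

print(elementwise_add.convert_to_llir())  # lowers the Python AST to CINN LLIR without running it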
+ +import inspect + + +def get_func_global_vars(func): + if inspect.ismethod(func): + func = func.__func__ + + code = func.__code__ + global_vars = {} + if func.__closure__ is not None: + for k, v in zip(code.co_freevars, func.__closure__): + global_vars[k] = v.cell_contents + return global_vars + + +def inspect_function_scope(func): + scope = { + **func.__globals__, + **get_func_global_vars(func), + } + return scope From 0d84a9180837db0861211770ea2dea02933856dd Mon Sep 17 00:00:00 2001 From: BiynXu <62832681+BiynXu@users.noreply.github.com> Date: Mon, 9 Oct 2023 19:13:40 +0800 Subject: [PATCH 41/62] [CINN] Add ReductionFactoring rule (#57569) Add ReductionFactoring rule --- .../cinn/auto_schedule/analysis/analyze_ir.cc | 35 ++++ .../cinn/auto_schedule/analysis/analyze_ir.h | 10 + .../cooperative_process_test.cc | 4 +- .../search_space/auto_gen_rule/CMakeLists.txt | 11 +- .../auto_gen_rule/multi_level_tiling_test.cc | 20 +- .../auto_gen_rule/reduction_factoring.cc | 189 ++++++++++++++++++ .../auto_gen_rule/reduction_factoring.h | 59 ++++++ .../auto_gen_rule/reduction_factoring_test.cc | 131 ++++++++++++ .../search_space/auto_gen_rule/test_helper.cc | 3 +- .../hlir/framework/new_ir/op_lowering_impl.cc | 3 +- .../hlir/framework/new_ir/op_lowering_impl.h | 3 +- paddle/cinn/hlir/framework/op_lowering.h | 6 +- .../cinn/hlir/framework/op_lowering_impl.cc | 25 ++- paddle/cinn/hlir/framework/op_lowering_impl.h | 5 +- .../hlir/framework/op_lowering_impl_base.h | 8 +- test/cpp/cinn/concrete_program_builder.h | 16 ++ 16 files changed, 499 insertions(+), 29 deletions(-) create mode 100644 paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.cc create mode 100644 paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.h create mode 100644 paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring_test.cc diff --git a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc index 64f2955f30d3d..fbfdc7af72e9a 100644 --- a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc +++ b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc @@ -190,5 +190,40 @@ ir::LoweredFunc UpdateFuncWithNewBody(const common::Target& target, return new_func; } +std::unordered_set GetReduceLoopVarNames(const ir::Expr block) { + const ir::ScheduleBlockRealize* block_realize = + block.As(); + CHECK_NOTNULL(block_realize); + const ir::ScheduleBlock* block_node = + block_realize->schedule_block.As(); + CHECK_NOTNULL(block_node); + std::vector iter_values = block_realize->iter_values; + std::vector iter_vars = block_node->iter_vars; + + std::unordered_set reduce_loop_var; + for (int i = 0; i < iter_vars.size(); ++i) { + if (iter_vars[i]->is_reduce_axis) { + ir::ir_utils::CollectIRNodesWithoutTensor( + iter_values[i], [&](const ir::Expr* x) { + if (x->as_var()) { + reduce_loop_var.insert(x->as_var_ref()->name); + } + return false; + }); + } + } + return reduce_loop_var; +} + +std::string GetBlockName(const ir::Expr block) { + const ir::ScheduleBlockRealize* block_realize = + block.As(); + CHECK_NOTNULL(block_realize); + const ir::ScheduleBlock* block_node = + block_realize->schedule_block.As(); + CHECK_NOTNULL(block_node); + return block_node->name; +} + } // namespace auto_schedule } // namespace cinn diff --git a/paddle/cinn/auto_schedule/analysis/analyze_ir.h b/paddle/cinn/auto_schedule/analysis/analyze_ir.h index 8fbdd52329f51..81d00dcb22ec3 100644 --- a/paddle/cinn/auto_schedule/analysis/analyze_ir.h +++ 
b/paddle/cinn/auto_schedule/analysis/analyze_ir.h @@ -48,5 +48,15 @@ ir::LoweredFunc UpdateFuncWithNewBody(const common::Target& target, const ir::LoweredFunc& old_func, ir::Expr& body); // NOLINT +/** + * Get loop var names of reduce axis + */ +std::unordered_set GetReduceLoopVarNames(const ir::Expr block); + +/** + * Get name of a ScheduleBlock + */ +std::string GetBlockName(const ir::Expr block); + } // namespace auto_schedule } // namespace cinn diff --git a/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process_test.cc b/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process_test.cc index a6e1db2a8b20e..0507c78ff2e1c 100644 --- a/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process_test.cc +++ b/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process_test.cc @@ -129,7 +129,7 @@ TEST_F(TestCooperativeProcess, Matmul) { { i0, i1 = axis.bind(((16 * i) + ((2 * i_0) + i_1)), ((16 * j) + ((8 * j_0) + j_1))) { - temp_matmul_out__reduce_init[((16 * i) + ((2 * i_0) + i_1)), ((16 * j) + ((8 * j_0) + j_1))] = 0.00000000f + temp_matmul_out__reduce_init[i0, i1] = 0.00000000f } } } @@ -181,7 +181,7 @@ TEST_F(TestCooperativeProcess, Matmul) { { i0_0, i1_0, i2 = axis.bind(((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1)), ((4 * reduce_k_0) + reduce_k_1)) { - temp_matmul_out[((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1))] = (temp_matmul_out[((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1))] + (X_reshape_shared_temp_buffer[((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((4 * reduce_k_0) + reduce_k_1)] * Y_reshape_shared_temp_buffer[((4 * reduce_k_0) + reduce_k_1), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1))])) + temp_matmul_out[i0_0, i1_0] = (temp_matmul_out[i0_0, i1_0] + (X_reshape_shared_temp_buffer[i0_0, i2] * Y_reshape_shared_temp_buffer[i2, i1_0])) } } } diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/CMakeLists.txt b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/CMakeLists.txt index 730575b79ad89..9965046f16635 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/CMakeLists.txt +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/CMakeLists.txt @@ -8,7 +8,8 @@ gather_srcs( auto_unroll.cc multi_level_tiling.cc skip_rule.cc - auto_bind.cc) + auto_bind.cc + reduction_factoring.cc) if(WITH_TESTING) cinn_cc_library( @@ -51,3 +52,11 @@ endif() #cinn_cc_test(test_auto_inline SRCS auto_inline_test.cc DEPS cinncore auto_gen_rule_test_helper) cinn_cc_test(test_skip_rule SRCS skip_rule_test.cc DEPS cinncore) cinn_cc_test(test_auto_unroll SRCS auto_unroll_test.cc DEPS cinncore) +cinn_cc_test( + test_reduction_factoring + SRCS + reduction_factoring_test.cc + DEPS + cinncore + auto_gen_rule_test_helper + test_program_builder) diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling_test.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling_test.cc index fa7206bdae7dd..5a5c68537e9a7 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling_test.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling_test.cc @@ -261,7 +261,7 @@ TEST_F(TestMultiLevelTiling, Matmul) { { i0, i1 = axis.bind(((8 * i_0_j_0_fused) + ((8 * i_1) + ((8 * i_2) + ((8 * i_j_fused) + i_3)))), ((32 * j_1) + 
((32 * j_2) + j_3))) { - temp_matmul_out__reduce_init[((8 * i_0_j_0_fused) + ((8 * i_1) + ((8 * i_2) + ((8 * i_j_fused) + i_3)))), ((32 * j_1) + ((32 * j_2) + j_3))] = 0.00000000f + temp_matmul_out__reduce_init[i0, i1] = 0.00000000f } } } @@ -308,10 +308,10 @@ TEST_F(TestMultiLevelTiling, Matmul) { ScheduleBlock(temp_matmul_out_local_temp_buffer) { i0_0, i1_0, i2 = axis.bind(((8 * i_0_j_0_fused) + ((8 * i_1) + ((8 * i_2) + ((8 * i_j_fused) + i_3)))), ((32 * j_1) + ((32 * j_2) + j_3)), ((8 * reduce_k_0) + ((8 * reduce_k_1) + reduce_k_2))) - read_buffers(_temp_matmul_out[i(undefined:undefined), j(undefined:undefined)], _X[i(undefined:undefined), reduce_k(0:32)], _Y[reduce_k(0:32), j(undefined:undefined)]) - write_buffers(_temp_matmul_out[i(undefined:undefined), j(undefined:undefined)]) + read_buffers(_temp_matmul_out[i0_0(0:32), i1_0(0:32)], _X[i0_0(0:32), i2(0:32)], _Y[i2(0:32), i1_0(0:32)]) + write_buffers(_temp_matmul_out[i0_0(0:32), i1_0(0:32)]) { - temp_matmul_out_local_temp_buffer[((8 * i_0_j_0_fused) + ((8 * i_1) + ((8 * i_2) + ((8 * i_j_fused) + i_3)))), ((32 * j_1) + ((32 * j_2) + j_3))] = (temp_matmul_out_local_temp_buffer[((8 * i_0_j_0_fused) + ((8 * i_1) + ((8 * i_2) + ((8 * i_j_fused) + i_3)))), ((32 * j_1) + ((32 * j_2) + j_3))] + (X_reshape_shared_temp_buffer[((8 * i_0_j_0_fused) + ((8 * i_1) + ((8 * i_2) + ((8 * i_j_fused) + i_3)))), ((8 * reduce_k_0) + ((8 * reduce_k_1) + reduce_k_2))] * Y_reshape_shared_temp_buffer[((8 * reduce_k_0) + ((8 * reduce_k_1) + reduce_k_2)), ((32 * j_1) + ((32 * j_2) + j_3))])) + temp_matmul_out_local_temp_buffer[i0_0, i1_0] = (temp_matmul_out_local_temp_buffer[i0_0, i1_0] + (X_reshape_shared_temp_buffer[i0_0, i2] * Y_reshape_shared_temp_buffer[i2, i1_0])) } } } @@ -453,7 +453,7 @@ TEST_F(TestMultiLevelTiling, Pool2d) { { i0, i1, i2, i3 = axis.bind(i, j, k, a) { - pad_temp_0[i, j, k, a] = select(((a < 17) and ((a >= 1) and ((k < 17) and (k >= 1)))), input[i, j, (-1 + k), (-1 + a)], -3.40282347e+38f) + pad_temp_0[i0, i1, i2, i3] = select(((i3 < (1 + 16)) and ((i3 >= 1) and ((i2 < (1 + 16)) and (i2 >= 1)))), input[i0, i1, (i2 - 1), (i3 - 1)], -3.40282347e+38f) } } } @@ -477,7 +477,7 @@ TEST_F(TestMultiLevelTiling, Pool2d) { { i0_0, i1_0, i2_0, i3_0 = axis.bind(((((i_j_k_a_fused / 2) / 2) / 2) + ((i_0_j_0_k_0_a_0_fused / 4) + i_1)), ((4 * (((i_j_k_a_fused / 2) / 2) % 2)) + j_1), ((i_0_j_0_k_0_a_0_fused % 4) + ((4 * ((i_j_k_a_fused / 2) % 2)) + k_1)), ((4 * (i_j_k_a_fused % 2)) + a_1)) { - var_0__reduce_init[((((i_j_k_a_fused / 2) / 2) / 2) + ((i_0_j_0_k_0_a_0_fused / 4) + i_1)), ((4 * (((i_j_k_a_fused / 2) / 2) % 2)) + j_1), ((4 * ((i_j_k_a_fused / 2) % 2)) + ((i_0_j_0_k_0_a_0_fused % 4) + k_1)), ((4 * (i_j_k_a_fused % 2)) + a_1)] = -3.40282347e+38f + var_0__reduce_init[i0_0, i1_0, i2_0, i3_0] = -3.40282347e+38f } } } @@ -511,10 +511,10 @@ TEST_F(TestMultiLevelTiling, Pool2d) { ScheduleBlock(var_0_local_temp_buffer) { i0_1, i1_1, i2_1, i3_1, i4, i5 = axis.bind(((((i_j_k_a_fused / 2) / 2) / 2) + ((i_0_j_0_k_0_a_0_fused / 4) + i_1)), ((4 * (((i_j_k_a_fused / 2) / 2) % 2)) + j_1), ((i_0_j_0_k_0_a_0_fused % 4) + ((4 * ((i_j_k_a_fused / 2) % 2)) + k_1)), ((4 * (i_j_k_a_fused % 2)) + a_1), kernel_idx, kernel_idx_0) - read_buffers(_var_0[i(undefined:undefined), j(undefined:undefined), k(undefined:undefined), a(undefined:undefined)], _pad_temp_0[i(undefined:undefined), j(undefined:undefined)]) - write_buffers(_var_0[i(undefined:undefined), j(undefined:undefined), k(undefined:undefined), a(undefined:undefined)]) + read_buffers(_var_0[i0_1(0:2), i1_1(0:8), 
i2_1(0:8), i3_1(0:8)], _pad_temp_0[i0_1(0:2), i1_1(0:8)]) + write_buffers(_var_0[i0_1(0:2), i1_1(0:8), i2_1(0:8), i3_1(0:8)]) { - var_0_local_temp_buffer[((((i_j_k_a_fused / 2) / 2) / 2) + ((i_0_j_0_k_0_a_0_fused / 4) + i_1)), ((4 * (((i_j_k_a_fused / 2) / 2) % 2)) + j_1), ((4 * ((i_j_k_a_fused / 2) % 2)) + ((i_0_j_0_k_0_a_0_fused % 4) + k_1)), ((4 * (i_j_k_a_fused % 2)) + a_1)] = cinn_max(var_0_local_temp_buffer[((((i_j_k_a_fused / 2) / 2) / 2) + ((i_0_j_0_k_0_a_0_fused / 4) + i_1)), ((4 * (((i_j_k_a_fused / 2) / 2) % 2)) + j_1), ((i_0_j_0_k_0_a_0_fused % 4) + ((4 * ((i_j_k_a_fused / 2) % 2)) + k_1)), ((4 * (i_j_k_a_fused % 2)) + a_1)], pad_temp_0_shared_temp_buffer[((((i_j_k_a_fused / 2) / 2) / 2) + ((i_0_j_0_k_0_a_0_fused / 4) + i_1)), ((4 * (((i_j_k_a_fused / 2) / 2) % 2)) + j_1), ((8 * ((i_j_k_a_fused / 2) % 2)) + ((2 * (i_0_j_0_k_0_a_0_fused % 4)) + ((2 * k_1) + kernel_idx))), ((8 * (i_j_k_a_fused % 2)) + ((2 * a_1) + kernel_idx_0))]) + var_0_local_temp_buffer[i0_1, i1_1, i2_1, i3_1] = cinn_max(var_0_local_temp_buffer[i0_1, i1_1, i2_1, i3_1], pad_temp_0_shared_temp_buffer[i0_1, i1_1, ((2 * i2_1) + i4), ((2 * i3_1) + i5)]) } } } @@ -533,7 +533,7 @@ TEST_F(TestMultiLevelTiling, Pool2d) { { ScheduleBlock(var_0) { - v0, v1, v2, v3 = axis.bind((((((i_j_k_a_fused / 2) / 2) / 2) + (i_0_j_0_k_0_a_0_fused / 4)) + ax0_0), ((4 * (((i_j_k_a_fused / 2) / 2) % 2)) + ax1_0), (((4 * ((i_j_k_a_fused / 2) % 2)) + (i_0_j_0_k_0_a_0_fused % 4)) + ax2_0), ((4 * (i_j_k_a_fused % 2)) + ax3_0)) + v0, v1, v2, v3 = axis.bind((((((i_j_k_a_fused / 2) / 2) / 2) + (i_0_j_0_k_0_a_0_fused / 4)) + ax0_0), ((4 * (((i_j_k_a_fused / 2) / 2) % 2)) + ax1_0), (((i_0_j_0_k_0_a_0_fused % 4) + (4 * ((i_j_k_a_fused / 2) % 2))) + ax2_0), ((4 * (i_j_k_a_fused % 2)) + ax3_0)) attrs(reverse_compute_at_extra_var:ax0_0,ax1_0,ax2_0,ax3_0) { var_0[v0, v1, v2, v3] = var_0_local_temp_buffer[v0, v1, v2, v3] diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.cc new file mode 100644 index 0000000000000..8c679f3f8b880 --- /dev/null +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.cc @@ -0,0 +1,189 @@ +// Copyright (c) 2023 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
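Before the implementation, it helps to pin down what the rule computes: ReductionFactoring reorders the loops so spatial axes precede reduction axes, then splits the reduction into a partial-sum stage and a final-sum stage (via Rfactor below). In array terms, for the [32, 64, 128] reduce-sum exercised by the unit test later in this patch, the transformation is semantically equivalent to this NumPy sketch (an interpretation of the schedule, not CINN code):

    import numpy as np

    X = np.random.rand(32, 64, 128).astype("float32")

    # Stage 1 (rf_var_0 in the test's expected IR): partial sums over the
    # innermost reduce axis, keeping the factored axis alive.
    rf_var_0 = X.sum(axis=2)
    # Stage 2 (var_0): the final reduction over the factored axis.
    var_0 = rf_var_0.sum(axis=1)

    assert np.allclose(var_0, X.sum(axis=(1, 2)), rtol=1e-4)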
+
+#include "paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.h"
+
+#include
+
+#include "paddle/cinn/auto_schedule/analysis/analyze_ir.h"
+#include "paddle/cinn/ir/ir_printer.h"
+#include "paddle/cinn/ir/schedule/ir_schedule.h"
+#include "paddle/cinn/ir/schedule/ir_schedule_util.h"
+#include "paddle/cinn/ir/tensor.h"
+#include "paddle/cinn/ir/utils/ir_copy.h"
+#include "paddle/cinn/ir/utils/ir_nodes_collector.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+bool ReductionFactoring::CanApply(const std::string& block_name,
+                                  ir::IRSchedule* ir_schedule) const {
+  ir::Expr block_expr = ir_schedule->GetBlock(block_name);
+  ir::ScheduleBlockRealize* block_realize =
+      block_expr.As<ir::ScheduleBlockRealize>();
+  CHECK_NOTNULL(block_realize);
+  ir::ScheduleBlock* sch_block =
+      block_realize->schedule_block.As<ir::ScheduleBlock>();
+  CHECK_NOTNULL(sch_block);
+  AnalyzeScheduleBlockReadWriteBuffer(sch_block);
+
+  // 1. The block must have write buffer
+  if (sch_block->write_buffers.empty()) {
+    return false;
+  }
+
+  // 2. The block must have at least one reduce axis
+  const std::vector<ir::Var>& iter_vars = sch_block->iter_vars;
+  bool find_reduce_axis = false;
+  for (int i = 0; i < iter_vars.size(); ++i) {
+    if (iter_vars[i]->is_reduce_axis) {
+      find_reduce_axis = true;
+      break;
+    }
+  }
+  if (!find_reduce_axis) {
+    return false;
+  }
+
+  // 3. Each loop's body only contains one sub loop or block, except
+  // reduce_init block
+  std::vector<ir::Expr> loops = ir_schedule->GetLoops(block_name);
+  for (const ir::Expr& loop : loops) {
+    const ir::Expr& body = loop.As<ir::For>()->body;
+    if (body.As<ir::Block>()) {
+      if (body.As<ir::Block>()->stmts.size() == 1) {
+        if (body.As<ir::Block>()->stmts[0].As<ir::For>() == nullptr &&
+            body.As<ir::Block>()->stmts[0].As<ir::ScheduleBlockRealize>() ==
+                nullptr) {
+          return false;
+        }
+      } else if (body.As<ir::Block>()->stmts.size() == 2) {
+        if (body.As<ir::Block>()->stmts[0].As<ir::ScheduleBlockRealize>() ==
+                nullptr ||
+            !ir::IsReduceInitTensorName(
+                GetBlockName(body.As<ir::Block>()->stmts[0]))) {
+          return false;
+        }
+        if (body.As<ir::Block>()->stmts[1].As<ir::For>() == nullptr &&
+            body.As<ir::Block>()->stmts[1].As<ir::ScheduleBlockRealize>() ==
+                nullptr) {
+          return false;
+        }
+      } else {
+        return false;
+      }
+    } else if (body.As<ir::For>() || body.As<ir::ScheduleBlockRealize>()) {
+      continue;
+    } else {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+RuleApplyType ReductionFactoring::AnalyseApplyType(
+    SearchState state, const std::string& block_name) const {
+  return this->CanApply(block_name, &(state->ir_schedule))
+             ? RuleApplyType::kApply
+             : RuleApplyType::kCannotApply;
+}
+
+std::vector<SearchState> ReductionFactoring::ApplyOnBlock(
+    SearchState state, const std::string& block_name) {
+  SearchState new_state = state.Copy();
+  Apply(block_name, &(new_state->ir_schedule));
+  return {new_state};
+}
+
+void ReductionFactoring::Apply(const std::string& block_name,
+                               ir::IRSchedule* ir_schedule) {
+  ir::Expr block = ir_schedule->GetBlock(block_name);
+  std::vector<ir::Expr> all_loops = ir_schedule->GetLoops(block_name);
+
+  std::vector<ir::Expr> new_loop_order;
+  size_t num_spatial_loops = 0;
+  size_t num_reduction_loops = 0;
+  // 1. Add all spatial loops
+  std::unordered_set<std::string> reduce_loop_var_names =
+      GetReduceLoopVarNames(block);
+  for (const ir::Expr& expr : all_loops) {
+    if (reduce_loop_var_names.count(expr.As<ir::For>()->loop_var->name) == 0) {
+      new_loop_order.push_back(expr);
+      ++num_spatial_loops;
+    }
+  }
+  // 2. Add all reduction loops
+  for (const ir::Expr& expr : all_loops) {
+    if (reduce_loop_var_names.count(expr.As<ir::For>()->loop_var->name) > 0) {
+      new_loop_order.push_back(expr);
+      ++num_reduction_loops;
+    }
+  }
+  if (num_reduction_loops == 0) {
+    return;
+  }
+  // 3. 
Reorder if new_loop_order differs from the original order + CHECK_EQ(all_loops.size(), new_loop_order.size()); + for (int i = 0; i < all_loops.size(); ++i) { + if (all_loops[i].As()->loop_var->name != + new_loop_order[i].As()->loop_var->name) { + ir_schedule->Reorder(new_loop_order); + break; + } + } + + // TODO(BiynXu): After implementing the factorize_reduction schedule + // primitive, restore the following annotations. The factorize_reduction + // schedule primitive needs to support complex subscripts to support pre + // schedule transformations. + + // // 4. Fuse all reduction loops + // ir::Expr fused_reduce_loop; + // if (num_reduction_loops > 1) { + // std::vector reduction_loop_indices; + // for (int i = num_spatial_loops - 1; i < all_loops.size(); ++i) { + // reduction_loop_indices.push_back(i); + // } + // CHECK_EQ(reduction_loop_indices.size(), num_reduction_loops); + // fused_reduce_loop = ir_schedule->Fuse(block_name, + // reduction_loop_indices); + // } else { + // all_loops = ir_schedule->GetLoops(block_name); + // fused_reduce_loop = all_loops.back(); + // } + // // 5. Split the reduction loop into 2 part + // int factor = 1; + // int extent = ir::GetLoopExtent(fused_reduce_loop); + // for (int i = ceil(sqrt(extent)); i >= 1; --i) { + // if (extent % i == 0) { + // factor = i; + // break; + // } + // } + // std::vector splited_reduction_loops = + // ir_schedule->Split(fused_reduce_loop, {-1, factor}); + // // Apply FactorizeReduction + // LOG(INFO) << "before FactorizeReduction: " << + // ir_schedule->GetModule().GetExprs()[0]; + // ir_schedule->FactorizeReduction(splited_reduction_loops[0], + // num_spatial_loops); + + // Apply rfactor + all_loops = ir_schedule->GetLoops(block_name); + ir_schedule->Rfactor(all_loops[num_spatial_loops], num_spatial_loops); +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.h b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.h new file mode 100644 index 0000000000000..889e3e94292d2 --- /dev/null +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.h @@ -0,0 +1,59 @@ +// Copyright (c) 2023 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h" +#include "paddle/cinn/ir/ir.h" +#include "paddle/cinn/ir/schedule/ir_schedule.h" + +namespace cinn { +namespace auto_schedule { + +class ReductionFactoring : public AutoGenRule { + public: + explicit ReductionFactoring(const common::Target& target) + : AutoGenRule(target) {} + ~ReductionFactoring() = default; + + // In the future, we will no longer use this interface. + RuleApplyType Init(ir::IRSchedule* init_schedule) override { + return RuleApplyType::kCannotApply; + } + // In the future, we will no longer use this interface. 
+ void Apply(int index) override { + LOG(FATAL) << "This is a deprecated interface, please do not use it."; + return; + } + + RuleApplyType AnalyseApplyType(SearchState state, + const std::string& block_name) const override; + + std::string GetRuleName() const override { return "ReductionFactoring"; } + + std::vector ApplyOnBlock(SearchState state, + const std::string& block_name) override; + + void Apply(const std::string& block_name, ir::IRSchedule* ir_schedule); + + private: + bool CanApply(const std::string& block_name, + ir::IRSchedule* ir_schedule) const; +}; + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring_test.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring_test.cc new file mode 100644 index 0000000000000..916f7b79ae351 --- /dev/null +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring_test.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2023 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.h" + +#include +#include + +#include +#include +#include + +#include "paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.h" +#include "paddle/cinn/ir/ir_printer.h" +#include "test/cpp/cinn/concrete_program_builder.h" + +namespace cinn { +namespace auto_schedule { + +class TestReductionFactoring : public TestAutoGenRuleBase { + public: + std::vector default_input_names = {"X"}; + std::vector default_output_names = {"out"}; + + void TestApplyOnReduce(const std::vector& shape, + const std::vector& reduce_dim, + const std::string& block_name, + const std::string& expected_ir) { + Initialize(common::DefaultHostTarget()); + auto test_program = tests::ReduceBuilder().Build( + {{"X", shape}}, {{"reduce_dim", reduce_dim}}); + // construct input parameter + ir::IRSchedule ir_schedule = MakeIRSchedule(test_program); + SearchState state(ir_schedule, 0, {}); + std::vector func_bodys = ir_schedule.GetModule().GetExprs(); + ASSERT_EQ(func_bodys.size(), 1UL); + VLOG(6) << "Original Expr:\n" << func_bodys[0]; + + // apply + ReductionFactoring reduction_factoring(target_); + ASSERT_EQ(reduction_factoring.AnalyseApplyType(state, block_name), + RuleApplyType::kApply); + auto result = reduction_factoring.ApplyOnBlock(state, block_name)[0]; + std::vector exprs = result->ir_schedule.GetModule().GetExprs(); + EXPECT_EQ(exprs.size(), 1UL); + std::stringstream ir; + ir << exprs[0]; + VLOG(6) << "ReductionFactoring applied Expr: " << exprs[0]; + + // check + const std::vector& blocks = ir_schedule.GetAllBlocks(); + CHECK_EQ(blocks.size(), 2UL); + CHECK_EQ(ir.str(), expected_ir); + } +}; + +TEST_F(TestReductionFactoring, AnalyseApplyType) { + Initialize(common::DefaultHostTarget()); + auto test_program = + tests::OpBuilder("elementwise_add").Build({{"X", {4, 5}}, {"Y", {4, 5}}}); + ir::IRSchedule ir_schedule = MakeIRSchedule(test_program); + VLOG(6) 
<< "Original Expr:\n" << ir_schedule.GetModule().GetExprs()[0]; + SearchState state(ir_schedule, 0, {}); + ReductionFactoring reduction_factoring(target_); + EXPECT_EQ(reduction_factoring.AnalyseApplyType(state, "var_1"), + RuleApplyType::kCannotApply); +} + +TEST_F(TestReductionFactoring, ApplyOnBlock) { + std::string expected_ir = R"({ + ScheduleBlock(root) + { + { + serial for (i, 0, 32) + { + serial for (rf_reduce_k_0, 0, 64) + { + ScheduleBlock(rf_var_0__reduce_init) + { + i0, i1_0 = axis.bind(i, rf_reduce_k_0) + rf_var_0__reduce_init[i0, i1_0] = 0.00000000f + } + serial for (reduce_k_1, 0, 128) + { + ScheduleBlock(rf_var_0) + { + i0_0, i1, i2 = axis.bind(i, rf_reduce_k_0, reduce_k_1) + read_buffers(_var_0[i0_0(0:32)], _X[i0_0(0:32), i1(0:64), i2(0:128)]) + write_buffers(_var_0[i0_0(0:32)]) + rf_var_0[i0_0, i1] = (rf_var_0[i0_0, i1] + X[i0_0, i1, i2]) + } + } + } + } + serial for (i, 0, 32) + { + ScheduleBlock(var_0__reduce_init) + { + i0 = axis.bind(i) + var_0__reduce_init[i0] = 0.00000000f + } + serial for (reduce_k_0, 0, 64) + { + ScheduleBlock(var_0) + { + i0_0, i1 = axis.bind(i, reduce_k_0) + read_buffers(_var_0[i0_0(0:32)], _X[i0_0(0:32), i1(0:64), i2(0:128)]) + write_buffers(_var_0[i0_0(0:32)]) + var_0[i0_0] = (var_0[i0_0] + rf_var_0[i0_0, i1]) + } + } + } + } + } +})"; + TestApplyOnReduce({32, 64, 128}, {1, 2}, "var_0", expected_ir); +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc index ef7f2a4ab6dc5..11fabfe16df2f 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc @@ -67,7 +67,8 @@ ir::IRSchedule TestAutoGenRuleBase::MakeIRSchedule( lowered_funcs_ = op_lowerer.Lower(graph->fusion_groups.front(), /*apply_op_schedule = */ apply_manual_schedule, - /*apply_group_schedule = */ apply_manual_schedule); + /*apply_group_schedule = */ apply_manual_schedule, + /*apply_pass = */ apply_manual_schedule); CHECK(!lowered_funcs_.empty()) << "lowered_funcs_ is empty"; std::vector bodys; diff --git a/paddle/cinn/hlir/framework/new_ir/op_lowering_impl.cc b/paddle/cinn/hlir/framework/new_ir/op_lowering_impl.cc index 56282996b9e26..ea76d939bc45b 100644 --- a/paddle/cinn/hlir/framework/new_ir/op_lowering_impl.cc +++ b/paddle/cinn/hlir/framework/new_ir/op_lowering_impl.cc @@ -102,7 +102,8 @@ OpLowererImpl::OpLowererImpl(const Target& target) : target_(target) {} std::vector OpLowererImpl::Lower(const GroupPtr& group, bool apply_op_schedule, - bool apply_group_schedule) { + bool apply_group_schedule, + bool apply_pass) { VLOG(3) << "Lowering Group : " << group->group_id << " , Op Pattern : " << group->op_pattern_kind; group->input_names.clear(); diff --git a/paddle/cinn/hlir/framework/new_ir/op_lowering_impl.h b/paddle/cinn/hlir/framework/new_ir/op_lowering_impl.h index 705c1f6f8c12d..3fa859bbce880 100644 --- a/paddle/cinn/hlir/framework/new_ir/op_lowering_impl.h +++ b/paddle/cinn/hlir/framework/new_ir/op_lowering_impl.h @@ -58,7 +58,8 @@ class OpLowererImpl : public OpLowererImplBase { */ std::vector Lower(const GroupPtr& group, bool apply_op_schedule = true, - bool apply_group_schedule = true); + bool apply_group_schedule = true, + bool apply_pass = true); private: /** diff --git a/paddle/cinn/hlir/framework/op_lowering.h b/paddle/cinn/hlir/framework/op_lowering.h index b0e0ad7d97b11..ac52aea80de71 100644 --- 
a/paddle/cinn/hlir/framework/op_lowering.h +++ b/paddle/cinn/hlir/framework/op_lowering.h @@ -40,8 +40,10 @@ class OpLowerer { std::vector Lower(const T& group, bool apply_op_schedule = true, - bool apply_group_schedule = true) { - return impl_->Lower(group, apply_op_schedule, apply_group_schedule); + bool apply_group_schedule = true, + bool apply_pass = true) { + return impl_->Lower( + group, apply_op_schedule, apply_group_schedule, apply_pass); } private: diff --git a/paddle/cinn/hlir/framework/op_lowering_impl.cc b/paddle/cinn/hlir/framework/op_lowering_impl.cc index ad5a903bedadc..b380ee8aaba2e 100644 --- a/paddle/cinn/hlir/framework/op_lowering_impl.cc +++ b/paddle/cinn/hlir/framework/op_lowering_impl.cc @@ -49,7 +49,8 @@ OpLowererImpl::OpLowererImpl( std::vector OpLowererImpl::Lower(const GroupPtr& group, bool apply_op_schedule, - bool apply_group_schedule) { + bool apply_group_schedule, + bool apply_pass) { VLOG(3) << "Lowering Group : " << group->group_id << " , Op Pattern : " << group->op_pattern_kind; group->input_names.clear(); @@ -61,11 +62,13 @@ std::vector OpLowererImpl::Lower(const GroupPtr& group, return LowerGroup(group, apply_op_schedule, apply_group_schedule, + apply_pass, &OpLowererImpl::ElementwiseScheduleDetermineFunction); case framework::kReduction: return LowerGroup(group, apply_op_schedule, apply_group_schedule, + apply_pass, &OpLowererImpl::ReduceScheduleDetermineFunction); case framework::kOutFusible: LOG(FATAL) << "Group Pattern Kind kOutFusible Is Not Implemented!"; @@ -73,6 +76,7 @@ std::vector OpLowererImpl::Lower(const GroupPtr& group, return LowerGroup(group, apply_op_schedule, apply_group_schedule, + apply_pass, &OpLowererImpl::NonFusibleScheduleDetermineFunction); default: LOG(FATAL) << "Group Pattern Kind Is Unknown!"; @@ -96,6 +100,7 @@ std::vector OpLowererImpl::LowerGroup( const GroupPtr& group, bool apply_op_schedule, bool apply_group_schedule, + bool apply_pass, ScheduleDetermineFunction schedule_determine_func) { // 1.Do compute, lower and schedule for each op. VLOG(3) << "group->fused_sub_groups.size() is : " @@ -127,8 +132,12 @@ std::vector OpLowererImpl::LowerGroup( // 3.Do post-processing, // including preparing function args and temporary variables, // applying low-level optimization passes, etc. 
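The new `apply_pass` flag threads all the way from `OpLowerer::Lower` down to `PostProcess`, so callers such as `MakeIRSchedule` in the test helper can request scheduled-but-unoptimized IR for the auto-gen rules to inspect. The gating logic reduces to the following Python sketch (`fold_add` is a hypothetical stand-in for the real `optim` passes, not a Paddle API):

    # Hypothetical pass used only for illustration.
    def fold_add(ir_text):
        return ir_text.replace("1 + 1", "2")

    def post_process(ir_text, apply_pass=True):
        # Low-level optimization runs only when the caller asks for it.
        return fold_add(ir_text) if apply_pass else ir_text

    assert post_process("x = 1 + 1") == "x = 2"
    assert post_process("x = 1 + 1", apply_pass=False) == "x = 1 + 1"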
- return PostProcess( - group, tensor_map, do_op_schedule, &ir_sch, &group_func_arg_tensors); + return PostProcess(group, + tensor_map, + do_op_schedule, + apply_pass, + &ir_sch, + &group_func_arg_tensors); } std::vector OpLowererImpl::LowerCustomCall( @@ -222,6 +231,7 @@ std::vector OpLowererImpl::PostProcess( const GroupPtr& group, const std::unordered_map& tensor_map, bool done_op_schedule, + bool apply_pass, ir::IRSchedule* ir_sch, std::vector* group_func_arg_tensors) { // 1.Prepare function args @@ -278,9 +288,10 @@ std::vector OpLowererImpl::PostProcess( auto func_body = ir_sch->GetModule().GetExprs().at(0); #ifdef CINN_WITH_CUDA - optim::OptimizeExprGPU(&(func_body)); + if (apply_pass) { + optim::OptimizeExprGPU(&(func_body)); + } #endif - // 2.Prepare temp buffers poly::StageMap stages; auto temp_buffers = @@ -294,7 +305,9 @@ std::vector OpLowererImpl::PostProcess( func->PrepareBufferCastExprs(); } // 4.Apply low level pass - func = optim::Optimize(Expr(func), target_, false).as_lowered_func_ref(); + if (apply_pass) { + func = optim::Optimize(Expr(func), target_, false).as_lowered_func_ref(); + } return {func}; } diff --git a/paddle/cinn/hlir/framework/op_lowering_impl.h b/paddle/cinn/hlir/framework/op_lowering_impl.h index a4c79a3268004..99be348d5be32 100644 --- a/paddle/cinn/hlir/framework/op_lowering_impl.h +++ b/paddle/cinn/hlir/framework/op_lowering_impl.h @@ -56,7 +56,8 @@ class OpLowererImpl : public OpLowererImplBase { */ std::vector Lower(const GroupPtr& group, bool apply_op_schedule = true, - bool apply_group_schedule = true); + bool apply_group_schedule = true, + bool apply_pass = true); private: /** @@ -72,6 +73,7 @@ class OpLowererImpl : public OpLowererImplBase { const GroupPtr& group, bool apply_op_schedule, bool apply_group_schedule, + bool apply_pass, ScheduleDetermineFunction schedule_determine_func); /** @@ -96,6 +98,7 @@ class OpLowererImpl : public OpLowererImplBase { const GroupPtr& group, const std::unordered_map& tensor_map, bool done_op_schedule, + bool apply_pass, ir::IRSchedule* ir_sch, std::vector* group_func_arg_tensors); diff --git a/paddle/cinn/hlir/framework/op_lowering_impl_base.h b/paddle/cinn/hlir/framework/op_lowering_impl_base.h index 9f2c0e7a35dad..6479419852a2b 100644 --- a/paddle/cinn/hlir/framework/op_lowering_impl_base.h +++ b/paddle/cinn/hlir/framework/op_lowering_impl_base.h @@ -32,10 +32,10 @@ class OpLowererImplBase { OpLowererImplBase() = default; ~OpLowererImplBase() = default; - virtual std::vector Lower( - const T& group, - bool apply_op_schedule = true, - bool apply_group_schedule = true) = 0; + virtual std::vector Lower(const T& group, + bool apply_op_schedule = true, + bool apply_group_schedule = true, + bool apply_pass = true) = 0; }; } // namespace framework diff --git a/test/cpp/cinn/concrete_program_builder.h b/test/cpp/cinn/concrete_program_builder.h index 8da4bdab927c9..920f725e2d78a 100644 --- a/test/cpp/cinn/concrete_program_builder.h +++ b/test/cpp/cinn/concrete_program_builder.h @@ -112,5 +112,21 @@ class FillConstantAddBuilder : public ProgramBuilder { } }; +class ReduceBuilder : public ProgramBuilder { + public: + ReduceBuilder() : ProgramBuilder("reduce_builder") {} + frontend::Program Build(const std::vector& inputs_varinfo, + const utils::AttributeMap& attrs) { + CHECK_EQ(inputs_varinfo.size(), 1); + CHECK_EQ(attrs.count("reduce_dim"), 1); + std::vector reduce_dim = + absl::get>(attrs.at("reduce_dim")); + auto X = builder_.CreateInput( + inputs_varinfo[0].type, inputs_varinfo[0].shape, inputs_varinfo[0].id); + auto 
Y = builder_.ReduceSum(X, reduce_dim); + return builder_.Build(); + } +}; + } // namespace tests } // namespace cinn From ce837d26664dc8c29aa10fb9f54209ed466ef587 Mon Sep 17 00:00:00 2001 From: Charles-hit <56987902+Charles-hit@users.noreply.github.com> Date: Mon, 9 Oct 2023 19:39:22 +0800 Subject: [PATCH 42/62] [PRIM][PIR]Fix dropout rules (#57910) * test framework supports to_static and prim * fix dropout composite rules --------- Co-authored-by: cyber-pioneer --- python/paddle/decomposition/rules.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/paddle/decomposition/rules.py b/python/paddle/decomposition/rules.py index 4dd291475308e..924ccf1756b0e 100644 --- a/python/paddle/decomposition/rules.py +++ b/python/paddle/decomposition/rules.py @@ -155,7 +155,7 @@ def dropout(x, seed_tensor, p, is_test, mode, seed, fix_seed): train: out = input * mask inference: out = input * (1.0 - p) """ - from paddle import scale as pd_scale + from paddle import assign from paddle.base import core from paddle.base.data_feeder import convert_dtype @@ -179,9 +179,7 @@ def dropout(x, seed_tensor, p, is_test, mode, seed, fix_seed): shape=x.shape, value=(1.0 - p), dtype=x.dtype ), cast(mask, uint8_type) else: - return pd_scale(x, 1.0), cast( - mask, uint8_type - ) # assign(x), cast(mask, mask, core.VarDesc.VarType.UINT8) + return assign(x), cast(mask, uint8_type) else: if not is_test: return x * mask, cast(mask, uint8_type) From b06352674f98be38a13eeed14e70240e827f2395 Mon Sep 17 00:00:00 2001 From: wentao yu Date: Mon, 9 Oct 2023 20:33:26 +0800 Subject: [PATCH 43/62] fix new comm flags (#57952) --- paddle/fluid/operators/collective/c_comm_init_op.cc | 6 ++---- paddle/fluid/operators/collective/c_gen_nccl_id_op.cc | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/collective/c_comm_init_op.cc b/paddle/fluid/operators/collective/c_comm_init_op.cc index 9f34211a6169b..20d2d15187dfa 100644 --- a/paddle/fluid/operators/collective/c_comm_init_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_op.cc @@ -31,6 +31,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/phi/core/distributed/nccl_comm_context.h" +PHI_DECLARE_bool(dynamic_static_unified_comm); #endif #include "paddle/phi/core/distributed/auto_parallel/reshard_utils.h" @@ -114,10 +115,7 @@ class CCommInitOp : public framework::OperatorBase { int rank_id = Attr("rank"); #endif #if defined(PADDLE_WITH_NCCL) - const char* dynamic_static_unified_comm = - getenv("FLAGS_dynamic_static_unified_comm"); - if (dynamic_static_unified_comm && - std::string(dynamic_static_unified_comm) == "1") { + if (FLAGS_dynamic_static_unified_comm) { VLOG(3) << "#### use new comm lab ####"; auto store = phi::distributed::CreateOrGetGlobalTCPStore(); phi::distributed::CommContextManager::SetDeviceId(device_id); diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc index 3f6d6348ba925..4a07f7e98f793 100644 --- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc @@ -23,6 +23,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/gen_comm_id_helper.h" #include "paddle/fluid/platform/place.h" +PHI_DECLARE_bool(dynamic_static_unified_comm); namespace paddle { namespace operators { @@ -70,10 +71,7 @@ class CGenNCCLIdOp : public framework::OperatorBase { std::vector nccl_ids; nccl_ids.resize(1); - const char* dynamic_static_unified_comm = - getenv("FLAGS_dynamic_static_unified_comm"); - if (!dynamic_static_unified_comm || - std::string(dynamic_static_unified_comm) != "1") { + if (!FLAGS_dynamic_static_unified_comm) { int server_fd = platform::SocketServer::GetInstance(endpoint).socket(); if (rank == 0) { GenNCCLID(&nccl_ids); From c8b4d6bfd1d07d2c43e46e62dc795e41a3e8f6b3 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Tue, 10 Oct 2023 09:20:02 +0800 Subject: [PATCH 44/62] del unuseful op7 (#57807) --- paddle/fluid/operators/conv_shift_op.cc | 270 ------- paddle/fluid/operators/conv_shift_op.cu | 216 ------ paddle/fluid/operators/conv_shift_op.h | 33 - .../fluid/operators/detection/CMakeLists.txt | 3 - .../detection/roi_perspective_transform_op.cc | 711 ------------------ .../detection/roi_perspective_transform_op.cu | 551 -------------- paddle/fluid/operators/margin_rank_loss_op.cc | 191 ----- paddle/fluid/operators/margin_rank_loss_op.cu | 25 - paddle/fluid/operators/margin_rank_loss_op.h | 96 --- paddle/fluid/operators/marker_op.cc | 79 -- paddle/fluid/operators/marker_op.cu | 65 -- paddle/fluid/operators/unity_build_rule.cmake | 4 - .../ipu/popart_canonicalization/loss_ops.cc | 1 - test/legacy_test/test_conv_shift_op.py | 60 -- test/legacy_test/test_margin_rank_loss_op.py | 115 --- test/legacy_test/test_marker_op.py | 38 - .../test_roi_perspective_transform_op.py | 261 ------- test/white_list/check_shape_white_list.py | 1 - test/white_list/no_grad_set_white_list.py | 1 - test/white_list/op_accuracy_white_list.py | 2 - tools/gpups_test.sh | 1 - tools/parallel_UT_rule.py | 8 - tools/static_mode_white_list.py | 4 - 23 files changed, 2736 deletions(-) delete mode 100644 paddle/fluid/operators/conv_shift_op.cc delete mode 100644 paddle/fluid/operators/conv_shift_op.cu delete mode 100644 paddle/fluid/operators/conv_shift_op.h delete mode 100644 paddle/fluid/operators/detection/roi_perspective_transform_op.cc delete mode 100644 paddle/fluid/operators/detection/roi_perspective_transform_op.cu delete mode 100644 paddle/fluid/operators/margin_rank_loss_op.cc delete mode 100644 paddle/fluid/operators/margin_rank_loss_op.cu delete mode 100644 paddle/fluid/operators/margin_rank_loss_op.h delete mode 100644 paddle/fluid/operators/marker_op.cc delete mode 100644 paddle/fluid/operators/marker_op.cu delete mode 100644 test/legacy_test/test_conv_shift_op.py delete mode 100644 test/legacy_test/test_margin_rank_loss_op.py delete mode 100644 test/legacy_test/test_marker_op.py delete mode 100644 test/legacy_test/test_roi_perspective_transform_op.py diff --git a/paddle/fluid/operators/conv_shift_op.cc b/paddle/fluid/operators/conv_shift_op.cc deleted file mode 100644 index d2d8f56587cfd..0000000000000 --- a/paddle/fluid/operators/conv_shift_op.cc +++ /dev/null @@ -1,270 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/conv_shift_op.h" - -#include - -#include "paddle/fluid/framework/eigen.h" - -namespace paddle { -namespace operators { - -template -using EigenMatrix = framework::EigenMatrix; - -class ConvShiftOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ConvShiftOp"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "ConvShiftOp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ConvShiftOp"); - - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ( - x_dims.size(), - 2, - platform::errors::InvalidArgument( - "Input(X)'s dimensions of ConvShiftOp should be 2. " - "But received X's shape = [%s] and the dimension is %d.", - x_dims, - x_dims.size())); - PADDLE_ENFORCE_EQ( - y_dims.size(), - 2, - platform::errors::InvalidArgument( - "Input(Y)'s dimensions of ConvShiftOp should be 2. " - "But received Y's shape = [%s] and the dimension is %d.", - y_dims, - y_dims.size())); - if (ctx->IsRuntime() || (x_dims[0] > 0 && y_dims[0] > 0)) - PADDLE_ENFORCE_EQ( - x_dims[0], - y_dims[0], - platform::errors::InvalidArgument( - "The first dimension of Input(X) and Input(Y) of ConvShiftOp " - "should be equal. " - "But received X's shape = [%s], Y's shape = [%s], " - "and the first dimensions are %d and %d respectively.", - x_dims, - y_dims, - x_dims[0], - y_dims[0])); - if (ctx->IsRuntime() || y_dims[1] > 0) - PADDLE_ENFORCE_EQ( - y_dims[1] % 2, - 1, - platform::errors::InvalidArgument( - "The second dimension of Input(Y) of ConvShiftOp should be odd." - "But received Y's shape = [%s] and the second dimension is %d.", - y_dims, - y_dims[1])); - if (ctx->IsRuntime() || (x_dims[1] > 0 && y_dims[1] > 0)) - PADDLE_ENFORCE_LE( - y_dims[1], - x_dims[1], - platform::errors::InvalidArgument( - "The second dimension of Input(Y) of ConvShiftOp should be less " - "than or equal to the 2nd dimension of Input(X)." 
- "But received X's shape = [%s], Y's shape = [%s], " - "and the second dimensions are %d and %d respectively.", - x_dims, - y_dims, - x_dims[1], - y_dims[1])); - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } -}; - -class ConvShiftGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ConvShiftGradOp"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "ConvShiftGradOp"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - "Out@GRAD", - "ConvShiftGradOp"); - - auto x_grad_name = framework::GradVarName("X"); - if (ctx->HasOutput(x_grad_name)) { - auto x_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim(x_grad_name, x_dims); - } - - auto y_grad_name = framework::GradVarName("Y"); - if (ctx->HasOutput(y_grad_name)) { - auto y_dims = ctx->GetInputDim("Y"); - ctx->SetOutputDim(y_grad_name, y_dims); - } - } -}; - -class ConvShiftOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(Tensor, default Tensor), a 2-D tensor with shape B x M, " - "where B is the batch size and M is the data dimension."); - AddInput("Y", - "(Tensor, default Tensor), a 2-D tensor with shape B x N, " - "where B is the batch size and N is the data dimension. N must " - "be odd."); - AddOutput("Out", - "(Tensor, default Tensor), a 2-D tensor with shape B x M, " - "i.e., the same shape as X."); - AddComment(R"DOC( -ConvShift Operator. - -A layer for circular convolution of two vectors, -as used in the Neural Turing Machine: https://arxiv.org/abs/1410.5401 - -The equation is: - -$$Out[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} X_{i+j} * Y_{j}$$ - -where X's index is computed modulo M, and Y's index is computed modulo N. - -Both inputs X and Y can carry LoD (Level of Details) information. -However, the output only shares the LoD information with input X. 
- -)DOC"); - } -}; - -template -class ConvShiftKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *X = context.Input("X"); - auto *Y = context.Input("Y"); - auto *Out = context.Output("Out"); - Out->mutable_data(context.GetPlace()); - - auto x = EigenMatrix::From(*X); - auto y = EigenMatrix::From(*Y); - auto out = EigenMatrix::From(*Out); - out.setZero(); - - size_t batch_size = X->dims()[0]; - size_t x_width = X->dims()[1]; - size_t y_width = Y->dims()[1]; - size_t y_half_width = (y_width - 1) / 2; - - for (size_t k = 0; k < batch_size; ++k) { - for (size_t i = 0; i < x_width; ++i) { - for (size_t j = 0; j < y_width; ++j) { - int index = - static_cast((i + j - y_half_width + x_width) % x_width); - out(k, i) += x(k, index) * y(k, j); - } - } - } - } -}; - -template -class ConvShiftGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *X = context.Input("X"); - auto *Y = context.Input("Y"); - auto *dOut = context.Input(framework::GradVarName("Out")); - auto *dX = context.Output(framework::GradVarName("X")); - auto *dY = context.Output(framework::GradVarName("Y")); - - auto x = EigenMatrix::From(*X); - auto y = EigenMatrix::From(*Y); - auto dout = EigenMatrix::From(*dOut); - - auto x_dims = X->dims(); - auto y_dims = Y->dims(); - size_t batch_size = x_dims[0]; - size_t x_width = x_dims[1]; - size_t y_width = y_dims[1]; - size_t y_half_width = (y_width - 1) / 2; - - // The below trades code duplication for efficiency (keeping the if - // statement outside of the loop). - if (dX) { - dX->mutable_data(context.GetPlace()); - auto dx = EigenMatrix::From(*dX); - dx.setZero(); - for (size_t k = 0; k < batch_size; ++k) { - for (size_t i = 0; i < x_width; ++i) { - for (size_t j = 0; j < y_width; ++j) { - int index = - static_cast((i + j - y_half_width + x_width) % x_width); - dx(k, index) += dout(k, i) * y(k, j); - } - } - } - } - - if (dY) { - dY->mutable_data(context.GetPlace()); - auto dy = EigenMatrix::From(*dY); - dy.setZero(); - for (size_t k = 0; k < batch_size; ++k) { - for (size_t i = 0; i < x_width; ++i) { - for (size_t j = 0; j < y_width; ++j) { - int index = - static_cast((i + j - y_half_width + x_width) % x_width); - dy(k, j) += x(k, index) * dout(k, i); - } - } - } - } - } -}; - -template -class ConvShiftGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("conv_shift_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("Y", this->Input("Y")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y")); - op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(conv_shift, - ops::ConvShiftOp, - ops::ConvShiftOpMaker, - ops::ConvShiftGradOpMaker, - ops::ConvShiftGradOpMaker); -REGISTER_OPERATOR(conv_shift_grad, ops::ConvShiftGradOp); -PD_REGISTER_STRUCT_KERNEL( - conv_shift, CPU, ALL_LAYOUT, ops::ConvShiftKernel, float) {} -PD_REGISTER_STRUCT_KERNEL( - conv_shift_grad, CPU, ALL_LAYOUT, ops::ConvShiftGradKernel, float) {} diff --git a/paddle/fluid/operators/conv_shift_op.cu b/paddle/fluid/operators/conv_shift_op.cu deleted file mode 100644 index 
2ac37ac8d6f8f..0000000000000 --- a/paddle/fluid/operators/conv_shift_op.cu +++ /dev/null @@ -1,216 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/conv_shift_op.h" -#include "paddle/phi/backends/gpu/gpu_primitives.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -namespace { - -inline int DivUp(int x, int y) { return (x + y - 1) / y; } - -// Some notes on the design: -// -// Each thread is responsible for computing a single output out[k, i]. -// Thread blocks are based on tiles of x with height 1 in the batch dimension. -// -// This design is based on the typical use case where the filter -// y is fairly small. For large y, it would probably be more efficient -// to also tile across y. -template -__global__ void ConvShiftForward(const T *x, - const T *y, - int x_width, - int y_width, - int y_half_width, - int batch_size, - T *out) { - extern __shared__ T mem[]; - - int tx = threadIdx.x; - int i = blockIdx.x * blockDim.x + tx; // global x index - int k = blockIdx.y; // batch index - - // Check if we are in a boundary block with fewer x's to process than - // blockDim.x. - int num_x = - (blockIdx.x == gridDim.x - 1) ? (x_width % blockDim.x) : blockDim.x; - - T *sx = mem; - T *sx_pad = &mem[num_x]; - T *sy = &mem[blockDim.x + y_width]; - - // Collaboratively load y[k, :] and length-y padding of x into shared memory. - int pad_start = blockIdx.x * blockDim.x + num_x + x_width - y_half_width; - for (int j = tx; j < y_width; j += blockDim.x) { - sy[j] = y[k * y_width + j]; - sx_pad[j] = x[k * x_width + (pad_start + j) % x_width]; - } - - // Load a cyclically shifted slice of x into shared memory. - if (tx < num_x) { - int load_i = (i - y_half_width + x_width) % x_width; - sx[tx] = x[k * x_width + load_i]; - } - __syncthreads(); - - if (tx < num_x) { - // Compute dot product of sx[tx:tx + y_width] and sy. - T sum = 0; - for (int j = 0; j < y_width; ++j) { - sum += sx[tx + j] * sy[j]; - } - - // Save to out[k, i]. - out[k * x_width + i] = sum; - } -} - -// Compute x gradient - initial naive implementation with atomic add. -template -__global__ void ConvShiftGradX(const T *dout, - const T *y, - int x_width, - int y_width, - int y_half_width, - int batch_size, - T *dx) { - int i = blockIdx.x * blockDim.x + threadIdx.x; // x index - int j = blockIdx.y; // y index - int k = blockIdx.z; // batch index - - if (i < x_width) { - int index = (i + j - y_half_width + x_width) % x_width; - atomicAdd(&dx[k * x_width + index], - dout[k * x_width + i] * y[k * y_width + j]); - } -} - -// Compute y gradient - initial naive implementation with atomic add. 
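Both scatter kernels reuse the circular index from the forward pass: ConvShiftGradX above accumulates into dx, ConvShiftDy below into dy. In NumPy terms (a sketch of the gradient math, not a CUDA port):

    import numpy as np

    def conv_shift_grads(x, y, dout):
        M, N = len(x), len(y)
        half = (N - 1) // 2
        dx, dy = np.zeros(M), np.zeros(N)
        for i in range(M):
            for j in range(N):
                idx = (i + j - half) % M
                dx[idx] += dout[i] * y[j]  # what ConvShiftGradX atomicAdds
                dy[j] += x[idx] * dout[i]  # what ConvShiftDy atomicAdds
        return dx, dy

    dx, dy = conv_shift_grads(np.arange(5.0), np.ones(3), np.ones(5))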
-template -__global__ void ConvShiftDy(const T *x, - const T *dout, - int x_width, - int y_width, - int y_half_width, - int batch_size, - T *dy) { - int i = blockIdx.x * blockDim.x + threadIdx.x; // x index - int j = blockIdx.y; // y index - int k = blockIdx.z; // batch index - - if (i < x_width) { - int index = (i + j - y_half_width + x_width) % x_width; - atomicAdd(&dy[k * y_width + j], - x[k * x_width + index] * dout[k * x_width + i]); - } -} -} // namespace - -template -class ConvShiftKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const phi::DenseTensor *X = context.Input("X"); - const phi::DenseTensor *Y = context.Input("Y"); - phi::DenseTensor *Out = context.Output("Out"); - const T *x_data = X->data(); - const T *y_data = Y->data(); - T *out_data = Out->mutable_data(context.GetPlace()); - - int batch_size = X->dims()[0]; - int x_width = X->dims()[1]; - int y_width = Y->dims()[1]; - int y_half_width = (y_width - 1) / 2; - - const int x_per_block = 256; - int num_x_blocks = DivUp(x_width, x_per_block); - int mem_per_block = (x_per_block + 2 * y_width) * sizeof(T); - - dim3 grid_dim(num_x_blocks, batch_size); - - auto stream = context.template device_context().stream(); - - ConvShiftForward<<>>( - x_data, y_data, x_width, y_width, y_half_width, batch_size, out_data); - } -}; - -template -class ConvShiftGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const phi::DenseTensor *X = context.Input("X"); - const phi::DenseTensor *Y = context.Input("Y"); - const phi::DenseTensor *dOut = - context.Input(framework::GradVarName("Out")); - const T *x_data = X->data(); - const T *y_data = Y->data(); - const T *dout_data = dOut->data(); - - phi::DenseTensor *dX = - context.Output(framework::GradVarName("X")); - phi::DenseTensor *dY = - context.Output(framework::GradVarName("Y")); - - int batch_size = X->dims()[0]; - int x_width = X->dims()[1]; - int y_width = Y->dims()[1]; - int y_half_width = (y_width - 1) / 2; - - auto &device_ctx = context.template device_context(); - phi::funcs::SetConstant zero; - - const int x_per_block = 256; - int num_x_blocks = DivUp(x_width, x_per_block); - dim3 grid_dim(num_x_blocks, y_width, batch_size); - - if (dX) { - T *dx_data = dX->mutable_data(context.GetPlace()); - zero(device_ctx, dX, static_cast(0.0)); - ConvShiftGradX - <<>>(dout_data, - y_data, - x_width, - y_width, - y_half_width, - batch_size, - dx_data); - } - if (dY) { - T *dy_data = dY->mutable_data(context.GetPlace()); - zero(device_ctx, dY, static_cast(0.0)); - ConvShiftDy - <<>>(x_data, - dout_data, - x_width, - y_width, - y_half_width, - batch_size, - dy_data); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL( - conv_shift, GPU, ALL_LAYOUT, ops::ConvShiftKernel, float) {} -PD_REGISTER_STRUCT_KERNEL( - conv_shift_grad, GPU, ALL_LAYOUT, ops::ConvShiftGradKernel, float) {} diff --git a/paddle/fluid/operators/conv_shift_op.h b/paddle/fluid/operators/conv_shift_op.h deleted file mode 100644 index 603d1e5222155..0000000000000 --- a/paddle/fluid/operators/conv_shift_op.h +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class ConvShiftKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override; -}; - -template -class ConvShiftGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override; -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 1d990b4466a96..3c5c7df83440d 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -79,9 +79,6 @@ else() detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc) endif() -detection_library( - roi_perspective_transform_op SRCS roi_perspective_transform_op.cc - roi_perspective_transform_op.cu) #Export local libraries to parent # set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE) diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc deleted file mode 100644 index 51f058617edc6..0000000000000 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc +++ /dev/null @@ -1,711 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -bool GT_E(T a, T b) { - return (a > b) || fabs(a - b) < 1e-4; -} - -template -bool LT_E(T a, T b) { - return (a < b) || fabs(a - b) < 1e-4; -} - -template -bool GT(T a, T b) { - return (a - b) > 1e-4; -} - -/* - *check if (x, y) is in the boundary of roi - */ -template -bool in_quad(T x, T y, T roi_x[], T roi_y[]) { // NOLINT - for (int i = 0; i < 4; i++) { - T xs = roi_x[i]; - T ys = roi_y[i]; - T xe = roi_x[(i + 1) % 4]; - T ye = roi_y[(i + 1) % 4]; - if (fabs(ys - ye) < 1e-4) { - if (fabs(y - ys) < 1e-4 && fabs(y - ye) < 1e-4 && - GT_E(x, std::min(xs, xe)) && LT_E(x, std::max(xs, xe))) { - return true; - } - } else { - T intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs; - if (fabs(intersec_x - x) < 1e-4 && GT_E(y, std::min(ys, ye)) && - LT_E(y, std::max(ys, ye))) { - return true; - } - } - } - - int n_cross = 0; - for (int i = 0; i < 4; i++) { - T xs = roi_x[i]; - T ys = roi_y[i]; - T xe = roi_x[(i + 1) % 4]; - T ye = roi_y[(i + 1) % 4]; - if (fabs(ys - ye) < 1e-4) { - continue; - } - if (LT_E(y, std::min(ys, ye)) || GT(y, std::max(ys, ye))) { - continue; - } - T intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs; - if (fabs(intersec_x - x) < 1e-4) { - return true; - } - if (GT(intersec_x, x)) { - n_cross++; - } - } - return (n_cross % 2 == 1); -} - -/** - * Get the matrix of perspective transform. - * - * dx1 = x1 - x2 - * dx2 = x3 - x2 - * dx3 = x0 - x1 + x2 - x3 - * dy1 = y1 - y2 - * dy2 = y3 - y2 - * dy3 = y0 - y1 + y2 - y3 - * - * a11 = (x1 - x0 + a31 * (w - 1) * x1) / (w - 1) - * a12 = (x3 - x0 + a32 * (h - 1) * x3) / (h - 1) - * a13 = x0 - * a21 = (y1 - y0 + a31 * (w - 1) * y1) / (w - 1) - * a22 = (y3 - y0 + a32 * (h - 1) * y3) / (h - 1) - * a23 = y0 - * a31 = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) / (w - 1) - * a32 = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) / (h - 1) - * a33 = 1 - */ -template -void get_transform_matrix(const int transformed_width, - const int transformed_height, - T roi_x[], // NOLINT - T roi_y[], // NOLINT - T matrix[]) { // NOLINT - T x0 = roi_x[0]; - T x1 = roi_x[1]; - T x2 = roi_x[2]; - T x3 = roi_x[3]; - T y0 = roi_y[0]; - T y1 = roi_y[1]; - T y2 = roi_y[2]; - T y3 = roi_y[3]; - - // Estimate the height and width of RoI - T len1 = sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1)); - T len2 = sqrt((x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2)); - T len3 = sqrt((x2 - x3) * (x2 - x3) + (y2 - y3) * (y2 - y3)); - T len4 = sqrt((x3 - x0) * (x3 - x0) + (y3 - y0) * (y3 - y0)); - T estimated_height = (len2 + len4) / 2.0; - T estimated_width = (len1 + len3) / 2.0; - - // Get the normalized height and normalized width - int normalized_height = std::max(2, transformed_height); - int normalized_width = - std::round(estimated_width * (normalized_height - 1) / estimated_height) + - 1; - normalized_width = std::max(2, std::min(normalized_width, transformed_width)); - - T dx1 = x1 - x2; - T dx2 = x3 - x2; - T dx3 = x0 - x1 + x2 - x3; - T dy1 = y1 - y2; - T dy2 = y3 - y2; - T dy3 = y0 - y1 + y2 - y3; - - matrix[6] = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1 + 1e-5) / - (normalized_width - 1); - matrix[7] = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1 + 1e-5) / - (normalized_height - 1); - matrix[8] = 1; - - matrix[3] = (y1 - y0 + matrix[6] * (normalized_width - 1) * y1) / - (normalized_width - 1); - matrix[4] = (y3 - y0 + matrix[7] * 
(normalized_height - 1) * y3) / - (normalized_height - 1); - matrix[5] = y0; - - matrix[0] = (x1 - x0 + matrix[6] * (normalized_width - 1) * x1) / - (normalized_width - 1); - matrix[1] = (x3 - x0 + matrix[7] * (normalized_height - 1) * x3) / - (normalized_height - 1); - matrix[2] = x0; -} - -/** - * Get the source coordinates in the input feature map. - * - * (u, v, w)^matrix = matrix * (out_w, out_h, 1)^matrix - * - * in_w = u / w - * in_h = v / w - * - */ -template -void get_source_coords( - T matrix[], int out_w, int out_h, T* in_w, T* in_h) { // NOLINT - T u = matrix[0] * out_w + matrix[1] * out_h + matrix[2]; - T v = matrix[3] * out_w + matrix[4] * out_h + matrix[5]; - T w = matrix[6] * out_w + matrix[7] * out_h + matrix[8]; - - in_w[0] = u / w; - in_h[0] = v / w; -} - -/** - * Perform bilinear interpolation in the input feature map. - */ -template -void bilinear_interpolate(const T* in_data, - const int channels, - const int width, - const int height, - int in_n, - int in_c, - T in_w, - T in_h, - T* val) { - // Deal with cases that source coords are out of feature map boundary - if (GT_E(-0.5, in_w) || GT_E(in_w, width - 0.5) || - GT_E(-0.5, in_h) || GT_E(in_h, height - 0.5)) { - // empty - val[0] = 0.0; - return; - } - - if (GT_E(0, in_w)) { - in_w = 0; - } - if (GT_E(0, in_h)) { - in_h = 0; - } - - int in_w_floor = floor(in_w); - int in_h_floor = floor(in_h); - int in_w_ceil; - int in_h_ceil; - - if (GT_E(in_w_floor, width - 1)) { - in_w_ceil = in_w_floor = width - 1; - in_w = static_cast(in_w_floor); - } else { - in_w_ceil = in_w_floor + 1; - } - - if (GT_E(in_h_floor, height - 1)) { - in_h_ceil = in_h_floor = height - 1; - in_h = static_cast(in_h_floor); - } else { - in_h_ceil = in_h_floor + 1; - } - T w_floor = in_w - in_w_floor; - T h_floor = in_h - in_h_floor; - T w_ceil = 1 - w_floor; - T h_ceil = 1 - h_floor; - const T* data = in_data + (in_n * channels + in_c) * height * width; - // Do bilinear interpolation - T v1 = data[in_h_floor * width + in_w_floor]; - T v2 = data[in_h_ceil * width + in_w_floor]; - T v3 = data[in_h_ceil * width + in_w_ceil]; - T v4 = data[in_h_floor * width + in_w_ceil]; - T w1 = w_ceil * h_ceil; - T w2 = w_ceil * h_floor; - T w3 = w_floor * h_floor; - T w4 = w_floor * h_ceil; - val[0] = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; -} - -template -class CPUROIPerspectiveTransformOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - auto* mask = ctx.Output("Mask"); - auto* out_transform_matrix = - ctx.Output("TransformMatrix"); - auto transformed_height = ctx.Attr("transformed_height"); - auto transformed_width = ctx.Attr("transformed_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = phi::vectorize(in->dims()); - int channels = static_cast(in_dims[1]); - int in_height = static_cast(in_dims[2]); - int in_width = static_cast(in_dims[3]); - int rois_num = static_cast(rois->dims()[0]); - - const T* input_data = in->data(); - int* mask_data = mask->mutable_data(ctx.GetPlace()); - - phi::DenseTensor roi2image; - roi2image.Resize({rois_num}); - int* roi2image_data = roi2image.mutable_data(ctx.GetPlace()); - auto lod = rois->lod().back(); - for (size_t i = 0; i < lod.size() - 1; ++i) { - for (size_t j = lod[i]; j < lod[i + 1]; ++j) { - roi2image_data[j] = static_cast(i); - } - } - - T* output_data = out->mutable_data(ctx.GetPlace()); - const T* rois_data = rois->data(); - - T* 
transform_matrix = - out_transform_matrix->mutable_data({rois_num, 9}, ctx.GetPlace()); - - for (int n = 0; n < rois_num; ++n) { - const T* n_rois = rois_data + n * 8; - std::array roi_x; - std::array roi_y; - for (int k = 0; k < 4; ++k) { - roi_x[k] = n_rois[2 * k] * spatial_scale; - roi_y[k] = n_rois[2 * k + 1] * spatial_scale; - } - int image_id = roi2image_data[n]; - // Get transform matrix - std::array matrix; - get_transform_matrix(transformed_width, - transformed_height, - roi_x.data(), - roi_y.data(), - matrix.data()); - for (int i = 0; i < 9; i++) { - transform_matrix[n * 9 + i] = matrix[i]; - } - for (int c = 0; c < channels; ++c) { - for (int out_h = 0; out_h < transformed_height; ++out_h) { - for (int out_w = 0; out_w < transformed_width; ++out_w) { - int out_index = - n * channels * transformed_height * transformed_width + - c * transformed_height * transformed_width + - out_h * transformed_width + out_w; - T in_w, in_h; - get_source_coords(matrix.data(), out_w, out_h, &in_w, &in_h); - if (in_quad(in_w, in_h, roi_x.data(), roi_y.data())) { - if (GT_E(-0.5, in_w) || - GT_E(in_w, static_cast(in_width - 0.5)) || - GT_E(-0.5, in_h) || - GT_E(in_h, static_cast(in_height - 0.5))) { - output_data[out_index] = 0.0; - mask_data[(n * transformed_height + out_h) * transformed_width + - out_w] = 0; - } else { - bilinear_interpolate(input_data, - channels, - in_width, - in_height, - image_id, - c, - in_w, - in_h, - output_data + out_index); - mask_data[(n * transformed_height + out_h) * transformed_width + - out_w] = 1; - } - } else { - output_data[out_index] = 0.0; - mask_data[(n * transformed_height + out_h) * transformed_width + - out_w] = 0; - } - } - } - } - } - } -}; - -template -T get_feature_gradient( - T xs, T ys, int w, int h, const int width, const int height) { - if (GT_E(-0.5, xs) || GT_E(xs, width - 0.5) || GT_E(-0.5, ys) || - GT_E(ys, height - 0.5)) { - return 0; - } - - if (GT_E(0, xs)) { - xs = 0; - } - if (GT_E(0, ys)) { - ys = 0; - } - - int xs_floor = floor(xs); - int ys_floor = floor(ys); - int xs_ceil; - int ys_ceil; - - if (GT_E(xs_floor, width - 1)) { - xs_ceil = xs_floor = width - 1; - xs = static_cast(xs_floor); - } else { - xs_ceil = xs_floor + 1; - } - - if (GT_E(ys_floor, height - 1)) { - ys_ceil = ys_floor = height - 1; - ys = static_cast(ys_floor); - } else { - ys_ceil = ys_floor + 1; - } - - T weight = 0; - if (w == xs_floor) { - if (h == ys_floor) { - weight = (w + 1 - xs) * (h + 1 - ys); - } else if (h == ys_ceil) { - weight = (w + 1 - xs) * (ys + 1 - h); - } - } else if (w == xs_ceil) { - if (h == ys_floor) { - weight = (xs + 1 - w) * (h + 1 - ys); - } else if (h == ys_ceil) { - weight = (xs + 1 - w) * (ys + 1 - h); - } - } - return weight; -} - -template -class CPUROIPerspectiveTransformGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - - auto transformed_height = ctx.Attr("transformed_height"); - auto transformed_width = ctx.Attr("transformed_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = phi::vectorize(in->dims()); - int batch_size = in_dims[0]; - int channels = in_dims[1]; - int in_height = in_dims[2]; - int in_width = in_dims[3]; - int rois_num = static_cast(rois->dims()[0]); - - T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); - const T* out_grad_data = 
-        out_grad->data<T>();
-    const T* rois_data = rois->data<T>();
-
-    phi::DenseTensor roi2image;
-    roi2image.Resize({rois_num});
-    int* roi2image_data = roi2image.mutable_data<int>(ctx.GetPlace());
-    auto lod = rois->lod().back();
-    for (size_t i = 0; i < lod.size() - 1; ++i) {
-      for (size_t j = lod[i]; j < lod[i + 1]; ++j) {
-        roi2image_data[j] = static_cast<int>(i);
-      }
-    }
-
-    for (int n = 0; n < batch_size; ++n) {
-      for (int c = 0; c < channels; ++c) {
-        for (int in_h = 0; in_h < in_height; ++in_h) {
-          for (int in_w = 0; in_w < in_width; ++in_w) {
-            T gradient = 0.0;
-            for (size_t roi_idx = lod[n]; roi_idx < lod[n + 1]; ++roi_idx) {
-              const T* rois = rois_data + roi_idx * 8;
-              std::array<T, 4> roi_x;
-              std::array<T, 4> roi_y;
-              for (int k = 0; k < 4; ++k) {
-                roi_x[k] = rois[2 * k] * spatial_scale;
-                roi_y[k] = rois[2 * k + 1] * spatial_scale;
-              }
-
-              // Get transform matrix
-              std::array<T, 9> matrix;
-              get_transform_matrix(transformed_width,
-                                   transformed_height,
-                                   roi_x.data(),
-                                   roi_y.data(),
-                                   matrix.data());
-              const T* out_grad_ptr = out_grad_data + (roi_idx * channels + c) *
-                                                          transformed_height *
-                                                          transformed_width;
-              for (int out_h = 0; out_h < transformed_height; ++out_h) {
-                for (int out_w = 0; out_w < transformed_width; ++out_w) {
-                  T src_w;
-                  T src_h;
-                  get_source_coords(
-                      matrix.data(), out_w, out_h, &src_w, &src_h);
-                  if (in_quad(src_w, src_h, roi_x.data(), roi_y.data())) {
-                    if (GT_E<T>(-0.5, src_w) ||
-                        GT_E<T>(src_w, static_cast<T>(in_width - 0.5)) ||
-                        GT_E<T>(-0.5, src_h) ||
-                        GT_E<T>(src_h, static_cast<T>(in_height - 0.5))) {
-                      continue;
-                    }
-                    T weight = get_feature_gradient(
-                        src_w, src_h, in_w, in_h, in_width, in_height);
-                    gradient +=
-                        out_grad_ptr[out_h * transformed_width + out_w] *
-                        weight;
-                  }
-                }
-              }
-            }
-            int out_idx = (n * channels + c) * in_height * in_width +
-                          in_h * in_width + in_w;
-            in_grad_data[out_idx] = gradient;
-          }
-        }
-      }
-    }
-  }
-};
-
-class ROIPerspectiveTransformOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(
-        ctx->HasInput("X"), "Input", "X", "roi_perspective_transform");
-    OP_INOUT_CHECK(
-        ctx->HasInput("ROIs"), "Input", "ROIs", "roi_perspective_transform");
-    OP_INOUT_CHECK(
-        ctx->HasOutput("Out"), "Output", "Out", "roi_perspective_transform");
-
-    auto input_dims = ctx->GetInputDim("X");
-    auto rois_dims = ctx->GetInputDim("ROIs");
-
-    PADDLE_ENFORCE_EQ(input_dims.size(),
-                      4,
-                      platform::errors::InvalidArgument(
-                          "The format of input tensor must be NCHW. But "
-                          "received input dims is %d.",
-                          input_dims.size()));
-    PADDLE_ENFORCE_EQ(
-        rois_dims.size(),
-        2,
-        platform::errors::InvalidArgument(
-            "ROIs should be a 2-D phi::DenseTensor of shape (num_rois, 8) "
-            "given as [[x0, y0, x1, y1, x2, y2, x3, y3], ...]. But received "
-            "rois dims is %d",
-            rois_dims.size()));
-    PADDLE_ENFORCE_EQ(
-        rois_dims[1],
-        8,
-        platform::errors::InvalidArgument(
-            "ROIs should be a 2-D phi::DenseTensor of shape (num_rois, 8) "
-            "given as [[x0, y0, x1, y1, x2, y2, x3, y3], ...]. But received %d",
-            rois_dims[1]));
-
-    int transformed_height = ctx->Attrs().Get<int>("transformed_height");
-    int transformed_width = ctx->Attrs().Get<int>("transformed_width");
-    float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
-
-    PADDLE_ENFORCE_GT(
-        transformed_height,
-        0,
-        platform::errors::InvalidArgument("The transformed output height must "
-                                          "be greater than 0. But received %d.",
-                                          transformed_height));
-    PADDLE_ENFORCE_GT(
-        transformed_width,
-        0,
-        platform::errors::InvalidArgument("The transformed output width must "
-                                          "be greater than 0. But received %d.",
-                                          transformed_width));
-    PADDLE_ENFORCE_GT(
-        spatial_scale,
-        0.0f,
-        platform::errors::InvalidArgument(
-            "The spatial scale must be greater than 0. But received %f.",
-            spatial_scale));
-    std::vector<int64_t> out_dims_v({rois_dims[0],   // num_rois
-                                     input_dims[1],  // channels
-                                     static_cast<int64_t>(transformed_height),
-                                     static_cast<int64_t>(transformed_width)});
-    auto out_dims = phi::make_ddim(out_dims_v);
-
-    std::vector<int64_t> mask_dims_v({rois_dims[0],  // num_rois
-                                      1,             // channels
-                                      static_cast<int64_t>(transformed_height),
-                                      static_cast<int64_t>(transformed_width)});
-    auto mask_dims = phi::make_ddim(mask_dims_v);
-
-    std::vector<int64_t> matrix_dims_v({rois_dims[0], 9});
-    auto matrix_dims = phi::make_ddim(matrix_dims_v);
-
-    ctx->SetOutputDim("Out", out_dims);
-    ctx->SetOutputDim("Mask", mask_dims);
-    ctx->SetOutputDim("TransformMatrix", matrix_dims);
-    ctx->SetOutputDim("Out2InIdx", out_dims);
-    ctx->SetOutputDim("Out2InWeights", out_dims);
-    ctx->ShareLoD("ROIs", /*->*/ "Out");
-  }
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"),
-                          ctx.GetPlace());
-  }
-};
-
-class ROIPerspectiveTransformGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input",
-                   "Out@Grad",
-                   "roi_perspective_transform_grad");
-    OP_INOUT_CHECK(ctx->HasOutputs(framework::GradVarName("X")),
-                   "Output",
-                   "X@Grad",
-                   "roi_perspective_transform_grad");
-
-    ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
-  }
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"),
-                          ctx.GetPlace());
-  }
-};
-
-class ROIPerspectiveTransformOpMaker
-    : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(phi::DenseTensor), "
-             "the input of ROIPerspectiveTransformOp. "
-             "The format of input tensor is NCHW, where N is batch size, "
-             "C is the number of input channels, "
-             "H is the height of the feature, and "
-             "W is the width of the feature.");
-    AddInput("ROIs",
-             "(phi::DenseTensor), "
-             "ROIs (Regions of Interest) to be transformed. "
-             "Should be a 2-D phi::DenseTensor of shape (num_rois, 8) "
-             "given as [[x1, y1, x2, y2, x3, y3, x4, y4], ...]."
- "(x1, y1) is the top left coordinates, and " - "(x2, y2) is the top right coordinates, and" - "(x3, y3) is the bottom right coordinates, and" - "(x4, y4) is the bottom left coordinates."); - AddOutput( - "Out", - "(phi::DenseTensor), " - "The output of ROIPerspectiveTransformOp is a 4-D tensor with shape " - "(num_rois, channels, transformed_h, transformed_w)."); - AddOutput("Mask", - "(phi::DenseTensor), " - "The output mask of ROIPerspectiveTransformOp is a 4-D tensor " - "with shape " - "(num_rois, 1, transformed_h, transformed_w)."); - AddOutput("TransformMatrix", - "(phi::DenseTensor), " - "The output transform matrix of ROIPerspectiveTransformOp is a " - "1-D tensor with shape " - "(num_rois, 9)."); - AddOutput("Out2InIdx", - "(phi::DenseTensor), " - "An intermediate tensor used to map indexes of input feature map " - "and indexes of output feature map." - "The shape of the tensor is [out_size, 4] and out_size is the " - "number of elements in output feature map.") - .AsIntermediate(); - AddOutput("Out2InWeights", - "(phi::DenseTensor), " - "An intermediate tensor used to record the weights of bilinear " - "interpolatein for each element in output. The shape of the " - "tensor is [out_size, 4] and out_size is the number of elements " - "in output feature map.") - .AsIntermediate(); - AddAttr("spatial_scale", - "(float, default 1.0), " - "Spatial scale factor to scale ROI coords.") - .SetDefault(1.0); - AddAttr("transformed_height", - "(int, default 1), " - "The height of transformed output.") - .SetDefault(1); - AddAttr("transformed_width", - "(int, default 1), " - "The width of transformed output.") - .SetDefault(1); - AddComment(R"DOC( -**ROIPerspectiveTransform Operator** - - )DOC"); - } -}; - -template -class ROIPerspectiveTransformGradMaker - : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("roi_perspective_transform_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("ROIs", this->Input("ROIs")); - op->SetInput("Out2InIdx", this->Output("Out2InIdx")); - op->SetInput("Out2InWeights", this->Output("Out2InWeights")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - roi_perspective_transform, - ops::ROIPerspectiveTransformOp, - ops::ROIPerspectiveTransformOpMaker, - ops::ROIPerspectiveTransformGradMaker, - ops::ROIPerspectiveTransformGradMaker); -REGISTER_OPERATOR(roi_perspective_transform_grad, - ops::ROIPerspectiveTransformGradOp); -PD_REGISTER_STRUCT_KERNEL(roi_perspective_transform, - CPU, - ALL_LAYOUT, - ops::CPUROIPerspectiveTransformOpKernel, - float) {} -PD_REGISTER_STRUCT_KERNEL(roi_perspective_transform_grad, - CPU, - ALL_LAYOUT, - ops::CPUROIPerspectiveTransformGradOpKernel, - float) {} diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu deleted file mode 100644 index 3a94bcafd669f..0000000000000 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ /dev/null @@ -1,551 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/phi/backends/gpu/gpu_primitives.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -using paddle::platform::float16; -using phi::PADDLE_CUDA_NUM_THREADS; - -namespace paddle { -namespace operators { - -// CUDA: index helpers -#define idx4_4(index, d1, d2, d3, d4) (index % d4) -#define idx4_3(index, d1, d2, d3, d4) ((index / d4) % d3) -#define idx4_2(index, d1, d2, d3, d4) ((index / d4 / d3) % d2) -#define idx4_1(index, d1, d2, d3, d4) ((index / d4 / d3 / d2) % d1) - -template -__device__ bool GT_E(T a, T b) { - return (a > b) || Eigen::numext::abs(a - b) < 1e-4; -} - -template -__device__ bool LT_E(T a, T b) { - return (a < b) || Eigen::numext::abs(a - b) < 1e-4; -} - -template -__device__ bool GT(T a, T b) { - return (a - b) > 1e-4; -} - -template -__device__ T max(T a, T b) { - return a > b ? a : b; -} - -template -__device__ T min(T a, T b) { - return a < b ? a : b; -} - -/* - * check if (x, y) is in the boundary of roi - */ -template -__device__ bool in_quad(T x, T y, T roi_x[], T roi_y[]) { - for (int i = 0; i < 4; i++) { - T start_w = roi_x[i]; - T start_h = roi_y[i]; - T end_w = roi_x[(i + 1) % 4]; - T end_h = roi_y[(i + 1) % 4]; - if (fabs(start_h - end_h) < 1e-4) { - if (fabs(y - start_h) < 1e-4 && fabs(y - end_h) < 1e-4 && - GT_E(x, min(start_w, end_w)) && - LT_E(x, max(start_w, end_w))) { - return true; - } - } else { - T intersec_x = - (y - start_h) * (end_w - start_w) / (end_h - start_h) + start_w; - if (fabs(intersec_x - x) < 1e-4 && GT_E(y, min(start_h, end_h)) && - LT_E(y, max(start_h, end_h))) { - return true; - } - } - } - - int n_cross = 0; - for (int i = 0; i < 4; i++) { - T start_w = roi_x[i]; - T start_h = roi_y[i]; - T end_w = roi_x[(i + 1) % 4]; - T end_h = roi_y[(i + 1) % 4]; - if (fabs(start_h - end_h) < 1e-4) { - continue; - } - if (LT_E(y, min(start_h, end_h)) || - GT(y, max(start_h, end_h))) { - continue; - } - T intersec_x = - (y - start_h) * (end_w - start_w) / (end_h - start_h) + start_w; - if (fabs(intersec_x - x) < 1e-4) { - return true; - } - if (GT(intersec_x, x)) { - n_cross++; - } - } - return (n_cross % 2 == 1); -} - -/** - * Perform bilinear interpolation in the input feature map. 
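- *
- * With dx = in_w - floor(in_w) and dy = in_h - floor(in_h), the value is a
- * weighted sum of the four neighbouring pixels (a worked restatement of the
- * code below, with v(h, w) denoting the input pixel at row h, column w):
- *
- *   val = (1 - dx) * (1 - dy) * v(h_floor, w_floor)
- *       + (1 - dx) * dy       * v(h_ceil,  w_floor)
- *       + dx       * dy       * v(h_ceil,  w_ceil)
- *       + dx       * (1 - dy) * v(h_floor, w_ceil)
- *
- * For example, dx = dy = 0.5 weights all four neighbours equally at 0.25.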
- */ -template -__device__ void bilinear_interpolate(const T* in_data, - const int channels, - const int width, - const int height, - int in_n, - int in_c, - T in_w, - T in_h, - T* val, - int out_idx, - int* out2in_idx, - T* out2in_w) { - // Deal with cases that source coords are out of feature map boundary - if (GT_E(-0.5, in_w) || GT_E(in_w, width - 0.5) || - GT_E(-0.5, in_h) || GT_E(in_h, height - 0.5)) { - val[0] = 0.0; - return; - } - - if (GT_E(0, in_w)) { - in_w = 0; - } - if (GT_E(0, in_h)) { - in_h = 0; - } - - int in_w_floor = floor(in_w); - int in_h_floor = floor(in_h); - int in_w_ceil; - int in_h_ceil; - - if (GT_E(in_w_floor, width - 1)) { - in_w_ceil = in_w_floor = width - 1; - in_w = static_cast(in_w_floor); - } else { - in_w_ceil = in_w_floor + 1; - } - - if (GT_E(in_h_floor, height - 1)) { - in_h_ceil = in_h_floor = height - 1; - in_h = static_cast(in_h_floor); - } else { - in_h_ceil = in_h_floor + 1; - } - - T w_floor = in_w - in_w_floor; - T h_floor = in_h - in_h_floor; - T w_ceil = 1 - w_floor; - T h_ceil = 1 - h_floor; - const T* data = in_data + (in_n * channels + in_c) * height * width; - // Do bilinear interpolation - T v1 = data[in_h_floor * width + in_w_floor]; - T v2 = data[in_h_ceil * width + in_w_floor]; - T v3 = data[in_h_ceil * width + in_w_ceil]; - T v4 = data[in_h_floor * width + in_w_ceil]; - T w1 = w_ceil * h_ceil; - T w2 = w_ceil * h_floor; - T w3 = w_floor * h_floor; - T w4 = w_floor * h_ceil; - val[0] = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; - - int base_idx = (in_n * channels + in_c) * height * width; - out2in_idx[out_idx * 4] = base_idx + in_h_floor * width + in_w_floor; - out2in_idx[out_idx * 4 + 1] = base_idx + in_h_ceil * width + in_w_floor; - out2in_idx[out_idx * 4 + 2] = base_idx + in_h_ceil * width + in_w_ceil; - out2in_idx[out_idx * 4 + 3] = base_idx + in_h_floor * width + in_w_ceil; - out2in_w[out_idx * 4] = w1; - out2in_w[out_idx * 4 + 1] = w2; - out2in_w[out_idx * 4 + 2] = w3; - out2in_w[out_idx * 4 + 3] = w4; -} - -/** - * Get the source coordinates in the input feature map. - * - * (u, v, w)^matrix = T * (out_w, out_h, 1)^matrix - * - * in_w = u / w - * in_h = v / w - * - */ -template -__device__ void get_source_coords( - T matrix[], int out_w, int out_h, T* in_w, T* in_h) { - T u = matrix[0] * out_w + matrix[1] * out_h + matrix[2]; - T v = matrix[3] * out_w + matrix[4] * out_h + matrix[5]; - T w = matrix[6] * out_w + matrix[7] * out_h + matrix[8]; - - in_w[0] = u / w; - in_h[0] = v / w; -} - -/** - * Get the matrix of perspective transform. 
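- * The matrix maps homogeneous output coordinates (out_w, out_h, 1) to
- * homogeneous source coordinates (u, v, w); dividing by w recovers the
- * input-space point (see get_source_coords). As a quick check of the
- * formulas below: a13 = x0, a23 = y0 and a33 = 1, so the output origin
- * (0, 0) maps to (u, v, w) = (x0, y0, 1), i.e. exactly the ROI corner
- * (x0, y0).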
- * - * dx1 = x1 - x2 - * dx2 = x3 - x2 - * dx3 = x0 - x1 + x2 - x3 - * dy1 = y1 - y2 - * dy2 = y3 - y2 - * dy3 = y0 - y1 + y2 - y3 - * - * a11 = (x1 - x0 + a31 * (w - 1) * x1) / (w - 1) - * a12 = (x3 - x0 + a32 * (h - 1) * x3) / (h - 1) - * a13 = x0 - * a21 = (y1 - y0 + a31 * (w - 1) * y1) / (w - 1) - * a22 = (y3 - y0 + a32 * (h - 1) * y3) / (h - 1) - * a23 = y0 - * a31 = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) / (w - 1) - * a32 = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) / (h - 1) - * a33 = 1 - * - */ -template -__device__ void get_transform_matrix(const int transformed_width, - const int transformed_height, - T roi_x[], - T roi_y[], - T matrix[]) { - T x0 = roi_x[0]; - T x1 = roi_x[1]; - T x2 = roi_x[2]; - T x3 = roi_x[3]; - T y0 = roi_y[0]; - T y1 = roi_y[1]; - T y2 = roi_y[2]; - T y3 = roi_y[3]; - - // Estimate the height and width of RoI - T len1 = sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1)); - T len2 = sqrt((x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2)); - T len3 = sqrt((x2 - x3) * (x2 - x3) + (y2 - y3) * (y2 - y3)); - T len4 = sqrt((x3 - x0) * (x3 - x0) + (y3 - y0) * (y3 - y0)); - T estimated_height = (len2 + len4) / 2.0; - T estimated_width = (len1 + len3) / 2.0; - - // Get the normalized height and normalized width - int normalized_height = max(2, transformed_height); - int normalized_width = - round(estimated_width * (normalized_height - 1) / estimated_height) + 1; - normalized_width = max(2, min(normalized_width, transformed_width)); - - T dx1 = x1 - x2; - T dx2 = x3 - x2; - T dx3 = x0 - x1 + x2 - x3; - T dy1 = y1 - y2; - T dy2 = y3 - y2; - T dy3 = y0 - y1 + y2 - y3; - - matrix[6] = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1 + 1e-5) / - (normalized_width - 1); - matrix[7] = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1 + 1e-5) / - (normalized_height - 1); - matrix[8] = 1; - - matrix[3] = (y1 - y0 + matrix[6] * (normalized_width - 1) * y1) / - (normalized_width - 1); - matrix[4] = (y3 - y0 + matrix[7] * (normalized_height - 1) * y3) / - (normalized_height - 1); - matrix[5] = y0; - - matrix[0] = (x1 - x0 + matrix[6] * (normalized_width - 1) * x1) / - (normalized_width - 1); - matrix[1] = (x3 - x0 + matrix[7] * (normalized_height - 1) * x3) / - (normalized_height - 1); - matrix[2] = x0; -} - -template -__global__ void RoiTransformKernel(const float* input_data, - const float* rois_data, - const int* roi2image_data, - int num_rois, - int in_height, - int in_width, - int channels, - int transformed_height, - int transformed_width, - float spatial_scale, - T* output_data, - int* out2in_idx, - T* out2in_w, - int* mask, - T* transform_matrix) { - int output_size = - num_rois * transformed_height * transformed_width * channels; - CUDA_KERNEL_LOOP(index, output_size) { - // (n, c, out_h, out_w) is an element in the transformed output - int out_w = idx4_4( - index, num_rois, channels, transformed_height, transformed_width); - int out_h = idx4_3( - index, num_rois, channels, transformed_height, transformed_width); - int c = idx4_2( - index, num_rois, channels, transformed_height, transformed_width); - int n = idx4_1( - index, num_rois, channels, transformed_height, transformed_width); - - auto bottom_rois = rois_data + n * 8; - int roi_batch_ind = bottom_rois[0]; - T roi_x[4]; - T roi_y[4]; - for (int k = 0; k < 4; ++k) { - roi_x[k] = bottom_rois[2 * k] * spatial_scale; - roi_y[k] = bottom_rois[2 * k + 1] * spatial_scale; - } - - // Get transform matrix - T matrix[9]; - get_transform_matrix( - transformed_width, transformed_height, roi_x, roi_y, matrix); 
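-    // Every thread that falls in ROI n derives the same nine matrix entries
-    // from roi_x/roi_y, so the per-ROI stores below are redundant across
-    // threads but idempotent (all threads of a ROI write identical values).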
- for (int i = 0; i < 9; i++) { - transform_matrix[n * 9 + i] = matrix[i]; - } - // Get source coords - T in_w; - T in_h; - get_source_coords(matrix, out_w, out_h, &in_w, &in_h); - - if (in_quad(in_w, in_h, roi_x, roi_y)) { - if (GT_E(-0.5, in_w) || - GT_E(in_w, static_cast(in_width - 0.5)) || - GT_E(-0.5, in_h) || - GT_E(in_h, static_cast(in_height - 0.5))) { - // Skip if source coords is not in input image - output_data[index] = 0.0; - mask[(n * transformed_height + out_h) * transformed_width + out_w] = 0; - } else { - // Perform bilinear interpolation - int in_n = roi2image_data[n]; - bilinear_interpolate(input_data, - channels, - in_width, - in_height, - in_n, - c, - in_w, - in_h, - output_data + index, - index, - out2in_idx, - out2in_w); - mask[(n * transformed_height + out_h) * transformed_width + out_w] = 1; - } - - } else { - // Skip if source coords is not in quad - output_data[index] = 0.0; - mask[(n * transformed_height + out_h) * transformed_width + out_w] = 0; - } - } -} - -template -class CUDAROIPerspectiveTransformOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - auto* out2in_idx = ctx.Output("Out2InIdx"); - auto* out2in_w = ctx.Output("Out2InWeights"); - auto* mask = ctx.Output("Mask"); - auto* out_transform_matrix = - ctx.Output("TransformMatrix"); - - int* mask_data = mask->mutable_data(ctx.GetPlace()); - int* out2in_idx_data = - out2in_idx->mutable_data({out->numel(), 4}, ctx.GetPlace()); - T* out2in_w_data = - out2in_w->mutable_data({out->numel(), 4}, ctx.GetPlace()); - - phi::funcs::SetConstant init; - init(ctx.cuda_device_context(), out2in_idx, static_cast(-1)); - - auto transformed_height = ctx.Attr("transformed_height"); - auto transformed_width = ctx.Attr("transformed_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int channels = in_dims[1]; - int in_height = in_dims[2]; - int in_width = in_dims[3]; - int rois_num = rois->dims()[0]; - - const T* input_data = in->data(); - T* output_data = out->mutable_data(ctx.GetPlace()); - const T* rois_data = rois->data(); - - phi::DenseTensor roi2image; - phi::DenseTensor roi2image_dev; - roi2image.Resize({rois_num}); - int* roi2image_data = roi2image.mutable_data(platform::CPUPlace()); - auto lod = rois->lod().back(); - for (size_t i = 0; i < lod.size() - 1; ++i) { - for (size_t j = lod[i]; j < lod[i + 1]; ++j) { - roi2image_data[j] = i; - } - } - paddle::framework::TensorCopySync( - roi2image, ctx.GetPlace(), &roi2image_dev); - - int out_size = rois_num * transformed_height * transformed_width * channels; - auto stream = ctx.cuda_device_context().stream(); - int block = 512; - int grid = (out_size + block - 1) / block; - - // Get transform matrix - T* matrix = - out_transform_matrix->mutable_data({rois_num, 9}, ctx.GetPlace()); - - RoiTransformKernel<<>>(input_data, - rois_data, - roi2image_dev.data(), - rois_num, - in_height, - in_width, - channels, - transformed_height, - transformed_width, - spatial_scale, - output_data, - out2in_idx_data, - out2in_w_data, - mask_data, - matrix); - } -}; - -template -__device__ T get_feature_gradient( - T xs, T ys, int w, int h, const int width, const int height) { - if (GT_E(-0.5, xs) || GT_E(xs, width - 0.5) || GT_E(-0.5, ys) || - GT_E(ys, height - 0.5)) { - return 0; - } - - if (GT_E(0, xs)) { - xs = 0; - } - if (GT_E(0, ys)) { - ys = 0; - } - - 
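-  // The remainder computes d(val)/d(input[h][w]): the contribution of input
-  // pixel (w, h) to the bilinearly interpolated value at (xs, ys). The
-  // returned weight is that pixel's forward-pass coefficient, e.g. for the
-  // floor/floor corner (w == xs_floor, h == ys_floor) it equals
-  // (1 - (xs - xs_floor)) * (1 - (ys - ys_floor)), matching w1 in
-  // bilinear_interpolate.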
int xs_floor = floor(xs); - int ys_floor = floor(ys); - int xs_ceil; - int ys_ceil; - - if (GT_E(xs_floor, width - 1)) { - xs_ceil = xs_floor = width - 1; - xs = static_cast(xs_floor); - } else { - xs_ceil = xs_floor + 1; - } - - if (GT_E(ys_floor, height - 1)) { - ys_ceil = ys_floor = height - 1; - ys = static_cast(ys_floor); - } else { - ys_ceil = ys_floor + 1; - } - - T weight = 0; - if (w == xs_floor) { - if (h == ys_floor) { - weight = (w + 1 - xs) * (h + 1 - ys); - } else if (h == ys_ceil) { - weight = (w + 1 - xs) * (ys + 1 - h); - } - } else if (w == xs_ceil) { - if (h == ys_floor) { - weight = (xs + 1 - w) * (h + 1 - ys); - } else if (h == ys_ceil) { - weight = (xs + 1 - w) * (ys + 1 - h); - } - } - return weight; -} - -template -__global__ void RoiTransformGradKernel(int out_size, - const int* out2in_idx_data, - const T* out2in_w_data, - const T* out_grad_data, - T* in_grad_data) { - CUDA_KERNEL_LOOP(index, out_size * 4) { - int in_idx = out2in_idx_data[index]; - if (in_idx >= 0) { - int out_idx = index / 4; - atomicAdd(in_grad_data + in_idx, - out_grad_data[out_idx] * out2in_w_data[index]); - } - } -} - -template -class CUDAROIPerspectiveTransformGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out2in_idx = ctx.Input("Out2InIdx"); - auto* out2in_w = ctx.Input("Out2InWeights"); - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - - T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); - - phi::funcs::SetConstant set_zero; - set_zero(ctx.cuda_device_context(), in_grad, static_cast(0)); - - const T* out_grad_data = out_grad->data(); - const int* out2in_idx_data = out2in_idx->data(); - const T* out2in_w_data = out2in_w->data(); - - int out_size = out_grad->numel(); - auto stream = ctx.cuda_device_context().stream(); - int block = 512; - int grid = (out_size * 4 + block - 1) / block; - - RoiTransformGradKernel<<>>( - out_size, out2in_idx_data, out2in_w_data, out_grad_data, in_grad_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL(roi_perspective_transform, - GPU, - ALL_LAYOUT, - ops::CUDAROIPerspectiveTransformOpKernel, - float) {} -PD_REGISTER_STRUCT_KERNEL(roi_perspective_transform_grad, - GPU, - ALL_LAYOUT, - ops::CUDAROIPerspectiveTransformGradOpKernel, - float) {} diff --git a/paddle/fluid/operators/margin_rank_loss_op.cc b/paddle/fluid/operators/margin_rank_loss_op.cc deleted file mode 100644 index 2aaf8f74af359..0000000000000 --- a/paddle/fluid/operators/margin_rank_loss_op.cc +++ /dev/null @@ -1,191 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/
-
-#include "paddle/fluid/operators/margin_rank_loss_op.h"
-
-#include
-
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-
-class MarginRankLossOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    // input check
-    OP_INOUT_CHECK(
-        ctx->HasInput("Label"), "Input", "Label", "margin_rank_loss");
-    OP_INOUT_CHECK(ctx->HasInput("X1"), "Input", "X1", "margin_rank_loss");
-    OP_INOUT_CHECK(ctx->HasInput("X2"), "Input", "X2", "margin_rank_loss");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "margin_rank_loss");
-
-    auto label_dims = ctx->GetInputDim("Label");
-    auto x1_dims = ctx->GetInputDim("X1");
-    auto x2_dims = ctx->GetInputDim("X2");
-
-    PADDLE_ENFORCE_EQ(
-        label_dims,
-        x1_dims,
-        platform::errors::InvalidArgument(
-            "The shape of Input(Label) should equal the shape of "
-            "Input(X1). Received: Input(Label)'s shape: [%s], Input(X1)'s "
-            "shape: [%s].",
-            label_dims,
-            x1_dims));
-    PADDLE_ENFORCE_EQ(
-        x1_dims,
-        x2_dims,
-        platform::errors::InvalidArgument(
-            "The shape of Input(X1) should equal the shape of "
-            "Input(X2). Received: Input(X1)'s shape: [%s], Input(X2)'s shape: "
-            "[%s].",
-            x1_dims,
-            x2_dims));
-    PADDLE_ENFORCE_EQ(
-        label_dims.size(),
-        2,
-        platform::errors::InvalidArgument(
-            "The dimensions of Input(Label) should be 2. Received: "
-            "the shape of Input(Label): [%s], the dimensions of Input(Label): "
-            "%d.",
-            label_dims,
-            label_dims.size()));
-    PADDLE_ENFORCE_EQ(label_dims[1],
-                      1,
-                      platform::errors::InvalidArgument(
-                          "The second dimension of Input(Label) should be 1. "
-                          "Received: the shape of Input(Label): [%s].",
-                          label_dims));
-    ctx->SetOutputDim("Activated", label_dims);
-    ctx->SetOutputDim("Out", label_dims);
-  }
-};
-
-template <typename T>
-class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X1",
-             "(2-D tensor with shape [batch_size x 1]) The score for "
-             "one item X1 to be ranked, from pairwise ranking model.");
-    AddInput("X2",
-             "(2-D tensor with shape [batch_size x 1]) The score for "
-             "another item X2 to be ranked, from pairwise ranking model.");
-    AddInput("Label",
-             "(2-D tensor with shape [batch_size x 1]) "
-             "The label indicating whether X1 is ranked higher than X2; "
-             "can only be +1 or -1.");
-    AddOutput("Activated",
-              "(2-D tensor with shape [batch_size x 1]) Intermediate tensor "
-              "to indicate whether each element of Output(Out) is activated.")
-        .AsIntermediate();
-    AddOutput("Out",
-              "(2-D tensor with shape [batch_size x 1]) "
-              "The output loss of MarginRankLoss operator.");
-    AddAttr<T>("margin", "(scalar, default 0) Margin for MarginRankLossOp.")
-        .SetDefault(static_cast<T>(0));
-    AddComment(R"DOC(
-MarginRankLoss Operator.
-
-This operator measures the loss given a pair of training samples
-{`X1`, `X2`} and the `Label` with attribute `margin`, where `Label = +1`
-indicates that `X1` is ranked higher than `X2` and `Label = -1` otherwise.
-The loss is calculated as:
-
-$loss(X1, X2, Label) = \max(0, -Label * (X1 - X2) + margin)$
-
-The attribute `margin` here helps make the predictions more robust.
-Denote the item ranked higher as the positive sample and the other as the
-negative sample.
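-For example, with `margin = 0.5`, `X1 = 0.8`, `X2 = 0.6` and `Label = +1`, the
-loss is $\max(0, -(0.8 - 0.6) + 0.5) = 0.3$: `X1` scores higher, but by less
-than the margin, so the pair is still penalized.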
If the score of the two samples satisfies - -$positive sample - negative sample < margin$ - -the pair of samples will contribute to the final loss, which will backpropagate -and train the ranking model to enlarge the difference between the two scores. - -For batch input with size `batch_size`, `X1`, `X2` and `Label` -all have the same shape [batch_size x 1]. - -)DOC"); - } -}; - -class MarginRankLossGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("Label"), "Input", "Label", "margin_rank_loss_grad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "margin_rank_loss_grad"); - OP_INOUT_CHECK(ctx->HasInput("Activated"), - "Input", - "Activated", - "margin_rank_loss_grad"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X1")), - "Output", - framework::GradVarName("X1"), - "margin_rank_loss_grad"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X2")), - "Output", - framework::GradVarName("X2"), - "margin_rank_loss_grad"); - - auto dims = ctx->GetInputDim("Label"); - ctx->SetOutputDim(framework::GradVarName("X1"), dims); - ctx->SetOutputDim(framework::GradVarName("X2"), dims); - } -}; - -template -class MarginRankLossGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("margin_rank_loss_grad"); - op->SetInput("Activated", this->Output("Activated")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetInput("Label", this->Input("Label")); - op->SetOutput(framework::GradVarName("X1"), this->InputGrad("X1")); - op->SetOutput(framework::GradVarName("X2"), this->InputGrad("X2")); - op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle -namespace ops = paddle::operators; - -REGISTER_OPERATOR(margin_rank_loss, - ops::MarginRankLossOp, - ops::MarginRankLossOpMaker, - ops::MarginRankLossGradMaker, - ops::MarginRankLossGradMaker); -REGISTER_OPERATOR(margin_rank_loss_grad, ops::MarginRankLossGradOp); - -PD_REGISTER_STRUCT_KERNEL( - margin_rank_loss, CPU, ALL_LAYOUT, ops::MarginRankLossKernel, float) {} -PD_REGISTER_STRUCT_KERNEL(margin_rank_loss_grad, - CPU, - ALL_LAYOUT, - ops::MarginRankLossGradKernel, - float) {} diff --git a/paddle/fluid/operators/margin_rank_loss_op.cu b/paddle/fluid/operators/margin_rank_loss_op.cu deleted file mode 100644 index 8c6c2ee055f9c..0000000000000 --- a/paddle/fluid/operators/margin_rank_loss_op.cu +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/margin_rank_loss_op.h" - -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL( - margin_rank_loss, GPU, ALL_LAYOUT, ops::MarginRankLossKernel, float) {} -PD_REGISTER_STRUCT_KERNEL(margin_rank_loss_grad, - GPU, - ALL_LAYOUT, - ops::MarginRankLossGradKernel, - float) {} diff --git a/paddle/fluid/operators/margin_rank_loss_op.h b/paddle/fluid/operators/margin_rank_loss_op.h deleted file mode 100644 index 49cbb1168f1b5..0000000000000 --- a/paddle/fluid/operators/margin_rank_loss_op.h +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -struct ReLU { - HOSTDEVICE T operator()(const T& val) const { - return val > 0 ? val : static_cast(0); - } -}; - -template -struct Heaviside { - HOSTDEVICE T operator()(const T& val) const { - return static_cast(val > 0 ? 1 : 0); - } -}; - -template -class MarginRankLossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* out_t = ctx.Output("Out"); - auto* act_t = ctx.Output("Activated"); - - auto* label_t = ctx.Input("Label"); - auto* x1_t = ctx.Input("X1"); - auto* x2_t = ctx.Input("X2"); - - out_t->mutable_data(ctx.GetPlace()); - act_t->mutable_data(ctx.GetPlace()); - - auto margin = static_cast(ctx.Attr("margin")); - auto out = framework::EigenVector::Flatten(*out_t); - auto act = framework::EigenVector::Flatten(*act_t); - - auto label = framework::EigenVector::Flatten(*label_t); - auto x1 = framework::EigenVector::Flatten(*x1_t); - auto x2 = framework::EigenVector::Flatten(*x2_t); - - auto& dev = *ctx.template device_context().eigen_device(); - out.device(dev) = (-label * (x1 - x2) + margin).unaryExpr(ReLU()); - act.device(dev) = out.unaryExpr(Heaviside()); - } -}; - -template -class MarginRankLossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_x1_t = ctx.Output(framework::GradVarName("X1")); - auto* d_x2_t = ctx.Output(framework::GradVarName("X2")); - - auto* act_t = ctx.Input("Activated"); - auto* d_out_t = ctx.Input(framework::GradVarName("Out")); - auto* label_t = ctx.Input("Label"); - - auto d_out = framework::EigenVector::Flatten(*d_out_t); - auto act = framework::EigenVector::Flatten(*act_t); - auto label = framework::EigenVector::Flatten(*label_t); - auto& dev = *ctx.template device_context().eigen_device(); - - // compute d_x1 - if (d_x1_t) { - d_x1_t->mutable_data(ctx.GetPlace()); - auto d_x1 = framework::EigenVector::Flatten(*d_x1_t); - d_x1.device(dev) = -d_out * act * label; - } - // compute d_x2 - if (d_x2_t) { - d_x2_t->mutable_data(ctx.GetPlace()); - auto d_x2 = framework::EigenVector::Flatten(*d_x2_t); - d_x2.device(dev) = d_out * act * label; - } - } -}; -} // namespace operators -} // namespace paddle diff --git 
a/paddle/fluid/operators/marker_op.cc b/paddle/fluid/operators/marker_op.cc deleted file mode 100644 index 0735e63c229b7..0000000000000 --- a/paddle/fluid/operators/marker_op.cc +++ /dev/null @@ -1,79 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - -namespace paddle { -namespace operators { - -class MarkerOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - std::string marker_role = ctx->Attrs().Get("marker_role"); - std::string marker_pos = ctx->Attrs().Get("marker_pos"); - - VLOG(3) << "The role is:" << marker_role << ";" - << "The position is:" << marker_pos << "."; - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(framework::proto::VarType::FP32, ctx.GetPlace()); - } -}; - -class MarkerOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddAttr("marker_role", - "(string, default forward)forward or backward," - " mark different stages of porcess.") - .SetDefault("forward"); - AddAttr( - "marker_pos", - "(string, default B)the posititon where the marker is placed, " - "B stands for begin of duration," - " E stands for end of duration.") - .SetDefault("B"); - AddComment( - R"DOC(Marker Operator - Add marker at the beginning/end of a forward/backward process.)DOC"); - } -}; - -template -class MarkerOpCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto marker_role = ctx.Attr("marker_role"); - auto marker_pos = ctx.Attr("marker_pos"); - - platform::RecordEvent record_event( - "MarkerCPU", - "marker_" + marker_role + "_" + marker_pos, - platform::TracerEventType::OperatorInner, - 1, - platform::EventRole::kInnerOp); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(marker, ops::MarkerOp, ops::MarkerOpMaker); -PD_REGISTER_STRUCT_KERNEL( - marker, CPU, ALL_LAYOUT, ops::MarkerOpCPUKernel, float) {} diff --git a/paddle/fluid/operators/marker_op.cu b/paddle/fluid/operators/marker_op.cu deleted file mode 100644 index 1feb6a2b2616f..0000000000000 --- a/paddle/fluid/operators/marker_op.cu +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - -namespace paddle { -namespace operators { - -template -__global__ void SimpleMarkerKernel(T* in, T* out, int ndim) { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - for (; idx < ndim; idx += blockDim.x * gridDim.x) { - out[idx] = in[idx]; - } -} - -template -class MarkerOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - - auto marker_role = ctx.Attr("marker_role"); - auto marker_pos = ctx.Attr("marker_pos"); - VLOG(3) << "marker role: " << marker_role - << " marker position: " << marker_pos; - - phi::DenseTensor A; - phi::DenseTensor B; - auto* in_temp = A.mutable_data({32, 1}, ctx.GetPlace()); - auto* out_temp = B.mutable_data({32, 1}, ctx.GetPlace()); - platform::RecordEvent record_event( - "MarkerCUDA", - "marker_" + marker_role + "_" + marker_pos, - platform::TracerEventType::OperatorInner, - 1, - platform::EventRole::kInnerOp); - SimpleMarkerKernel - <<<1, 32, 0, dev_ctx.stream()>>>(in_temp, out_temp, 32); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -PD_REGISTER_STRUCT_KERNEL( - marker, GPU, ALL_LAYOUT, ops::MarkerOpCUDAKernel, float) {} diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake index 9151e1b4a2c5c..50a4ab2cc5eca 100644 --- a/paddle/fluid/operators/unity_build_rule.cmake +++ b/paddle/fluid/operators/unity_build_rule.cmake @@ -152,7 +152,6 @@ register_unity_group( cc log_loss_op.cc lookup_table_v2_op.cc - margin_rank_loss_op.cc masked_select_op.cc match_matrix_tensor_op.cc matmul_op.cc @@ -165,7 +164,6 @@ register_unity_group( register_unity_group( cc concat_op.cc - conv_shift_op.cc dequantize_log_op.cc dropout_op.cc expand_op.cc @@ -430,13 +428,11 @@ register_unity_group( cu log_loss_op.cu lookup_table_v2_op.cu - margin_rank_loss_op.cu masked_select_op.cu shuffle_channel_op.cu softmax_cudnn_op.cu) register_unity_group( cu - conv_shift_op.cu dequantize_log_op.cu dropout_op.cu fake_quantize_op.cu diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/loss_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/loss_ops.cc index 48962456d4ca7..f645a5862c716 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/loss_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/loss_ops.cc @@ -717,4 +717,3 @@ REGISTER_HANDLER(bce_loss, binary_cross_entropy_handler); REGISTER_HANDLER(huber_loss, huber_loss_handler); REGISTER_HANDLER(warpctc, warpctc_handler); REGISTER_HANDLER(rank_loss, rank_loss_handler); -REGISTER_HANDLER(margin_rank_loss, margin_rank_loss_handler); diff --git a/test/legacy_test/test_conv_shift_op.py b/test/legacy_test/test_conv_shift_op.py deleted file mode 100644 index 26965d9b393cb..0000000000000 --- a/test/legacy_test/test_conv_shift_op.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest - - -def conv_shift_forward(x, y): - out = np.zeros_like(x) - M = x.shape[1] - N = y.shape[1] - y_half_width = (N - 1) // 2 - for i in range(M): - for j in range(N): - out[:, i] += x[:, (i + j + M - y_half_width) % M] * y[:, j] - return out - - -class TestConvShiftOp(OpTest): - def setUp(self): - self.op_type = "conv_shift" - - batch_size = 10 - x_dim = 17 - y_dim = 11 # must be odd and <= x_dim - x = np.random.random((batch_size, x_dim)).astype("float32") - y = np.random.random((batch_size, y_dim)).astype("float32") - self.inputs = {'X': x, 'Y': y} - - out = conv_shift_forward(x, y) - self.outputs = {'Out': out} - - def test_check_output(self): - self.check_output() - - def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out') - - def test_check_grad_ignore_x(self): - self.check_grad(['Y'], 'Out') - - def test_check_grad_ignore_y(self): - self.check_grad(['X'], 'Out') - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_margin_rank_loss_op.py b/test/legacy_test/test_margin_rank_loss_op.py deleted file mode 100644 index a795bc23694b3..0000000000000 --- a/test/legacy_test/test_margin_rank_loss_op.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
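-# The checks below compare against a NumPy restatement of the operator's
-# documented formula (an illustrative reference, not a helper in this file):
-#
-#   loss = max(0, -label * (x1 - x2) + margin)
-#   act  = 1.0 if loss > 0 else 0.0
-#
-# e.g. label = -1, x1 = 0.8, x2 = 0.6, margin = 0.5 gives
-# loss = max(0, 0.2 + 0.5) = 0.7 and act = 1.0.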
- -import unittest - -import numpy as np -from op_test import OpTest, paddle_static_guard - -import paddle -from paddle import base - - -class TestMarginRankLossOp(OpTest): - def setUp(self): - self.op_type = "margin_rank_loss" - batch_size = 5 - margin = 0.5 - # labels_{i} = {-1, 1} - label = ( - 2 * np.random.randint(0, 2, size=(batch_size, 1)).astype("float32") - - 1 - ) - x1 = np.random.random((batch_size, 1)).astype("float32") - x2 = np.random.random((batch_size, 1)).astype("float32") - # loss = max(0, -label * (x1 - x2) + margin) - loss = -label * (x1 - x2) + margin - loss = np.where(loss > 0, loss, 0) - act = np.where(loss > 0, 1.0, 0.0) - - self.attrs = {'margin': margin} - self.inputs = {'Label': label, 'X1': x1, 'X2': x2} - self.outputs = {'Activated': act, 'Out': loss} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X1", "X2"], "Out") - - def test_check_grad_ignore_x1(self): - self.check_grad(["X2"], "Out", no_grad_set=set('X1')) - - def test_check_grad_ignore_x2(self): - self.check_grad(["X1"], "Out", no_grad_set=set('X2')) - - -class TestMarginRankLossLayer(unittest.TestCase): - def setUp(self): - self.batch_size = 5 - self.margin = 0.5 - # labels_{i} = {-1, 1} - self.label = ( - 2 - * np.random.randint(0, 2, size=(self.batch_size, 1)).astype( - "float32" - ) - - 1 - ) - self.x1 = np.random.random((self.batch_size, 1)).astype("float32") - self.x2 = np.random.random((self.batch_size, 1)).astype("float32") - # loss = max(0, -label * (x1 - x2) + margin) - loss = -self.label * (self.x1 - self.x2) + self.margin - loss = np.where(loss > 0, loss, 0) - self.loss = loss - - def test_identity(self): - place = base.CPUPlace() - self.check_identity(place) - - if base.is_compiled_with_cuda(): - place = base.CUDAPlace(0) - self.check_identity(place) - - def check_identity(self, place): - with paddle_static_guard(): - main = base.Program() - start = base.Program() - with base.unique_name.guard(): - with base.program_guard(main, start): - label = paddle.static.data( - "label", (self.batch_size, 1), "float32" - ) - x1 = paddle.static.data( - "x1", (self.batch_size, 1), "float32" - ) - x2 = paddle.static.data( - "x2", (self.batch_size, 1), "float32" - ) - out = paddle.nn.functional.margin_ranking_loss( - x1, x2, label, self.margin, 'none' - ) - - exe = base.Executor(place) - exe.run(start) - (out_np,) = exe.run( - main, - feed={"label": self.label, "x1": self.x1, "x2": self.x2}, - fetch_list=[out], - ) - np.testing.assert_allclose(out_np, self.loss) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_marker_op.py b/test/legacy_test/test_marker_op.py deleted file mode 100644 index 21895d962318f..0000000000000 --- a/test/legacy_test/test_marker_op.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import unittest - -from op_test import OpTest - -from paddle.distributed.fleet.meta_optimizers.common import OpRole - - -class TestMarkerOp(OpTest): - def setUp(self): - self.op_type = "marker" - self.inputs = {} - self.attrs = { - 'marker_role': 'forward', - 'marker_pos': 'B', - 'op_role': OpRole.Forward, - } - self.outputs = {} - - def test_check_output(self): - # NODE(yjjiang11): This op will be deprecated. - self.check_output(check_dygraph=False) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_roi_perspective_transform_op.py b/test/legacy_test/test_roi_perspective_transform_op.py deleted file mode 100644 index 59a7a3f3b4a11..0000000000000 --- a/test/legacy_test/test_roi_perspective_transform_op.py +++ /dev/null @@ -1,261 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License") -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUWARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -from math import floor, sqrt - -import numpy as np - - -def gt_e(a, b): - return a > b or abs(a - b) < 1e-4 - - -def gt(a, b): - return (a - b) > 1e-4 - - -def lt_e(a, b): - return a < b or abs(a - b) < 1e-4 - - -def in_quad(x, y, roi_x, roi_y): - # check if (x, y) is in the boundary of roi - for i in range(4): - xs = roi_x[i] - ys = roi_y[i] - xe = roi_x[(i + 1) % 4] - ye = roi_y[(i + 1) % 4] - if abs(ys - ye) < 1e-4: - if ( - abs(y - ys) < 1e-4 - and abs(y - ye) < 1e-4 - and gt_e(x, min(xs, xe)) - and lt_e(x, max(xs, xe)) - ): - return True - else: - intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs - if ( - abs(intersec_x - x) < 1e-4 - and gt_e(y, min(ys, ye)) - and lt_e(y, max(ys, ye)) - ): - return True - n_cross = 0 - for i in range(4): - xs = roi_x[i] - ys = roi_y[i] - xe = roi_x[(i + 1) % 4] - ye = roi_y[(i + 1) % 4] - if abs(ys - ye) < 1e-4: - continue - if lt_e(y, min(ys, ye)) or gt(y, max(ys, ye)): - continue - intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs - if abs(intersec_x - x) < 1e-4: - return True - if gt(intersec_x, x): - n_cross += 1 - return n_cross % 2 == 1 - - -def get_transform_matrix(transformed_width, transformed_height, roi_x, roi_y): - x0 = roi_x[0] - x1 = roi_x[1] - x2 = roi_x[2] - x3 = roi_x[3] - y0 = roi_y[0] - y1 = roi_y[1] - y2 = roi_y[2] - y3 = roi_y[3] - - len1 = sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1)) - len2 = sqrt((x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2)) - len3 = sqrt((x2 - x3) * (x2 - x3) + (y2 - y3) * (y2 - y3)) - len4 = sqrt((x3 - x0) * (x3 - x0) + (y3 - y0) * (y3 - y0)) - estimated_height = (len2 + len4) / 2.0 - estimated_width = (len1 + len3) / 2.0 - - normalized_height = max(2, transformed_height) - normalized_width = ( - round(estimated_width * (normalized_height - 1) / estimated_height) + 1 - ) - normalized_width = max(2, min(normalized_width, transformed_width)) - - dx1 = x1 - x2 - dx2 = x3 - x2 - dx3 = x0 - x1 + x2 - x3 - dy1 = y1 - y2 - dy2 = y3 - y2 - dy3 = y0 - y1 + y2 - y3 - matrix = np.zeros([9]) - matrix[6] = ( - (dx3 * dy2 - dx2 * dy3) - / (dx1 * dy2 - dx2 * dy1 + 1e-5) - / (normalized_width - 1) - ) - matrix[7] = ( - (dx1 * 
dy3 - dx3 * dy1) - / (dx1 * dy2 - dx2 * dy1 + 1e-5) - / (normalized_height - 1) - ) - matrix[8] = 1 - - matrix[3] = (y1 - y0 + matrix[6] * (normalized_width - 1) * y1) / ( - normalized_width - 1 - ) - matrix[4] = (y3 - y0 + matrix[7] * (normalized_height - 1) * y3) / ( - normalized_height - 1 - ) - matrix[5] = y0 - - matrix[0] = (x1 - x0 + matrix[6] * (normalized_width - 1) * x1) / ( - normalized_width - 1 - ) - matrix[1] = (x3 - x0 + matrix[7] * (normalized_height - 1) * x3) / ( - normalized_height - 1 - ) - matrix[2] = x0 - return matrix - - -def get_source_coords(matrix, out_w, out_h): - u = matrix[0] * out_w + matrix[1] * out_h + matrix[2] - v = matrix[3] * out_w + matrix[4] * out_h + matrix[5] - w = matrix[6] * out_w + matrix[7] * out_h + matrix[8] - in_w = u / w - in_h = v / w - return in_w, in_h - - -def bilinear_interpolate(in_data, in_n, in_c, in_w, in_h): - batch_size = in_data.shape[0] - channels = in_data.shape[1] - height = in_data.shape[2] - width = in_data.shape[3] - - if ( - gt_e(-0.5, in_w) - or gt_e(in_w, width - 0.5) - or gt_e(-0.5, in_h) - or gt_e(in_h, height - 0.5) - ): - return 0.0 - - if gt_e(0, in_w): - in_w = 0 - if gt_e(0, in_h): - in_h = 0 - - in_w_floor = floor(in_w) - in_h_floor = floor(in_h) - - if gt_e(in_w_floor, width - 1): - in_w_ceil = width - 1 - in_w_floor = width - 1 - in_w = in_w_floor - else: - in_w_ceil = in_w_floor + 1 - - if gt_e(in_h_floor, height - 1): - in_h_ceil = height - 1 - in_h_floor = height - 1 - in_h = in_h_floor - else: - in_h_ceil = in_h_floor + 1 - - w_floor = in_w - in_w_floor - h_floor = in_h - in_h_floor - w_ceil = 1 - w_floor - h_ceil = 1 - h_floor - v1 = in_data[in_n][in_c][int(in_h_floor)][int(in_w_floor)] - v2 = in_data[in_n][in_c][int(in_h_ceil)][int(in_w_floor)] - v3 = in_data[in_n][in_c][int(in_h_ceil)][int(in_w_ceil)] - v4 = in_data[in_n][in_c][int(in_h_floor)][int(in_w_ceil)] - w1 = w_ceil * h_ceil - w2 = w_ceil * h_floor - w3 = w_floor * h_floor - w4 = w_floor * h_ceil - val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4 - return val - - -def lod_convert(lod): - ret = [0] - for count in lod: - ret.append(ret[-1] + count) - return ret - - -def roi_transform( - in_data, - rois, - rois_lod, - transformed_height, - transformed_width, - spatial_scale, -): - channels = in_data.shape[1] - in_height = in_data.shape[2] - in_width = in_data.shape[3] - rois_num = rois.shape[0] - - roi2image = [0] * rois_num - rois_lod = lod_convert(rois_lod[0]) - for i in range(len(rois_lod) - 1): - for j in range(rois_lod[i], rois_lod[i + 1]): - roi2image[j] = i - - out = np.zeros([rois_num, channels, transformed_height, transformed_width]) - mask = np.zeros( - [rois_num, 1, transformed_height, transformed_width] - ).astype('int') - matrix = np.zeros([rois_num, 9], dtype=in_data.dtype) - for n in range(rois_num): - roi_x = [] - roi_y = [] - for k in range(4): - roi_x.append(rois[n][2 * k] * spatial_scale) - roi_y.append(rois[n][2 * k + 1] * spatial_scale) - image_id = roi2image[n] - transform_matrix = get_transform_matrix( - transformed_width, transformed_height, roi_x, roi_y - ) - matrix[n] = transform_matrix - for c in range(channels): - for out_h in range(transformed_height): - for out_w in range(transformed_width): - in_w, in_h = get_source_coords( - transform_matrix, out_w, out_h - ) - if ( - in_quad(in_w, in_h, roi_x, roi_y) - and gt(in_w, -0.5) - and gt(in_width - 0.5, in_w) - and gt(in_h, -0.5) - and gt(in_height - 0.5, in_h) - ): - out[n][c][out_h][out_w] = bilinear_interpolate( - in_data, image_id, c, in_w, in_h - ) - mask[n][0][out_h][out_w] 
= 1 - else: - out[n][c][out_h][out_w] = 0.0 - mask[n][0][out_h][out_w] = 0 - return out.astype("float32"), mask, matrix - - -if __name__ == '__main__': - unittest.main() diff --git a/test/white_list/check_shape_white_list.py b/test/white_list/check_shape_white_list.py index 5785a51372e79..144505f3d75d9 100644 --- a/test/white_list/check_shape_white_list.py +++ b/test/white_list/check_shape_white_list.py @@ -18,7 +18,6 @@ 'conv2d_transpose', 'depthwise_conv2d_transpose', 'grid_sampler', - 'margin_rank_loss', 'matmul', 'scatter', 'soft_relu', diff --git a/test/white_list/no_grad_set_white_list.py b/test/white_list/no_grad_set_white_list.py index 23c9994715f7d..36210a8175025 100644 --- a/test/white_list/no_grad_set_white_list.py +++ b/test/white_list/no_grad_set_white_list.py @@ -57,7 +57,6 @@ 'lookup_table', 'lookup_table_v2', 'lstm', - 'margin_rank_loss', 'matmul', 'matmul_v2', 'mul', diff --git a/test/white_list/op_accuracy_white_list.py b/test/white_list/op_accuracy_white_list.py index f145e9e1f62e2..d2520739339eb 100644 --- a/test/white_list/op_accuracy_white_list.py +++ b/test/white_list/op_accuracy_white_list.py @@ -21,7 +21,6 @@ 'conv2d_transpose', 'conv3d', 'conv3d_transpose', - 'conv_shift', 'cudnn_lstm', 'cvm', 'data_norm', @@ -40,7 +39,6 @@ 'log_loss', 'logit', 'lrn', - 'margin_rank_loss', 'match_matrix_tensor', 'matmul', 'max_pool2d_with_index', diff --git a/tools/gpups_test.sh b/tools/gpups_test.sh index 31ad58a86456e..fff44b872461e 100644 --- a/tools/gpups_test.sh +++ b/tools/gpups_test.sh @@ -47,7 +47,6 @@ parallel_list="^init_phi_test$|\ ^test_conv3d_transpose_op$|\ ^test_conv_bn_fuse_pass_cc$|\ ^test_conv_nn_grad$|\ -^test_conv_shift_op$|\ ^test_conv_transpose_nn_grad$|\ ^test_convert_call$|\ ^test_convert_call_generator$|\ diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index e5055272e9c94..b684db687f236 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -866,7 +866,6 @@ 'test_distribution', 'test_box_clip_op', 'custom_tensor_test', - 'test_marker_op', 'test_dataloader_early_reset', 'test_gather_nd_op', 'test_tensor_register_hook', @@ -900,7 +899,6 @@ 'test_dygraph_spectral_norm', 'test_scale_mkldnn_op', 'test_load_state_dict_from_old_format', - 'test_margin_rank_loss_op', 'test_lookup_table_v2_op', 'test_mix_precision_all_reduce_fuse', 'test_spp_op', @@ -1047,7 +1045,6 @@ 'test_huber_loss_op', 'test_slice', 'test_label_smooth_functional', - 'test_conv_shift_op', 'test_imperative_optimizer_v2', 'test_len', 'test_imperative_named_members', @@ -1402,7 +1399,6 @@ 'test_imperative_save_load_v2', 'test_lookahead', 'test_moving_average_abs_max_scale_op', - 'test_roi_perspective_transform_op', 'test_tensorrt_engine', 'test_affine_grid_function', 'test_nonzero_api', @@ -2494,7 +2490,6 @@ 'test_complex_abs', 'test_subtract_op', 'test_complex_elementwise_layers', - 'test_marker_op', 'test_typing', 'test_cuda_empty_cache', 'test_randn_op', @@ -2701,7 +2696,6 @@ 'test_conv1d_transpose_layer', 'test_sequence_pool', 'test_conv_elementwise_add_fuse_pass', - 'test_conv_shift_op', 'test_sequence_expand_as', 'test_cos_sim_op', 'test_sequence_concat', @@ -2753,7 +2747,6 @@ 'test_lookup_table_v2_op', 'test_l1_norm_op', 'test_lstm_op', - 'test_margin_rank_loss_op', 'test_index_sample_op', 'test_imperative_save_load', 'test_imperative_ptb_rnn_sorted_gradient', @@ -2790,7 +2783,6 @@ 'test_device_guard', 'test_rnn_cells_static', 'test_deformable_psroi_pooling', - 'test_roi_perspective_transform_op', 'test_segment_ops', 'test_cvm_op', 'test_selu_op', diff 
--git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 095c2e2646f9f..f6165107624d1 100755 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -113,7 +113,6 @@ 'test_conv3d_transpose_part2_op', 'test_conv_nn_grad', 'test_conv_transpose_nn_grad', - 'test_conv_shift_op', 'test_cos_sim_op', 'test_create_global_var', 'test_crf_decoding_op', @@ -315,7 +314,6 @@ 'test_lrn_op', 'test_lstm_op', 'test_lstmp_op', - 'test_margin_rank_loss_op', 'test_math_op_patch', 'test_matmul_op', 'test_matmul_v2_op', @@ -428,7 +426,6 @@ 'test_rmsprop_op', 'test_rnn_cell_api', 'test_roi_align_op', - 'test_roi_perspective_transform_op', 'test_roi_pool_op', 'test_roll_op', 'test_row_conv', @@ -696,7 +693,6 @@ 'test_lamb_op_xpu', 'test_model_cast_to_bf16', 'test_sgd_op_bf16', - 'test_marker_op', 'test_c_embedding_op', 'test_class_center_sample_op', 'test_fill_diagonal_tensor_op', From 03de3f963494380743ae715bbbbd4d7e50324344 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Tue, 10 Oct 2023 09:20:14 +0800 Subject: [PATCH 45/62] [CleanOps]del_unuseful op8 (#57808) * del_unuseful op8 --- .../framework/ir/sync_batch_norm_pass.cc | 7 - paddle/fluid/framework/unused_var_check.cc | 2 - paddle/fluid/operators/inplace_abn_op.cc | 615 ------------------ paddle/fluid/operators/inplace_abn_op.cu | 237 ------- paddle/fluid/operators/inplace_abn_op.h | 130 ---- paddle/fluid/operators/unity_build_rule.cmake | 2 - paddle/fluid/pybind/eager_generator.h | 10 - paddle/phi/api/yaml/op_compat.yaml | 5 - test/legacy_test/CMakeLists.txt | 1 - test/legacy_test/test_inplace_abn_op.py | 113 ---- tools/parallel_UT_rule.py | 2 - tools/static_mode_white_list.py | 1 - 12 files changed, 1125 deletions(-) delete mode 100644 paddle/fluid/operators/inplace_abn_op.cc delete mode 100644 paddle/fluid/operators/inplace_abn_op.cu delete mode 100644 paddle/fluid/operators/inplace_abn_op.h delete mode 100644 test/legacy_test/test_inplace_abn_op.py diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass.cc b/paddle/fluid/framework/ir/sync_batch_norm_pass.cc index 2fc711979194a..828418597e623 100644 --- a/paddle/fluid/framework/ir/sync_batch_norm_pass.cc +++ b/paddle/fluid/framework/ir/sync_batch_norm_pass.cc @@ -39,13 +39,6 @@ class SyncBatchNormPass : public Pass { if (op->Type() == "batch_norm_grad") { op->SetType("sync_batch_norm_grad"); } - // process synchronize in inplace_abn - if (op->Type() == "inplace_abn") { - op->SetAttr("use_sync_bn", true); - } - if (op->Type() == "inplace_abn_grad") { - op->SetAttr("use_sync_bn", true); - } } } } diff --git a/paddle/fluid/framework/unused_var_check.cc b/paddle/fluid/framework/unused_var_check.cc index ad21fdf45698b..16e6109a60657 100644 --- a/paddle/fluid/framework/unused_var_check.cc +++ b/paddle/fluid/framework/unused_var_check.cc @@ -59,8 +59,6 @@ static const std::unordered_set &GetOpWithUnusedVarAllowSet() { "batch_norm_grad", // 0 "sync_batch_norm", // 0 "sync_batch_norm_grad", // 0 - "inplace_abn", // 0 - "inplace_abn_grad", // 0 "dgc_momentum", // 0 "fake_quantize_range_abs_max", // 0 "rmsprop", // 0 diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc deleted file mode 100644 index a53a9867b9903..0000000000000 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ /dev/null @@ -1,615 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/inplace_abn_op.h" - -#include -#include -#include -#include "paddle/phi/kernels/batch_norm_grad_kernel.h" -#include "paddle/phi/kernels/batch_norm_kernel.h" - -namespace paddle { -namespace operators { - -class InplaceABNOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "BatchNorm"); - OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", "BatchNorm"); - OP_INOUT_CHECK(ctx->HasInput("Bias"), "Input", "Bias", "BatchNorm"); - OP_INOUT_CHECK(ctx->HasInput("Mean"), "Input", "Mean", "BatchNorm"); - OP_INOUT_CHECK(ctx->HasInput("Variance"), "Input", "Variance", "BatchNorm"); - OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "BatchNorm"); - - bool is_test = ctx->Attrs().Get("is_test"); - bool trainable_stats = ctx->Attrs().Get("trainable_statistics"); - bool test_mode = is_test && (!trainable_stats); - if (!test_mode) { - OP_INOUT_CHECK( - ctx->HasOutput("MeanOut"), "Output", "MeanOut", "BatchNorm"); - OP_INOUT_CHECK( - ctx->HasOutput("VarianceOut"), "Output", "VarianceOut", "BatchNorm"); - OP_INOUT_CHECK( - ctx->HasOutput("SavedMean"), "Output", "SavedMean", "BatchNorm"); - OP_INOUT_CHECK(ctx->HasOutput("SavedVariance"), - "Output", - "SavedVariance", - "BatchNorm"); - } - - // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python - PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], - ctx->Outputs("MeanOut")[0], - platform::errors::InvalidArgument( - "Mean and MeanOut should share the same memory")); - PADDLE_ENFORCE_EQ( - ctx->Inputs("Variance")[0], - ctx->Outputs("VarianceOut")[0], - platform::errors::InvalidArgument( - "Variance and VarianceOut should share the same memory")); - - const auto x_dims = ctx->GetInputDim("X"); - - for (int i = 0; i < x_dims.size(); i++) { - PADDLE_ENFORCE_EQ( - (x_dims[i] == -1) || (x_dims[i] > 0), - true, - platform::errors::InvalidArgument( - "Each dimension of input tensor is expected to be -1 or a " - "positive number, but received %d. Input's shape is [%s].", - x_dims[i], - x_dims)); - } - - const DataLayout data_layout = - phi::StringToDataLayout(ctx->Attrs().Get("data_layout")); - - if (ctx->IsRuntime() && ctx->HasInput("MomentumTensor")) { - auto mom = ctx->Inputs("MomentumTensor"); - PADDLE_ENFORCE_EQ(mom.size(), - 1, - platform::errors::InvalidArgument( - "The input tensor MomentumTensor's size must be 1. " - "But received: MomentumTensor's size is [%d]", - mom.size())); - } - - PADDLE_ENFORCE_GE(x_dims.size(), - 2, - platform::errors::InvalidArgument( - "ShapeError: the dimension of input " - "X must be greater than or equal to 2. But received: " - "the shape of input " - "X = [%s], the dimension of input X = [%d]", - x_dims, - x_dims.size())); - PADDLE_ENFORCE_LE(x_dims.size(), - 5, - platform::errors::InvalidArgument( - "ShapeError: the dimension of input X " - "must be smaller than or equal to 5. 
But received: the " - "shape of input X " - "= [%s], the dimension of input X = [%d]", - x_dims, - x_dims.size())); - VLOG(4) << ctx->IsRunMKLDNNKernel(); - VLOG(4) << data_layout; - const int64_t C = ((ctx->IsRunMKLDNNKernel() == true) || - (data_layout == DataLayout::kNCHW) - ? x_dims[1] - : x_dims[x_dims.size() - 1]); - - auto scale_dim = ctx->GetInputDim("Scale"); - auto bias_dim = ctx->GetInputDim("Bias"); - - PADDLE_ENFORCE_EQ( - scale_dim.size(), - 1UL, - platform::errors::InvalidArgument( - "ShapeError: the dimension of scale must be equal to 1. " - "But received: the shape of scale is [%s], the dimension " - "of scale is [%d]", - scale_dim, - scale_dim.size())); - PADDLE_ENFORCE_EQ( - bias_dim.size(), - 1UL, - platform::errors::InvalidArgument( - "ShapeError: the dimension of bias must be equal to 1. " - "But received: the shape of bias is [%s], the dimension " - "of bias is [%d]", - bias_dim, - bias_dim.size())); - - bool check = true; - if ((!ctx->IsRuntime()) && - (phi::product(scale_dim) <= 0 || phi::product(bias_dim) <= 0)) { - check = false; - } - - if (check) { - PADDLE_ENFORCE_EQ(scale_dim[0], - C, - platform::errors::InvalidArgument( - "ShapeError: the shape of scale must be equal to [%d]. " - "But received: the shape of scale is [%d]", - C, - scale_dim[0])); - PADDLE_ENFORCE_EQ(bias_dim[0], - C, - platform::errors::InvalidArgument( - "ShapeError: the shape of bias must be equal to [%d]. " - "But received: the shape of bias is [%d]", - C, - bias_dim[0])); - } - ctx->SetOutputDim("Y", x_dims); - ctx->ShareLoD("X", "Y"); - VLOG(4) << x_dims; - ctx->SetOutputDim("MeanOut", {C}); - ctx->SetOutputDim("VarianceOut", {C}); - if (!test_mode) { - ctx->SetOutputDim("SavedMean", {C}); - ctx->SetOutputDim("SavedVariance", {C}); - } - if (ctx->HasOutput("ReserveSpace")) { - ctx->SetOutputDim("ReserveSpace", {-1}); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - // By default, the type of the scale, bias, mean, - // and var tensors should all be float. (For float or float16 input tensor) - // or double (For double input tensor). 
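// For example (an illustration of the rule stated above, not part of the original kernel-key logic): a float16 or float32 input keeps float32 parameter tensors, and only a float64 input promotes them:
//   X: float16 -> Scale/Bias/Mean/Variance: float32
//   X: float32 -> Scale/Bias/Mean/Variance: float32
//   X: float64 -> Scale/Bias/Mean/Variance: float64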
- auto bn_param_type = framework::proto::VarType::FP32; - if (input_data_type == framework::proto::VarType::FP64) { - bn_param_type = framework::proto::VarType::FP64; - } - PADDLE_ENFORCE_EQ(bn_param_type, - framework::TransToProtoVarType( - ctx.Input("Scale")->dtype()), - platform::errors::InvalidArgument( - "Scale input should be of float type")); - PADDLE_ENFORCE_EQ(bn_param_type, - framework::TransToProtoVarType( - ctx.Input("Bias")->dtype()), - platform::errors::InvalidArgument( - "Bias input should be of float type")); - PADDLE_ENFORCE_EQ(bn_param_type, - framework::TransToProtoVarType( - ctx.Input("Mean")->dtype()), - platform::errors::InvalidArgument( - "Mean input should be of float type")); - PADDLE_ENFORCE_EQ(bn_param_type, - framework::TransToProtoVarType( - ctx.Input("Variance")->dtype()), - platform::errors::InvalidArgument( - "Variance input should be of float type")); - - return phi::KernelKey(input_data_type, ctx.GetPlace()); - } -}; - -class InplaceABNGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - // check input - OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", "InplaceABNGrad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), - "Input", - "Y@GRAD", - "InplaceABNGrad"); - OP_INOUT_CHECK( - ctx->HasInput("SavedMean"), "Input", "SavedMean", "InplaceABNGrad"); - OP_INOUT_CHECK(ctx->HasInput("SavedVariance"), - "Input", - "SavedVariance", - "InplaceABNGrad"); - - // check output - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), - "Output", - "X@GRAD", - "InplaceABNGrad"); - - const bool has_scale_grad = ctx->HasOutput(framework::GradVarName("Scale")); - const bool has_bias_grad = ctx->HasOutput(framework::GradVarName("Bias")); - - PADDLE_ENFORCE_EQ( - has_scale_grad, - has_bias_grad, - platform::errors::InvalidArgument( - "Output(Scale@GRAD) and Output(Bias@GRAD) must be null " - "or not be null at same time. But now, " - "has Scale@Grad=[%d], has Bias@GRAD=[%d]", - has_scale_grad, - has_bias_grad)); - - const bool use_global_stats = ctx->Attrs().Get("use_global_stats"); - if (use_global_stats) { - PADDLE_ENFORCE_EQ( - !ctx->Attrs().Get("use_mkldnn"), - true, - platform::errors::InvalidArgument( - "Using global stats during training is not supported " - "in oneDNN version of batch_norm_gradient kernel now.")); - } - - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "InplaceABNGrad"); - const auto y_dims = ctx->GetInputDim("Y"); - const DataLayout data_layout = - phi::StringToDataLayout(ctx->Attrs().Get("data_layout")); - - const int C = static_cast((ctx->IsRunMKLDNNKernel() == true) || - (data_layout == DataLayout::kNCHW) - ? 
y_dims[1] - : y_dims[y_dims.size() - 1]); - - ctx->SetOutputDim(framework::GradVarName("X"), y_dims); - // has_scale_grad == has_bias_grad, so checking has_scale_grad is enough - if (has_scale_grad) { - ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); - ctx->SetOutputDim(framework::GradVarName("Bias"), {C}); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - const auto* var = ctx.InputVar(framework::GradVarName("Y")); - auto input_data_type = framework::TransToProtoVarType( - ctx.Input("Y")->dtype()); - if (var == nullptr) { - PADDLE_THROW(platform::errors::InvalidArgument( - "can't find gradient variable of Y")); - } - const phi::DenseTensor* t = nullptr; - if (var->IsType()) { - t = &var->Get(); - } - if (t == nullptr) { - PADDLE_THROW( - platform::errors::InvalidArgument("gradient variable of Y is empty")); - } - - return phi::KernelKey(input_data_type, ctx.GetPlace()); - } -}; - -class InplaceABNOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddAttr("is_test", - "(bool, default false) Set to true for inference only, false " - "for training. Some layers may run faster when this is true.") - .SetDefault(false); - AddAttr("momentum", "").SetDefault(0.9); - AddAttr("epsilon", "") - .SetDefault(1e-5) - .AddCustomChecker([](const float& epsilon) { - PADDLE_ENFORCE_GE( - epsilon, - 0.0f, - platform::errors::InvalidArgument( - "'epsilon' should be greater than or equal to 0.0.")); - PADDLE_ENFORCE_LE( - epsilon, - 0.001f, - platform::errors::InvalidArgument( - "'epsilon' should be less than or equal to 0.001.")); - }); - AddAttr("data_layout", "").SetDefault("NCHW"); - AddInput("X", "The input tensor"); - AddInput("Scale", - "Scale is a 1-dimensional tensor of size C " - "that is applied to the output"); - AddInput("Bias", - "Bias is a 1-dimensional tensor of size C " - "that is applied to the output"); - AddInput("Mean", - "The global mean (for training) or " - "estimated mean (for testing)"); - AddInput("Variance", - "The global variance (for training) " - "or estimated Variance (for testing)"); - AddInput( - "MomentumTensor", - "(phi::DenseTensor, optional) If provided, batch_norm will " - "use this as momentum, this has a higher priority than " - "attr(momentum), the shape of this tensor MUST BE [1].") - .AsDispensable(); - AddOutput("Y", "result after normalization"); - AddOutput("MeanOut", - "Share memory with Mean. " - "Store the global mean when training"); - AddOutput("VarianceOut", - "Share memory with Variance. " - "Store the global Variance when training"); - AddOutput("SavedMean", - "Mean of the current mini batch, " - "will apply to output when training") - .AsIntermediate(); - AddOutput("SavedVariance", - "Variance of the current mini batch, " - "will apply to output when training") - .AsIntermediate(); - AddOutput("ReserveSpace", - "Reserve GPU space for triggering the new semi-persistent " - "NHWC kernel") - .AsDispensable() - .AsExtra(); - AddAttr("use_global_stats", - "(bool, default false) Whether to use global mean and " - "variance. In inference or test mode, set use_global_stats " - "to true or is_test true. The behavior is equivalent. " - "In train mode, when setting use_global_stats True, the " - "global mean and variance are also used during train time, " - "the BN acts as scaling and shifting.") - .SetDefault(false); - AddAttr( - "trainable_statistics", - "(bool, default false) Whether to calculate mean and variance " - "in test mode. 
If setting true in test mode, mean and variance " - "will be calculated by current batch statistics.") - .SetDefault(false); - AddAttr( - "activation", - "(enum string, default identity, can be identity|elu|leaky_relu) " - "The activation type applied to the normalized output.") - .SetDefault(""); - AddAttr("alpha", - "(float, default 0.1) Only used in inplace-abn kernel, " - "the activation type (identity|elu|leakyrelu) would be fused " - "with batch_norm, " - "this is the alpha value for elu|leakyrelu.") - .SetDefault(0.1f); - AddAttr("use_sync_bn", - "(bool, default false) Whether to use synchronized batch " - "normalization.") - .SetDefault(false); - AddComment(R"DOC( -Batch Normalization. - -Batch Norm has been implemented as discussed in the paper: -https://arxiv.org/pdf/1502.03167.pdf -Can be used as a normalizer function for conv2d and fully_connected operations. -The required data format for this layer is one of the following: -1. NHWC `[batch, in_height, in_width, in_channels]` -2. NCHW `[batch, in_channels, in_height, in_width]` - -)DOC"); - } -}; - -template -class InplaceABNOpGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType(this->ForwardOpType() + "_grad"); - op->SetInput("Y", this->Output("Y")); - op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); - - op->SetInput("Scale", this->Input("Scale")); - op->SetInput("Bias", this->Input("Bias")); - op->SetInput("SavedMean", this->Output("SavedMean")); - op->SetInput("SavedVariance", this->Output("SavedVariance")); - if (this->HasOutput("ReserveSpace")) { - op->SetInput("ReserveSpace", this->Output("ReserveSpace")); - } - - // used when setting use_global_stats True during training - if (PADDLE_GET_CONST(bool, this->GetAttr("use_global_stats"))) { - op->SetInput("Mean", this->Output("MeanOut")); - op->SetInput("Variance", this->Output("VarianceOut")); - } - - op->SetAttrMap(this->Attrs()); - - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetOutput(framework::GradVarName("Scale"), this->InputGrad("Scale")); - op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias")); - } -}; - -template -class InplaceABNKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Output("Y"); - PADDLE_ENFORCE_EQ(x, - y, - platform::errors::InvalidArgument( - "X and Y not inplaced in inplace mode")); - auto activation = - GetInplaceABNActivationType(ctx.Attr("activation")); - auto& place = *ctx.template device_context().eigen_device(); - - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* mean = ctx.Input("Mean"); - auto* variance = ctx.Input("Variance"); - - auto momentum = ctx.Attr("momentum"); - auto epsilon = ctx.Attr("epsilon"); - auto data_layout = ctx.Attr("data_layout"); - auto is_test = ctx.Attr("is_test"); - auto use_global_stats = ctx.Attr("use_global_stats"); - auto trainable_statistics = ctx.Attr("trainable_statistics"); - - auto* mean_out = ctx.Output("MeanOut"); - auto* variance_out = ctx.Output("VarianceOut"); - auto* saved_mean = ctx.Output("SavedMean"); - auto* saved_variance = ctx.Output("SavedVariance"); - auto* reserve_space = ctx.Output("ReserveSpace"); - - auto& dev_ctx = ctx.device_context(); - phi::BatchNormKernel( - static_cast::TYPE&>(dev_ctx), - *x, - *mean, - *variance, - *scale, - *bias, - is_test, - momentum, - 
epsilon, - data_layout, - use_global_stats, - trainable_statistics, - y, - mean_out, - variance_out, - saved_mean, - saved_variance, - reserve_space); - - auto cur_y = EigenVector::Flatten(*y); - InplaceABNActivation functor; - functor.Compute(ctx, activation, place, cur_y, cur_y); - } -}; - -template -class InplaceABNGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* y = ctx.Input("Y"); - auto* d_y = ctx.Input(framework::GradVarName("Y")); - auto* d_x = ctx.Output(framework::GradVarName("X")); - PADDLE_ENFORCE_EQ(d_x, - d_y, - platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD not inplaced in inplace mode")); - auto& place = *ctx.template device_context().eigen_device(); - auto activation = - GetInplaceABNActivationType(ctx.Attr("activation")); - - auto py = *y; - auto pd_y = *d_y; - auto cur_y = EigenVector::Flatten(py); - auto cur_dy = EigenVector::Flatten(pd_y); - - InplaceABNActivation functor; - functor.GradCompute(ctx, activation, place, cur_y, cur_y, cur_dy, cur_dy); - - // BatchNormGradKernel::Compute(ctx); - - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* saved_mean = ctx.Input("SavedMean"); - auto* saved_variance = ctx.Input("SavedVariance"); - - auto momentum = ctx.Attr("momentum"); - auto epsilon = ctx.Attr("epsilon"); - auto data_layout = ctx.Attr("data_layout"); - auto is_test = ctx.Attr("is_test"); - auto use_global_stats = ctx.Attr("use_global_stats"); - auto trainable_statistics = ctx.Attr("trainable_statistics"); - - auto* scale_grad = - ctx.Output(framework::GradVarName("Scale")); - auto* bias_grad = - ctx.Output(framework::GradVarName("Bias")); - - auto* reserve_space = ctx.Input("ReserveSpace"); - auto* mean = ctx.Input("ReserveSpace"); - auto* variance = ctx.Input("ReserveSpace"); - - paddle::optional space_opt; - paddle::optional mean_opt; - paddle::optional variance_opt; - - if (reserve_space != nullptr) { - space_opt = *reserve_space; - } - - if (mean != nullptr) { - mean_opt = *mean; - } - - if (variance != nullptr) { - variance_opt = *variance; - } - - auto& dev_ctx = ctx.device_context(); - phi::BatchNormGradFunctor( - static_cast::TYPE&>(dev_ctx), - *y, - *scale, - *bias, - mean_opt, - variance_opt, - *saved_mean, - *saved_variance, - space_opt, - *d_y, - momentum, - epsilon, - data_layout, - is_test, - use_global_stats, - trainable_statistics, - true, - d_x, - scale_grad, - bias_grad); - } -}; - -class InplaceABNOpInferVarType - : public framework::PassInDtypeAndVarTypeToOutput { - protected: - std::unordered_map& GetInputOutputWithSameType() - const override { - static std::unordered_map m{{"X", /*->*/ "Y"}}; - return m; - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INPLACE_OP_INFERER(InplaceAbnOpInplaceInferer, {"X", "Y"}); -REGISTER_OPERATOR(inplace_abn, - ops::InplaceABNOp, - ops::InplaceABNOpMaker, - ops::InplaceABNOpInferVarType, - ops::InplaceABNOpGradMaker, - ops::InplaceABNOpGradMaker, - InplaceAbnOpInplaceInferer) -REGISTER_OPERATOR(inplace_abn_grad, ops::InplaceABNGradOp) - -PD_REGISTER_STRUCT_KERNEL( - inplace_abn, CPU, ALL_LAYOUT, ops::InplaceABNKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL(inplace_abn_grad, - CPU, - ALL_LAYOUT, - ops::InplaceABNGradKernel, - float, - double) {} diff --git a/paddle/fluid/operators/inplace_abn_op.cu b/paddle/fluid/operators/inplace_abn_op.cu deleted file mode 100644 index b18a75073dd57..0000000000000 --- 
a/paddle/fluid/operators/inplace_abn_op.cu +++ /dev/null @@ -1,237 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/inplace_abn_op.h" -#include "paddle/fluid/operators/batch_norm_op.h" -#include "paddle/fluid/operators/sync_batch_norm_utils.h" -#include "paddle/phi/kernels/batch_norm_grad_kernel.h" -#include "paddle/phi/kernels/batch_norm_kernel.h" -#include "paddle/phi/kernels/sync_batch_norm_grad_kernel.h" -#include "paddle/phi/kernels/sync_batch_norm_kernel.h" - -namespace paddle { -namespace operators { - -template -class InplaceABNKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* y = ctx.Output("Y"); - auto* x = ctx.Input("X"); - PADDLE_ENFORCE_EQ(x, - y, - platform::errors::InvalidArgument( - "X and Y not inplaced in inplace mode")); - auto activation = - GetInplaceABNActivationType(ctx.Attr("activation")); - auto& place = *ctx.template device_context().eigen_device(); - - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* mean = ctx.Input("Mean"); - auto* variance = ctx.Input("Variance"); - - auto momentum = ctx.Attr("momentum"); - auto epsilon = ctx.Attr("epsilon"); - auto data_layout = ctx.Attr("data_layout"); - auto is_test = ctx.Attr("is_test"); - auto use_global_stats = ctx.Attr("use_global_stats"); - auto trainable_statistics = ctx.Attr("trainable_statistics"); - - auto* mean_out = ctx.Output("MeanOut"); - auto* variance_out = ctx.Output("VarianceOut"); - auto* saved_mean = ctx.Output("SavedMean"); - auto* saved_variance = ctx.Output("SavedVariance"); - auto* reserve_space = ctx.Output("ReserveSpace"); - - if (ctx.Attr("use_sync_bn")) { - auto& dev_ctx = ctx.device_context(); - phi::SyncBatchNormKernel( - static_cast::TYPE&>(dev_ctx), - *x, - *mean, - *variance, - *scale, - *bias, - is_test, - momentum, - epsilon, - data_layout, - use_global_stats, - trainable_statistics, - y, - mean_out, - variance_out, - saved_mean, - saved_variance, - reserve_space); - } else { - auto& dev_ctx = ctx.device_context(); - phi::BatchNormKernel( - static_cast::TYPE&>(dev_ctx), - *x, - *mean, - *variance, - *scale, - *bias, - is_test, - momentum, - epsilon, - data_layout, - use_global_stats, - trainable_statistics, - y, - mean_out, - variance_out, - saved_mean, - saved_variance, - reserve_space); - } - - auto cur_y = EigenVector::Flatten(*y); - InplaceABNActivation functor; - functor.Compute(ctx, activation, place, cur_y, cur_y); - } -}; - -// Deriving the Gradient for the Backward Pass of Batch Normalization -// https://kevinzakka.github.io/2016/09/14/batch_normalization/ -template -class InplaceABNGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* y = ctx.Input("Y"); - auto* d_y = ctx.Input(framework::GradVarName("Y")); - auto* d_x = ctx.Output(framework::GradVarName("X")); - PADDLE_ENFORCE_EQ(d_x, - d_y, - 
platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD not inplaced in inplace mode")); - auto& place = *ctx.template device_context().eigen_device(); - auto activation = - GetInplaceABNActivationType(ctx.Attr("activation")); - - auto py = *y; - auto pd_y = *d_y; - auto cur_y = EigenVector::Flatten(py); - auto cur_dy = EigenVector::Flatten(pd_y); - - InplaceABNActivation functor; - functor.GradCompute(ctx, activation, place, cur_y, cur_y, cur_dy, cur_dy); - - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* saved_mean = ctx.Input("SavedMean"); - auto* saved_variance = ctx.Input("SavedVariance"); - - auto momentum = ctx.Attr("momentum"); - auto epsilon = ctx.Attr("epsilon"); - auto data_layout = ctx.Attr("data_layout"); - auto is_test = ctx.Attr("is_test"); - auto use_global_stats = ctx.Attr("use_global_stats"); - auto trainable_statistics = ctx.Attr("trainable_statistics"); - - auto* scale_grad = - ctx.Output(framework::GradVarName("Scale")); - auto* bias_grad = - ctx.Output(framework::GradVarName("Bias")); - - auto* reserve_space = ctx.Input("ReserveSpace"); - auto* mean = ctx.Input("ReserveSpace"); - auto* variance = ctx.Input("ReserveSpace"); - - if (ctx.Attr("use_sync_bn")) { - auto& dev_ctx = ctx.device_context(); - phi::SyncBatchNormGradFunctor( - static_cast::TYPE&>(dev_ctx), - nullptr, - y, - *scale, - *bias, - *saved_mean, - *saved_variance, - *d_y, - epsilon, - data_layout, - d_x, - scale_grad, - bias_grad); - } else { - paddle::optional space_opt; - paddle::optional mean_opt; - paddle::optional variance_opt; - - if (reserve_space != nullptr) { - space_opt = *reserve_space; - } - - if (mean != nullptr) { - mean_opt = *mean; - } - - if (variance != nullptr) { - variance_opt = *variance; - } - - auto& dev_ctx = ctx.device_context(); - phi::BatchNormGradFunctor( - static_cast::TYPE&>(dev_ctx), - *y, - *scale, - *bias, - mean_opt, - variance_opt, - *saved_mean, - *saved_variance, - space_opt, - *d_y, - momentum, - epsilon, - data_layout, - is_test, - use_global_stats, - trainable_statistics, - true, - d_x, - scale_grad, - bias_grad); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -PD_REGISTER_STRUCT_KERNEL( - inplace_abn, GPU, ALL_LAYOUT, ops::InplaceABNKernel, float) {} -PD_REGISTER_STRUCT_KERNEL( - inplace_abn_grad, GPU, ALL_LAYOUT, ops::InplaceABNGradKernel, float) {} -#else -PD_REGISTER_STRUCT_KERNEL( - inplace_abn, GPU, ALL_LAYOUT, ops::InplaceABNKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL(inplace_abn_grad, - GPU, - ALL_LAYOUT, - ops::InplaceABNGradKernel, - float, - double) {} -#endif diff --git a/paddle/fluid/operators/inplace_abn_op.h b/paddle/fluid/operators/inplace_abn_op.h deleted file mode 100644 index abdb1e33aaae8..0000000000000 --- a/paddle/fluid/operators/inplace_abn_op.h +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/activation_op.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -USE_PHI_FUNCTOR(LeakyRelu) - -template -using EigenMatrix = framework::EigenMatrix; - -template -using EigenVector = framework::EigenVector; - -enum InplaceABNActivationType { identity = 0, leakyrelu = 1, elu = 2 }; - -inline InplaceABNActivationType GetInplaceABNActivationType( - const std::string& type) { - if (type == "leaky_relu") { - return InplaceABNActivationType::leakyrelu; - } else if (type == "elu") { - return InplaceABNActivationType::elu; - } else if (type == "identity" || type == "") { - return InplaceABNActivationType::identity; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "unsupported activation type %s for Op(inplace_abn)", type)); - } -} - -template -class InplaceABNActivation { - private: - template - void setAttrs(const framework::ExecutionContext& ctx, Functor* functor) { - auto attrs = functor->GetAttrs(); - for (auto& attr : attrs) { - *attr.second = ctx.Attr(attr.first); - } - } - - template - void compute(const framework::ExecutionContext& ctx, - Functor* functor, - Args... args) { - setAttrs(ctx, functor); - (*functor)(args...); - } - - public: - template - void Compute(const framework::ExecutionContext& ctx, - const int act_type, - const Device& d, - X x, - Y y) { - if (act_type == InplaceABNActivationType::identity) { - y.device(d) = x; - } else if (act_type == InplaceABNActivationType::leakyrelu) { - LeakyReluFunctor functor; - compute(ctx, &functor, d, x, y); - } else if (act_type == InplaceABNActivationType::elu) { - ELUFunctor functor; - compute(ctx, &functor, d, x, y); - } else { - PADDLE_THROW( - platform::errors::InvalidArgument("unsupported activation type")); - } - } - - template - void GradCompute(const framework::ExecutionContext& ctx, - const int act_type, - const Device& d, - X x, - Y y, - DX dx, - DY dy) { - const float alpha = ctx.Attr("alpha"); - - if (act_type == InplaceABNActivationType::identity) { - x.device(d) = y; - dx.device(d) = dy; - } else if (act_type == InplaceABNActivationType::leakyrelu) { - auto temp1 = (y < static_cast(0)).template cast().eval() / - static_cast(alpha); - auto temp2 = (y >= static_cast(0)).template cast().eval(); - x.device(d) = y * (temp1 + temp2).template cast(); - - LeakyReluGradFunctor functor; - compute(ctx, &functor, d, x, y, dy, dx); - } else if (act_type == InplaceABNActivationType::elu) { - auto temp1 = (y >= static_cast(0)).template cast().eval(); - auto temp = (y < static_cast(0)).template cast().eval(); - auto temp2 = (y * temp / static_cast(alpha) + static_cast(1)).log(); - x.device(d) = (y * temp1 + temp2).template cast(); - - ELUGradNegativeAlphaFunctor functor; - compute(ctx, &functor, d, x, y, dy, dx); - } else { - PADDLE_THROW( - platform::errors::InvalidArgument("unsupported activation type")); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake index 50a4ab2cc5eca..4c8bff3a899f8 100644 --- a/paddle/fluid/operators/unity_build_rule.cmake +++ b/paddle/fluid/operators/unity_build_rule.cmake @@ -123,7 +123,6 @@ register_unity_group( isfinite_v2_op.cc) register_unity_group( cc - inplace_abn_op.cc interpolate_v2_op.cc inverse_op.cc 
is_empty_op.cc @@ -406,7 +405,6 @@ register_unity_group( isfinite_v2_op.cu) register_unity_group( cu - inplace_abn_op.cu interpolate_v2_op.cu isfinite_op.cu l1_norm_op.cu diff --git a/paddle/fluid/pybind/eager_generator.h b/paddle/fluid/pybind/eager_generator.h index 03b8690569c22..9e7a2ec10063b 100644 --- a/paddle/fluid/pybind/eager_generator.h +++ b/paddle/fluid/pybind/eager_generator.h @@ -249,8 +249,6 @@ std::map> op_ins_map = { {"crop", {"X", "Y", "Offsets"}}, {"batch_norm", {"X", "Scale", "Bias", "Mean", "Variance", "MomentumTensor"}}, - {"inplace_abn", - {"X", "Scale", "Bias", "Mean", "Variance", "MomentumTensor"}}, {"linear_interp", {"X", "OutSize"}}, {"bilinear_interp", {"X", "OutSize"}}, {"trilinear_interp", {"X", "OutSize"}}, @@ -297,13 +295,6 @@ std::map> op_outs_map = { "SavedVariance", "ReserveSpace"}}, {"lstsq", {"Solution", "Residuals", "Rank", "SingularValues"}}, - {"inplace_abn", - {"Y", - "MeanOut", - "VarianceOut", - "SavedMean", - "SavedVariance", - "ReserveSpace"}}, {"fused_attention", {"LnMean", "LnVariance", "LnOut", "QKVOut", "QKVBiasOut", "TransposeOut2", @@ -485,7 +476,6 @@ std::map> op_passing_outs_map = { {"merged_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, {"sparse_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, {"batch_norm", {"MeanOut", "VarianceOut"}}, - {"inplace_abn", {"MeanOut", "VarianceOut"}}, {"sync_batch_norm", {"MeanOut", "VarianceOut"}}, {"accuracy", {"Correct", "Total"}}, {"fill_constant", {"Out"}}, diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index f74df02af26d2..30041bf323b1e 100755 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -1490,11 +1490,6 @@ attrs : axis : dim -- op : inplace_abn - backward : inplace_abn_grad - extra : - attrs : [bool use_mkldnn = false, bool fuse_with_relu = false] - - op : instance_norm inputs : x : X diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt index 2768babd07f13..8d88d7db8de1d 100644 --- a/test/legacy_test/CMakeLists.txt +++ b/test/legacy_test/CMakeLists.txt @@ -852,7 +852,6 @@ set_tests_properties( test_buffer_shared_memory_reuse_pass PROPERTIES LABELS "RUN_TYPE=DIST") set_tests_properties( test_sync_batch_norm_op - test_inplace_abn_op test_parallel_executor_seresnext_base_gpu test_parallel_executor_seresnext_with_reduce_gpu test_parallel_executor_seresnext_with_fuse_all_reduce_gpu diff --git a/test/legacy_test/test_inplace_abn_op.py b/test/legacy_test/test_inplace_abn_op.py deleted file mode 100644 index d56a467a2ed79..0000000000000 --- a/test/legacy_test/test_inplace_abn_op.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
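# A minimal dygraph sketch of what the deleted test below exercised: inplace_abn
# behaved like batch_norm followed by an activation (identity, leaky_relu, or
# elu), fused into one op that reused the input's memory. Illustrative only; it
# assumes paddle.nn.BatchNorm2D and paddle.nn.functional.leaky_relu, which the
# test itself does not call.
#
#     import paddle
#
#     x = paddle.rand([4, 5, 7, 9])  # NCHW, the same shape as self.dshape below
#     bn = paddle.nn.BatchNorm2D(num_features=5)
#     # inplace_abn fused these two steps and wrote the result back into x
#     y = paddle.nn.functional.leaky_relu(bn(x), negative_slope=0.1)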
- -import os -import unittest - -import numpy as np - -import paddle -from paddle import base -from paddle.base import core - - -class TestInplaceANBOpTraining(unittest.TestCase): - def setUp(self): - self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 - self.N = 4 - self.C = 5 - self.H = 7 - self.W = 9 - self.dshape = [self.N, self.C, self.H, self.W] - - def build_program( - self, - place, - layout, - seed, - only_forward=False, - activation="identity", - alpha=1.0, - use_cuda=False, - inplace=False, - ): - main = base.Program() - startup = base.Program() - main.random_seed = seed - startup.random_seed = seed - with base.unique_name.guard(): - with base.program_guard(main, startup): - data = paddle.static.data( - name='input', - shape=self.dshape, - dtype=self.dtype, - ) - data.stop_gradient = False - data.desc.set_need_check_feed(False) - - bn = paddle.static.nn.batch_norm( - data, - param_attr=base.ParamAttr(name='bn_scale'), - bias_attr=base.ParamAttr(name='bn_bias'), - moving_mean_name='bn_moving_mean', - moving_variance_name='bn_moving_variance', - data_layout=layout, - is_test=only_forward, - in_place=inplace, - ) - if activation == 'leaky_relu': - bn = paddle.nn.functional.leaky_relu(bn, alpha) - if activation == 'elu': - bn = paddle.nn.functional.elu(bn, alpha) - - # NOTE: in inplace mode input and output of bn - # may have same name, multiply 1. to generate - # a new Variable for fetch - bn = bn * 1.0 - sigmoid = paddle.nn.functional.sigmoid(bn) - out = paddle.sum(sigmoid) - if not only_forward: - sgd_opt = paddle.optimizer.SGD(learning_rate=0.0) - sgd_opt.backward(out) - return main, startup, [out, bn] - - def test_all_branches(self): - seed = 10 - os.environ['FLAGS_cudnn_deterministic'] = "1" - data = np.random.random(size=self.dshape).astype(self.dtype) * 4.0 - 2 - use_cudas = [False, True] if core.is_compiled_with_cuda() else [False] - alpha = 0.1 - layouts = ["NCHW", "NHWC"] - for use_cuda in use_cudas: - place = core.CUDAPlace(0) if use_cuda else core.CPUPlace() - for layout in layouts: - for activation in ['identity', 'leaky_relu']: - main, startup, outs = self.build_program( - place, - layout, - seed, - False, - activation, - alpha, - use_cuda, - False, - ) - exe = base.Executor(place) - exe.run(startup) - exe.run(program=main, feed={'input': data}) - - -if __name__ == '__main__': - paddle.enable_static() - unittest.main() diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index b684db687f236..bce8ec2ef55c0 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -1486,7 +1486,6 @@ 'test_save_inference_model', 'test_smooth_l1_loss', 'test_bilateral_slice_op', - 'test_inplace_abn_op', 'test_parallel_executor_seresnext_base_gpu', 'test_parallel_executor_seresnext_with_fuse_all_reduce_gpu', 'test_parallel_executor_seresnext_with_reduce_gpu', @@ -2307,7 +2306,6 @@ 'test_standalone_controlflow', 'test_standalone_multiply_write', 'test_reshape_op', - 'test_inplace_abn_op', 'test_fused_transformer_encoder_layer', 'test_eager_deletion_while_op', 'test_dataloader_unkeep_order', diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index f6165107624d1..80d14655af003 100755 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -275,7 +275,6 @@ 'test_infer_no_need_buffer_slots', 'test_inference_model_io', 'test_initializer', - 'test_inplace_abn_op', 'test_inplace_addto_strategy', 'test_inplace_softmax_with_cross_entropy', 'test_input_spec', From 
ba25e4f5258c56a2cd601b115d37351bd5189444 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Tue, 10 Oct 2023 09:21:30 +0800 Subject: [PATCH 46/62] refine (#57805) --- .../fluid/operators/detection/CMakeLists.txt | 3 - .../detection/sigmoid_focal_loss_op.cc | 276 --------------- .../detection/sigmoid_focal_loss_op.cu | 201 ----------- .../detection/sigmoid_focal_loss_op.h | 130 ------- .../operators/detection/target_assign_op.cc | 191 ---------- .../operators/detection/target_assign_op.cu | 70 ---- .../operators/detection/target_assign_op.h | 181 ---------- paddle/fluid/operators/mean_iou_op.cc | 112 ------ paddle/fluid/operators/mean_iou_op.cu | 170 --------- paddle/fluid/operators/mean_iou_op.h | 126 ------- .../optimizers/proximal_adagrad_op.cc | 137 -------- .../optimizers/proximal_adagrad_op.cu | 17 - .../optimizers/proximal_adagrad_op.h | 66 ---- .../optimizers/unity_build_rule.cmake | 2 - paddle/fluid/operators/sample_logits_op.cc | 282 --------------- paddle/fluid/operators/sample_logits_op.cu | 301 ---------------- paddle/fluid/operators/sample_logits_op.h | 330 ------------------ paddle/fluid/operators/unity_build_rule.cmake | 4 - test/legacy_test/test_mean_iou.py | 158 --------- test/legacy_test/test_proximal_adagrad_op.py | 51 --- test/legacy_test/test_sample_logits_op.py | 119 ------- .../legacy_test/test_sigmoid_focal_loss_op.py | 145 -------- test/legacy_test/test_target_assign_op.py | 195 ----------- tools/parallel_UT_rule.py | 10 - tools/static_mode_white_list.py | 5 - 25 files changed, 3282 deletions(-) delete mode 100644 paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc delete mode 100644 paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu delete mode 100644 paddle/fluid/operators/detection/sigmoid_focal_loss_op.h delete mode 100644 paddle/fluid/operators/detection/target_assign_op.cc delete mode 100644 paddle/fluid/operators/detection/target_assign_op.cu delete mode 100644 paddle/fluid/operators/detection/target_assign_op.h delete mode 100644 paddle/fluid/operators/mean_iou_op.cc delete mode 100644 paddle/fluid/operators/mean_iou_op.cu delete mode 100644 paddle/fluid/operators/mean_iou_op.h delete mode 100644 paddle/fluid/operators/optimizers/proximal_adagrad_op.cc delete mode 100644 paddle/fluid/operators/optimizers/proximal_adagrad_op.cu delete mode 100644 paddle/fluid/operators/optimizers/proximal_adagrad_op.h delete mode 100644 paddle/fluid/operators/sample_logits_op.cc delete mode 100644 paddle/fluid/operators/sample_logits_op.cu delete mode 100644 paddle/fluid/operators/sample_logits_op.h delete mode 100644 test/legacy_test/test_mean_iou.py delete mode 100644 test/legacy_test/test_proximal_adagrad_op.py delete mode 100644 test/legacy_test/test_sample_logits_op.py delete mode 100644 test/legacy_test/test_sigmoid_focal_loss_op.py delete mode 100644 test/legacy_test/test_target_assign_op.py diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 3c5c7df83440d..fe32cc32d02d4 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -43,7 +43,6 @@ detection_library(bipartite_match_op SRCS bipartite_match_op.cc) detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc) detection_library(anchor_generator_op SRCS anchor_generator_op.cc anchor_generator_op.cu) -detection_library(target_assign_op SRCS target_assign_op.cc target_assign_op.cu) detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc 
polygon_box_transform_op.cu) detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) @@ -54,8 +53,6 @@ detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS phi) detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu) detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu) -detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc - sigmoid_focal_loss_op.cu) detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc) diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc deleted file mode 100644 index fe716adb9f20a..0000000000000 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc +++ /dev/null @@ -1,276 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h" - -#include -#include -#include - -namespace paddle { -namespace operators { - -class SigmoidFocalLossOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "sigmoid_focal_loss"); - OP_INOUT_CHECK( - ctx->HasInput("Label"), "Input", "Label", "sigmoid_focal_loss"); - OP_INOUT_CHECK( - ctx->HasInput("FgNum"), "Input", "FgNum", "sigmoid_focal_loss"); - OP_INOUT_CHECK( - ctx->HasOutput("Out"), "Output", "Out", "sigmoid_focal_loss"); - - auto x_dims = ctx->GetInputDim("X"); - auto labels_dims = ctx->GetInputDim("Label"); - auto fg_dims = ctx->GetInputDim("FgNum"); - - int rank = x_dims.size(); - PADDLE_ENFORCE_EQ( - rank, - labels_dims.size(), - platform::errors::InvalidArgument( - "The rank of Input(X) should be equal to the rank of Input(Label), " - "but received X rank is:%d, X shape is:[%s], " - "Label rank is:%d, Label shape is:[%s].", - rank, - x_dims, - labels_dims.size(), - labels_dims)); - PADDLE_ENFORCE_EQ( - fg_dims.size(), - 1, - platform::errors::InvalidArgument( - "The rank of Input(FgNum) must be 1, but received FgNum rank is " - ":%d, FgNum shape is:[%s].", - fg_dims.size(), - fg_dims)); - bool check = true; - if ((!ctx->IsRuntime()) && - (phi::product(x_dims) <= 0 || phi::product(labels_dims) <= 0)) { - check = false; - } - - if (check) { - PADDLE_ENFORCE_EQ( - phi::slice_ddim(x_dims, 0, rank - 1), - phi::slice_ddim(labels_dims, 0, rank - 1), - platform::errors::InvalidArgument( - "Input(X) and Input(Label) should have the same shape " - "except the last dimension, but received X shape is:[%s], " - "Label shape is:[%s].", - x_dims, - labels_dims)); - } - - PADDLE_ENFORCE_EQ( - labels_dims[rank - 1], - 1UL, - platform::errors::InvalidArgument( - "The last dimension of Input(Label) should be 1, but received " - "Label shape is:[%s].", - labels_dims)); - - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", 
/*->*/ "Out"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class SigmoidFocalLossGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "sigmoid_focal_loss"); - OP_INOUT_CHECK( - ctx->HasInput("Label"), "Input", "Label", "sigmoid_focal_loss"); - OP_INOUT_CHECK( - ctx->HasInput("FgNum"), "Input", "FgNum", "sigmoid_focal_loss"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - "Out@GRAD", - "sigmoid_focal_loss"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), - "Output", - "X@GRAD", - "sigmoid_focal_loss"); - - auto x_dims = ctx->GetInputDim("X"); - auto labels_dims = ctx->GetInputDim("Label"); - auto fg_dims = ctx->GetInputDim("FgNum"); - auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); - - int rank = x_dims.size(); - PADDLE_ENFORCE_EQ( - rank, - labels_dims.size(), - platform::errors::InvalidArgument( - "The rank of Input(X) should be equal to the rank of Input(Label), " - "but received X rank is:%d, X shape is:[%s], " - "Label rank is:%d, Label shape is:[%s].", - rank, - x_dims, - labels_dims.size(), - labels_dims)); - PADDLE_ENFORCE_EQ( - fg_dims.size(), - 1, - platform::errors::InvalidArgument( - "The rank of Input(FgNum) must be 1, but received FgNum rank is " - ":%d, FgNum shape is:[%s].", - fg_dims.size(), - fg_dims)); - bool check = true; - if ((!ctx->IsRuntime()) && - (phi::product(x_dims) <= 0 || phi::product(labels_dims) <= 0)) { - check = false; - } - - if (check) { - PADDLE_ENFORCE_EQ( - phi::slice_ddim(x_dims, 0, rank - 1), - phi::slice_ddim(labels_dims, 0, rank - 1), - platform::errors::InvalidArgument( - "Input(X) and Input(Label) should have the same shape " - "except the last dimension, but received X shape is:[%s], " - "Label shape is:[%s].", - x_dims, - labels_dims)); - - PADDLE_ENFORCE_EQ( - labels_dims[rank - 1], - 1UL, - platform::errors::InvalidArgument( - "The last dimension of Input(Label) should be 1, but received " - "Label shape is:[%s].", - labels_dims)); - - PADDLE_ENFORCE_EQ(phi::slice_ddim(x_dims, 0, rank), - phi::slice_ddim(dout_dims, 0, rank), - platform::errors::InvalidArgument( - "Input(X) and Input(Out@Grad) should have the same " - "shape, but received " - "X shape is:[%s], Out@Grad shape is:[%s].", - x_dims, - dout_dims)); - } - - ctx->SetOutputDim(framework::GradVarName("X"), x_dims); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class SigmoidFocalLossOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(Tensor, default Tensor), a 2-D tensor with shape [N, D], " - "where N is the batch size and D is the number of classes " - "(excluding background). This input is a tensor of logits " - "computed by the previous operator."); - AddInput("Label", - "(Tensor, default Tensor), a 2-D tensor with shape [N, 1]. " - "This input is a tensor of probabilistic labels."); - AddInput("FgNum", - "(Tensor, default Tensor), a 1-D tensor with shape [1]. 
" - "This input is the number of foreground."); - AddOutput( - "Out", - "(Tensor, default Tensor), a 2-D tensor with shape [N, D]. " - "This output is the focal loss."); - AddAttr( - "gamma", - "Hyper-parameter of sigmoid focal loss op, which is to balance the " - "easy and hard examples. " - "A float scalar with default value 2.0.") - .SetDefault(2.0); - AddAttr( - "alpha", - "Hyper-parameter of sigmoid focal loss op, which is to balance the " - "positive and negative examples. " - "A float scalar with default value 0.5.") - .SetDefault(0.25); - AddComment(R"DOC( -Sigmoid Focal Loss Operator. - -Focal loss is used to address the foreground-background class imbalance existed -on the training phase of one-stage detectors. This operator computes the sigmoid -value for each element in the input tensor, after which focal loss is measured. - -The focal loss is given as follows: - -$$Loss_j = (-Label_j * alpha * \pow(1 - \sigma(X_j), gamma) * \log(\sigma(X_j)) - -(1 - Labels_j) * (1 - alpha) * \pow(\sigma(X_j), gamma) * \log(1 - \sigma(X_j))) -/ FgNum, j = 1,...,K$$ - -We know that $$\sigma(X_j) = \\frac{1}{1 + \exp(-X_j)}$$. - -)DOC"); - } -}; - -template -class SigmoidFocalLossGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("sigmoid_focal_loss_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("Label", this->Input("Label")); - op->SetInput("FgNum", this->Input("FgNum")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(sigmoid_focal_loss, - ops::SigmoidFocalLossOp, - ops::SigmoidFocalLossOpMaker, - ops::SigmoidFocalLossGradOpMaker, - ops::SigmoidFocalLossGradOpMaker); -REGISTER_OPERATOR(sigmoid_focal_loss_grad, ops::SigmoidFocalLossGradOp); -PD_REGISTER_STRUCT_KERNEL(sigmoid_focal_loss, - CPU, - ALL_LAYOUT, - ops::SigmoidFocalLossKernel, - float, - double) {} -PD_REGISTER_STRUCT_KERNEL(sigmoid_focal_loss_grad, - CPU, - ALL_LAYOUT, - ops::SigmoidFocalLossGradKernel, - float, - double) {} diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu deleted file mode 100644 index 5d29d52669d4f..0000000000000 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu +++ /dev/null @@ -1,201 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h" -#include "paddle/phi/backends/gpu/gpu_primitives.h" -#include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/kernels/funcs/math.h" - -namespace paddle { -namespace operators { - -static constexpr int kNumCUDAThreads = 512; -static constexpr int kNumMaxinumNumBlocks = 4096; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaxinumNumBlocks); -} - -template -__global__ void GPUSigmoidFocalLossForward(const T *x_data, - const int *label_data, - const int *fg_num_data, - const T gamma, - const T alpha, - const int num_classes, - const int limit, - T *out_data) { - CUDA_KERNEL_LOOP(i, limit) { - T x = x_data[i]; - int a = i / num_classes; // current sample - int d = i % num_classes; // current class - int g = label_data[a]; // target - - // check whether the input data is positive or negative - // the target classes are in range 1-81 - // and the d is in range 0-80 - T c_pos = static_cast(g == (d + 1)); - T c_neg = static_cast((g != -1) & (g != (d + 1))); - - T fg_num = static_cast((fg_num_data[0] > 1) ? fg_num_data[0] : 1); - T s_neg = (1.0 - alpha) / fg_num; - T s_pos = alpha / fg_num; - - // p = 1. / 1. + expf(-x) - T p = 1. / (1. + phi::funcs::real_exp(-x)); - - // (1 - p)**gamma * log(p) - T term_pos = std::pow(static_cast(1. - p), gamma) * - phi::funcs::real_log(p > FLT_MIN ? p : FLT_MIN); - // p**gamma * log(1 - p) - T term_neg = std::pow(p, gamma) * - (-1. * x * (x >= 0) - - phi::funcs::real_log( - 1. + phi::funcs::real_exp(x - 2. * x * (x >= 0)))); - - out_data[i] = 0.0; - out_data[i] += -c_pos * term_pos * s_pos; - out_data[i] += -c_neg * term_neg * s_neg; - } -} - -template -__global__ void GPUSigmoidFocalLossBackward(const T *x_data, - const int *label_data, - const int *fg_num_data, - const T gamma, - const T alpha, - const int num_classes, - const T *dout_data, - const int limit, - T *dx_data) { - CUDA_KERNEL_LOOP(i, limit) { - T x = x_data[i]; - T dout = dout_data[i]; - - int a = i / num_classes; // current sample - int d = i % num_classes; // current class - - T fg_num = static_cast((fg_num_data[0] > 1) ? fg_num_data[0] : 1); - T s_neg = (1.0 - alpha) / fg_num; - T s_pos = alpha / fg_num; - - int g = label_data[a]; - T c_pos = static_cast(g == (d + 1)); - T c_neg = static_cast((g != -1) & (g != (d + 1))); - - T p = 1. / (1. + phi::funcs::real_exp(-x)); - - // (1-p)**g * (1 - p - g*p*log(p)) - T term_pos = - std::pow(static_cast(1. - p), gamma) * - (1. - p - - (p * gamma * phi::funcs::real_log(p > FLT_MIN ? p : FLT_MIN))); - // (p**g) * (g*(1-p)*log(1-p) - p) - T term_neg = std::pow(p, gamma) * - ((-1. * x * (x >= 0) - - phi::funcs::real_log( - 1. + phi::funcs::real_exp(x - 2. * x * (x >= 0)))) * - (1. 
- p) * gamma - - p); - - dx_data[i] = 0.0; - dx_data[i] += -c_pos * s_pos * term_pos; - dx_data[i] += -c_neg * s_neg * term_neg; - dx_data[i] = dx_data[i] * dout; - } -} - -template -class GPUSigmoidFocalLossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const phi::DenseTensor *X = context.Input("X"); - const phi::DenseTensor *Labels = context.Input("Label"); - const phi::DenseTensor *FgNum = context.Input("FgNum"); - phi::DenseTensor *Out = context.Output("Out"); - T gamma = static_cast(context.Attr("gamma")); - T alpha = static_cast(context.Attr("alpha")); - auto x_dims = X->dims(); - int num_classes = static_cast(x_dims[1]); - auto out_data = Out->mutable_data(context.GetPlace()); - - auto &dev_ctx = context.cuda_device_context(); - - int limit = Out->numel(); - int blocks = NumBlocks(limit); - int threads = kNumCUDAThreads; - GPUSigmoidFocalLossForward - <<>>(X->data(), - Labels->data(), - FgNum->data(), - gamma, - alpha, - num_classes, - limit, - out_data); - } -}; - -template -class GPUSigmoidFocalLossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const phi::DenseTensor *X = context.Input("X"); - const phi::DenseTensor *Labels = context.Input("Label"); - const phi::DenseTensor *FgNum = context.Input("FgNum"); - const phi::DenseTensor *dOut = - context.Input(framework::GradVarName("Out")); - phi::DenseTensor *dX = - context.Output(framework::GradVarName("X")); - auto dx_data = dX->mutable_data(context.GetPlace()); - T gamma = static_cast(context.Attr("gamma")); - T alpha = static_cast(context.Attr("alpha")); - auto x_dims = X->dims(); - int num_classes = static_cast(x_dims[1]); - - auto &dev_ctx = context.cuda_device_context(); - - int limit = dX->numel(); - int blocks = NumBlocks(limit); - int threads = kNumCUDAThreads; - GPUSigmoidFocalLossBackward - <<>>(X->data(), - Labels->data(), - FgNum->data(), - gamma, - alpha, - num_classes, - dOut->data(), - limit, - dx_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL(sigmoid_focal_loss, - GPU, - ALL_LAYOUT, - ops::GPUSigmoidFocalLossKernel, - float, - double) {} -PD_REGISTER_STRUCT_KERNEL(sigmoid_focal_loss_grad, - GPU, - ALL_LAYOUT, - ops::GPUSigmoidFocalLossGradKernel, - float, - double) {} diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h deleted file mode 100644 index 28cac641d1452..0000000000000 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h +++ /dev/null @@ -1,130 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class SigmoidFocalLossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const phi::DenseTensor *X = context.Input("X"); - const phi::DenseTensor *Labels = context.Input("Label"); - const phi::DenseTensor *FgNum = context.Input("FgNum"); - phi::DenseTensor *Out = context.Output("Out"); - T gamma = static_cast(context.Attr("gamma")); - T alpha = static_cast(context.Attr("alpha")); - auto out_data = Out->mutable_data(context.GetPlace()); - int limit = Out->numel(); - auto x_data = X->data(); - auto label_data = Labels->data(); - auto fg_num_data = FgNum->data(); - auto x_dims = X->dims(); - int num_classes = static_cast(x_dims[1]); - - for (int idx = 0; idx < limit; ++idx) { - T x = x_data[idx]; - int a = idx / num_classes; // current sample - int d = idx % num_classes; // current class - int g = label_data[a]; // target - - // Check whether the input data is positive or negative - // The target classes are in range 1-81 - // and the d is in range 0-80 - T c_pos = static_cast(g == (d + 1)); - T c_neg = static_cast((g != -1) & (g != (d + 1))); - T fg_num = static_cast((fg_num_data[0] > 1) ? fg_num_data[0] : 1); - T s_neg = (1.0 - alpha) / fg_num; - T s_pos = alpha / fg_num; - - // p = 1. / 1. + expf(-x) - T p = 1. / (1. + std::exp(-x)); - - // (1 - p)**gamma * log(p) where - T term_pos = std::pow(static_cast(1. - p), gamma) * - std::log(p > FLT_MIN ? p : FLT_MIN); - // p**gamma * log(1 - p) - T term_neg = - std::pow(p, gamma) * - (-1. * x * (x >= 0) - std::log(1. + std::exp(x - 2. * x * (x >= 0)))); - - out_data[idx] = 0.0; - out_data[idx] += -c_pos * term_pos * s_pos; - out_data[idx] += -c_neg * term_neg * s_neg; - } - } -}; - -template -class SigmoidFocalLossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const phi::DenseTensor *X = context.Input("X"); - const phi::DenseTensor *Labels = context.Input("Label"); - const phi::DenseTensor *FgNum = context.Input("FgNum"); - const phi::DenseTensor *dOut = - context.Input(framework::GradVarName("Out")); - phi::DenseTensor *dX = - context.Output(framework::GradVarName("X")); - auto dx_data = dX->mutable_data(context.GetPlace()); - T gamma = static_cast(context.Attr("gamma")); - T alpha = static_cast(context.Attr("alpha")); - auto x_dims = X->dims(); - int num_classes = static_cast(x_dims[1]); - - int limit = dX->numel(); - auto x_data = X->data(); - auto label_data = Labels->data(); - auto fg_num_data = FgNum->data(); - auto dout_data = dOut->data(); - for (int idx = 0; idx < limit; ++idx) { - T x = x_data[idx]; - int a = idx / num_classes; // current sample - int d = idx % num_classes; // current class - - T fg_num = static_cast((fg_num_data[0] > 1) ? fg_num_data[0] : 1); - T s_neg = static_cast((1.0 - alpha) / fg_num); - T s_pos = alpha / fg_num; - int g = label_data[a]; - - T c_pos = static_cast(g == (d + 1)); - T c_neg = static_cast((g != -1) & (g != (d + 1))); - T p = 1. / (1. + std::exp(-x)); - - // (1-p)**g * (1 - p - g*p*log(p)) - T term_pos = std::pow(static_cast(1. - p), gamma) * - (1. - p - (p * gamma * std::log(p > FLT_MIN ? p : FLT_MIN))); - // (p**g) * (g*(1-p)*log(1-p) - p) - T term_neg = std::pow(p, gamma) * - ((-1. * x * (x >= 0) - - std::log(1. + std::exp(x - 2. * x * (x >= 0)))) * - (1. 
- p) * gamma - - p); - dx_data[idx] = 0.0; - dx_data[idx] += -c_pos * s_pos * term_pos; - dx_data[idx] += -c_neg * s_neg * term_neg; - dx_data[idx] = dx_data[idx] * dout_data[idx]; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/detection/target_assign_op.cc b/paddle/fluid/operators/detection/target_assign_op.cc deleted file mode 100644 index 437b46c459ff3..0000000000000 --- a/paddle/fluid/operators/detection/target_assign_op.cc +++ /dev/null @@ -1,191 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/detection/target_assign_op.h" - -namespace paddle { -namespace operators { - -class TargetAssignOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), - true, - platform::errors::InvalidArgument( - "Input(X) of TargetAssignOp should not be null")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("MatchIndices"), - true, - platform::errors::InvalidArgument( - "Input(MatchIndices) of TargetAssignOp should not be null")); - - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), - true, - platform::errors::InvalidArgument( - "Output(Out) of TargetAssignOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("OutWeight"), - true, - platform::errors::InvalidArgument( - "Output(OutWeight) of TargetAssignOp should not be null.")); - - auto in_dims = ctx->GetInputDim("X"); - auto mi_dims = ctx->GetInputDim("MatchIndices"); - - PADDLE_ENFORCE_EQ( - in_dims.size(), - 3, - platform::errors::InvalidArgument( - "Expected the rank of Input(X) is 3. But received %d.", - in_dims.size())); - PADDLE_ENFORCE_EQ(mi_dims.size(), - 2, - platform::errors::InvalidArgument( - "The rank of Input(MatchIndices) must be 2.")); - - if (ctx->HasInput("NegIndices")) { - auto neg_dims = ctx->GetInputDim("NegIndices"); - PADDLE_ENFORCE_EQ(neg_dims.size(), - 2, - platform::errors::InvalidArgument( - "The rank of Input(NegIndices) must be 2.")); - PADDLE_ENFORCE_EQ( - neg_dims[1], - 1, - platform::errors::InvalidArgument( - "The last dimension of Out(NegIndices) must be 1.")); - } - - auto n = mi_dims[0]; - auto m = mi_dims[1]; - auto k = in_dims[in_dims.size() - 1]; - ctx->SetOutputDim("Out", {n, m, k}); - ctx->SetOutputDim("OutWeight", {n, m, 1}); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class TargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(phi::DenseTensor), This input is a 3D phi::DenseTensor with " - "shape [M, P, K]. 
" - "Some elements in X will be assigned to Out based on the " - "MatchIndices and NegIndices."); - AddInput("MatchIndices", - "(Tensor, default Tensor), The input matched indices " - "with shape [N, P], If MatchIndices[i][j] is -1, the j-th entity " - "of column is not matched to any entity of row in i-th instance."); - AddInput("NegIndices", - "(phi::DenseTensor, default phi::DenseTensor), The input " - "negative example " - "indices are an optional input with shape [Neg, 1], where Neg is " - "the total number of negative example indices.") - .AsDispensable(); - AddAttr("mismatch_value", - "(int, default 0), Fill this value to the " - "mismatched location.") - .SetDefault(0); - AddOutput("Out", - "(Tensor), The output is a 3D Tensor with shape [N, P, K], " - "N and P is the same as they are in NegIndices, K is the " - "same as it in input of X. If MatchIndices[i][j] " - "is -1, the Out[i][j][0 : K] is the mismatch_value."); - AddOutput("OutWeight", - "(Tensor), The weight for output with the shape of [N, P, 1]"); - AddComment(R"DOC( -This operator can be, for given the target bounding boxes or labels, -to assign classification and regression targets to each prediction as well as -weights to prediction. The weights is used to specify which prediction would -not contribute to training loss. - -For each instance, the output `Out` and`OutWeight` are assigned based on -`MatchIndices` and `NegIndices`. -Assumed that the row offset for each instance in `X` is called lod, -this operator assigns classification/regression targets by performing the -following steps: - -1. Assigning all outpts based on `MatchIndices`: - -If id = MatchIndices[i][j] > 0, - - Out[i][j][0 : K] = X[lod[i] + id][j % P][0 : K] - OutWeight[i][j] = 1. - -Otherwise, - - Out[j][j][0 : K] = {mismatch_value, mismatch_value, ...} - OutWeight[i][j] = 0. - -2. Assigning OutWeight based on `NegIndices` if `NegIndices` is provided: - -Assumed that the row offset for each instance in `NegIndices` is called neg_lod, -for i-th instance and each `id` of NegIndices in this instance: - - Out[i][id][0 : K] = {mismatch_value, mismatch_value, ...} - OutWeight[i][id] = 1.0 - - )DOC"); - } -}; - -template -struct NegTargetAssignFunctor { - void operator()(const phi::CPUContext& ctx, - const int* neg_indices, - const size_t* lod, - const int N, - const int M, - const int K, - const int mismatch_value, - T* out, - WT* out_wt) { - for (int i = 0; i < N; ++i) { - for (size_t j = lod[i]; j < lod[i + 1]; ++j) { - int id = neg_indices[j]; - int off = (i * M + id) * K; - for (int k = 0; k < K; ++k) { - out[off + k] = mismatch_value; - out_wt[off + k] = static_cast(1.0); - } - } - } - } -}; - -template struct NegTargetAssignFunctor; -template struct NegTargetAssignFunctor; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - target_assign, - ops::TargetAssignOp, - ops::TargetAssignOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -PD_REGISTER_STRUCT_KERNEL( - target_assign, CPU, ALL_LAYOUT, ops::TargetAssignKernel, int, float) {} diff --git a/paddle/fluid/operators/detection/target_assign_op.cu b/paddle/fluid/operators/detection/target_assign_op.cu deleted file mode 100644 index 951fcdbafae8e..0000000000000 --- a/paddle/fluid/operators/detection/target_assign_op.cu +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/detection/target_assign_op.h" - -namespace paddle { -namespace operators { - -template -__global__ void NegTargetAssignKernel(const int* neg_indices, - const size_t* lod, - const int N, - const int M, - const int K, - const int mismatch_value, - T* out, - WT* out_wt) { - int bidx = blockIdx.x; - int st = lod[bidx]; - int ed = lod[bidx + 1]; - - int row_start = bidx * M; - for (int i = st + threadIdx.x; i < ed; i += blockDim.x) { - int id = row_start + neg_indices[i]; - for (int k = 0; k < K; ++k) { - out[id * K + k] = T(mismatch_value); - out_wt[id * K + k] = WT(1.); - } - } -} - -template -struct NegTargetAssignFunctor { - void operator()(const phi::GPUContext& ctx, - const int* neg_indices, - const size_t* lod, - const int N, - const int M, - const int K, - const int mismatch_value, - T* out, - WT* out_wt) { - const int block_size = 256; - const int grid_size = N; - NegTargetAssignKernel<<>>( - neg_indices, lod, N, M, K, mismatch_value, out, out_wt); - } -}; - -template struct NegTargetAssignFunctor; -template struct NegTargetAssignFunctor; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL( - target_assign, GPU, ALL_LAYOUT, ops::TargetAssignKernel, int, float) {} diff --git a/paddle/fluid/operators/detection/target_assign_op.h b/paddle/fluid/operators/detection/target_assign_op.h deleted file mode 100644 index 484bd8454bae9..0000000000000 --- a/paddle/fluid/operators/detection/target_assign_op.h +++ /dev/null @@ -1,181 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { -template -struct TargetAssignFunctor { - const T* in_; - const int* match_indices_; - const size_t* lod_; - const int mismatch_value_; - const int64_t N_; - const int64_t M_; - const int64_t P_; - const int64_t K_; - - T* out_; - WT* out_wt_; - - TargetAssignFunctor(const T* input, - const int* match_indices, - const size_t* lod, - const int mismatch_value, - const int64_t N, - const int64_t M, - const int64_t P, - const int64_t K, - T* out, - WT* out_wt) - : in_(input), - match_indices_(match_indices), - lod_(lod), - mismatch_value_(mismatch_value), - N_(N), - M_(M), - P_(P), - K_(K), - out_(out), - out_wt_(out_wt) {} - - HOSTDEVICE void operator()(size_t i) const { - int h = i / M_; - int w = i - h * M_; - - size_t off = lod_[h]; - int id = match_indices_[i]; - - T* out = out_ + i * K_; - WT* out_wt = out_wt_ + i; - - if (id > -1) { - int w_off = w % P_; - const T* in = in_ + ((off + id) * P_ + w_off) * K_; - for (int64_t k = 0; k < K_; ++k) { - out[k] = in[k]; - } - out_wt[0] = static_cast(1.); - } else { - for (int64_t k = 0; k < K_; ++k) { - out[k] = static_cast(mismatch_value_); - } - out_wt[0] = static_cast(0.); - } - } -}; - -template -struct NegTargetAssignFunctor { - void operator()(const platform::DeviceContext& ctx, - const int* neg_indices, - const size_t* lod, - const int N, - const int M, - const int K, - const int mismatch_value, - T* out, - WT* out_wt) const; -}; - -template -class TargetAssignKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* match_indices = ctx.Input("MatchIndices"); - - auto* out = ctx.Output("Out"); - auto* out_wt = ctx.Output("OutWeight"); - - PADDLE_ENFORCE_EQ(x->lod().size(), - 1UL, - platform::errors::InvalidArgument( - "TargetAssignOp input(X) needs 1 level of LoD")); - int mismatch_value = ctx.Attr("mismatch_value"); - - const T* x_data = x->data(); - const int* match_idx_data = match_indices->data(); - - T* out_data = out->mutable_data(ctx.GetPlace()); - WT* out_wt_data = out_wt->mutable_data(ctx.GetPlace()); - - int64_t n = match_indices->dims()[0]; - int64_t m = match_indices->dims()[1]; - int64_t p = x->dims()[1]; - int64_t k = x->dims()[2]; - - auto x_lod = x->lod().back(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - phi::MixVector mixv_x_lod(&x_lod); - size_t* x_lod_data = mixv_x_lod.MutableData(ctx.GetPlace()); -#else - size_t* x_lod_data = x_lod.data(); -#endif - - TargetAssignFunctor functor(x_data, - match_idx_data, - x_lod_data, - mismatch_value, - n, - m, - p, - k, - out_data, - out_wt_data); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - mixv_x_lod.CopyToCPU(); -#endif - - auto& device_ctx = ctx.template device_context(); - platform::ForRange for_range(device_ctx, n * m); - for_range(functor); - - auto* neg_indices = ctx.Input("NegIndices"); - if (neg_indices) { - PADDLE_ENFORCE_EQ( - neg_indices->lod().size(), - 1UL, - platform::errors::InvalidArgument( - "TargetAssignOp input(NegIndices) needs 1 level of LoD")); - const int* neg_idx_data = neg_indices->data(); - auto neg_lod = neg_indices->lod().back(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - phi::MixVector mixv_neg_lod(&neg_lod); - size_t* neg_lod_data = mixv_neg_lod.MutableData(ctx.GetPlace()); -#else - size_t* neg_lod_data = neg_lod.data(); -#endif - 
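-      // NegTargetAssignFunctor is specialized per device context: the CPU
-      // specialization lives in target_assign_op.cc and the GPU one in
-      // target_assign_op.cu. For each instance i and each negative index id
-      // taken from [neg_lod[i], neg_lod[i + 1]), it fills row (i * M + id) of
-      // Out with mismatch_value and sets the corresponding OutWeight entries
-      // to 1, so the sampled negatives still contribute to the training loss.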
NegTargetAssignFunctor neg_trg_functor; - neg_trg_functor(device_ctx, - neg_idx_data, - neg_lod_data, - n, - m, - k, - mismatch_value, - out_data, - out_wt_data); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - mixv_neg_lod.CopyToCPU(); -#endif - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/mean_iou_op.cc b/paddle/fluid/operators/mean_iou_op.cc deleted file mode 100644 index d87c49187c2fb..0000000000000 --- a/paddle/fluid/operators/mean_iou_op.cc +++ /dev/null @@ -1,112 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/mean_iou_op.h" - -namespace paddle { -namespace operators { - -class MeanIoUOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("Predictions"), "Input", "Predictions", "MeanIoU"); - OP_INOUT_CHECK(ctx->HasInput("Labels"), "Input", "Labels", "MeanIoU"); - OP_INOUT_CHECK( - ctx->HasOutput("OutMeanIou"), "Output", "OutMeanIou", "MeanIoU"); - OP_INOUT_CHECK(ctx->HasOutput("OutWrong"), "Output", "OutWrong", "MeanIoU"); - OP_INOUT_CHECK( - ctx->HasOutput("OutCorrect"), "Output", "OutCorrect", "MeanIoU"); - - int64_t num_classes = - static_cast(ctx->Attrs().Get("num_classes")); - - ctx->SetOutputDim("OutMeanIou", phi::make_ddim({})); - ctx->SetOutputDim("OutWrong", {num_classes}); - ctx->SetOutputDim("OutCorrect", {num_classes}); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey( - OperatorWithKernel::IndicateVarDataType(ctx, "Predictions"), - ctx.GetPlace()); - } -}; - -class MeanIoUOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Predictions", - "(Tensor), A Tensor of prediction results for semantic labels" - " with type int32 or int64. The rank should be greater than 1."); - AddInput( - "Labels", - "(Tensor), A Tensor of ground truth labels with type int32 or int64." - "Its shape should be the same as Input(Predictions)."); - AddInput("InWrongs", - "(vector), A list of Tensor with shape " - "[num_classes]. They are used to collect wrong number among " - "batches. Empty list is also valid here.") - .AsDuplicable() - .AsDispensable(); - AddInput( - "InCorrects", - "(vector), A list of Tensor with shape " - "[num_classes]. They are used to collect correct number among batches. " - "Empty list is also valid here.") - .AsDuplicable() - .AsDispensable(); - AddInput("InMeanIou", - "(vector), A list of Tensor that Output(mean_iou) should " - "be added to. Empty list is also valid here.") - .AsDuplicable() - .AsDispensable(); - AddOutput("OutMeanIou", - "(vector), A Tensor representing the" - " mean intersection-over-union with shape []."); - AddOutput("OutWrong", "(Tensor), A Tensor with shape [num_classes]. 
"); - AddOutput("OutCorrect", "(Tensor), A Tensor with shape [num_classes]. "); - AddAttr("num_classes", "(int), The possible number of labels."); - - AddComment(R"DOC( -mean-IOU Operator. -Mean Intersection-Over-Union is a common evaluation metric for -semantic image segmentation, which first computes the IOU for each -semantic class and then computes the average over classes. -IOU is defined as follows: - IOU = true_positive / (true_positive + false_positive + false_negative). -It is based on pixel level area while "IOU Similarity Operator" -is based on area of rectangle. - -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - mean_iou, - ops::MeanIoUOp, - ops::MeanIoUOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -PD_REGISTER_STRUCT_KERNEL( - mean_iou, CPU, ALL_LAYOUT, ops::MeanIoUKernel, int, int64_t) {} diff --git a/paddle/fluid/operators/mean_iou_op.cu b/paddle/fluid/operators/mean_iou_op.cu deleted file mode 100644 index 46abb4b72910a..0000000000000 --- a/paddle/fluid/operators/mean_iou_op.cu +++ /dev/null @@ -1,170 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/mean_iou_op.h" -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/phi/backends/gpu/gpu_primitives.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using phi::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void CountCUDAKernel(const int num_classes, - const int count, - const T* predictions, - const T* labels, - int* wrong, - int* correct) { - extern __shared__ int blcok_cache[]; - int* wrong_c = blcok_cache; - int* correct_c = blcok_cache + num_classes; - // init cache - for (int i = threadIdx.x; i < num_classes * 2; i += blockDim.x) { - blcok_cache[i] = 0; - } - __syncthreads(); - - T pred; - T label; - CUDA_KERNEL_LOOP(i, count) { - pred = predictions[i]; - label = labels[i]; - if (pred == label) { - atomicAdd(correct_c + pred, 1); - } else { - atomicAdd(wrong_c + pred, 1); - atomicAdd(wrong_c + label, 1); - } - } - - __syncthreads(); - - for (int i = threadIdx.x; i < num_classes; i += blockDim.x) { - atomicAdd(wrong + i, wrong_c[i]); - atomicAdd(correct + i, correct_c[i]); - } -} - -__global__ void ComputeIoUCUDAKernel( - const int num_classes, int* wrong, int* correct, float* ious, float* iou) { - __shared__ int valid_count_c; - if (threadIdx.x == 0) { - valid_count_c = 0; - } - __syncthreads(); - CUDA_KERNEL_LOOP(i, num_classes) { - int wrong_n = wrong[i]; - int correct_n = correct[i]; - int denominator = wrong_n + correct_n; - if (denominator > 0) { - atomicAdd(&valid_count_c, 1); - ious[i] = static_cast(correct_n) / denominator; - } else { - ious[i] = 0; - } - } - __syncthreads(); - if (threadIdx.x == 0) { - float iou_sum = 0; - for (int i = 0; i < num_classes; ++i) { - iou_sum += ious[i]; - } - 
iou[0] += iou_sum / valid_count_c; - } -} - -template -class MeanIoUCUDAOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - auto& place = *dev_ctx.eigen_device(); - // get input and output tensor - auto* predictions = ctx.Input("Predictions"); - auto* labels = ctx.Input("Labels"); - auto* out_mean_iou = ctx.Output("OutMeanIou"); - auto* out_wrong = ctx.Output("OutWrong"); - auto* out_correct = ctx.Output("OutCorrect"); - int num_classes = static_cast(ctx.Attr("num_classes")); - - // Get data ptr - const T* predictions_data = predictions->data(); - const T* labels_data = labels->data(); - int* out_wrong_data = out_wrong->mutable_data(ctx.GetPlace()); - int* out_correct_data = out_correct->mutable_data(ctx.GetPlace()); - float* out_mean_iou_data = - out_mean_iou->mutable_data(ctx.GetPlace()); - - // Get Eigen tensor - auto out_mean_iou_t = EigenScalar::From(*out_mean_iou); - auto out_wrong_t = EigenTensor::From(*out_wrong); - auto out_correct_t = EigenTensor::From(*out_correct); - - // Temporary memory - auto tmp_ious_data = memory::Alloc( - dev_ctx.GetPlace(), - num_classes * sizeof(float), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); - float* ious_data = static_cast(tmp_ious_data->ptr()); - - // Init out_wrong, out_correct and out_mean_iou - out_wrong_t.device(place) = out_wrong_t.constant(0); - out_correct_t.device(place) = out_correct_t.constant(0); - out_mean_iou_t.device(place) = out_mean_iou_t.constant(0.0f); - - // collect pre wrong, correct and mean_iou - auto in_mean_ious = ctx.MultiInput("InMeanIou"); - for (int i = 0; i < in_mean_ious.size(); ++i) { - out_mean_iou_t.device(place) += - EigenScalar::From(*in_mean_ious[i]); - } - auto in_wrongs = ctx.MultiInput("InWrongs"); - for (int i = 0; i < in_wrongs.size(); ++i) { - out_wrong_t.device(place) += EigenTensor::From(*in_wrongs[i]); - } - auto in_corrects = ctx.MultiInput("InCorrects"); - for (int i = 0; i < in_corrects.size(); ++i) { - out_correct_t.device(place) += EigenTensor::From(*in_corrects[i]); - } - // compute - auto stream = ctx.cuda_device_context().stream(); - int block = PADDLE_CUDA_NUM_THREADS; - int grid = (predictions->numel() + block - 1) / block; - int cache_size = (num_classes * 2 + 1) * sizeof(int); - CountCUDAKernel - <<>>(num_classes, - predictions->numel(), - predictions_data, - labels_data, - out_wrong_data, - out_correct_data); - - ComputeIoUCUDAKernel<<<1, block, 0, stream>>>(num_classes, - out_wrong_data, - out_correct_data, - ious_data, - out_mean_iou_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL( - mean_iou, GPU, ALL_LAYOUT, ops::MeanIoUCUDAOpKernel, int, int64_t) {} diff --git a/paddle/fluid/operators/mean_iou_op.h b/paddle/fluid/operators/mean_iou_op.h deleted file mode 100644 index 8569d567c8f08..0000000000000 --- a/paddle/fluid/operators/mean_iou_op.h +++ /dev/null @@ -1,126 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -using EigenTensor = framework::EigenTensor; - -template -using EigenScalar = framework::EigenScalar; - -template -class MeanIoUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& place = - *ctx.template device_context().eigen_device(); - // get input and output tensor - auto* predictions = ctx.Input("Predictions"); - auto* labels = ctx.Input("Labels"); - auto* out_mean_iou = ctx.Output("OutMeanIou"); - auto* out_wrong = ctx.Output("OutWrong"); - auto* out_correct = ctx.Output("OutCorrect"); - int num_classes = static_cast(ctx.Attr("num_classes")); - - // get data ptr - const T* predictions_data = predictions->data(); - const T* labels_data = labels->data(); - float* out_mean_iou_data = - out_mean_iou->mutable_data(ctx.GetPlace()); - int* out_wrong_data = out_wrong->mutable_data(ctx.GetPlace()); - int* out_correct_data = out_correct->mutable_data(ctx.GetPlace()); - - // get eigen tensor - auto out_mean_iou_t = EigenScalar::From(*out_mean_iou); - auto out_wrong_t = EigenTensor::From(*out_wrong); - auto out_correct_t = EigenTensor::From(*out_correct); - - // Tmp tensor - phi::DenseTensor denominator; - phi::DenseTensor valid_count; - phi::DenseTensor iou_sum; - - // get data ptr of tmp tensor - int* denominator_data = denominator.mutable_data( - {static_cast(num_classes)}, ctx.GetPlace()); - int* valid_count_data = valid_count.mutable_data({1}, ctx.GetPlace()); - float* iou_sum_data = iou_sum.mutable_data({1}, ctx.GetPlace()); - - // get eigen tensor of tmp tensor - auto denominator_t = EigenTensor::From(denominator); - auto valid_count_t = EigenTensor::From(valid_count); - auto iou_sum_t = EigenTensor::From(iou_sum); - - // init out_wrong, out_correct and out_mean_iou - out_wrong_t = out_wrong_t.constant(0); - out_correct_t = out_correct_t.constant(0); - out_mean_iou_t = out_mean_iou_t.constant(0); - - // collect pre wrong, correct and mean_iou - auto in_mean_ious = ctx.MultiInput("InMeanIou"); - for (size_t i = 0; i < in_mean_ious.size(); ++i) { - out_mean_iou_t.device(place) += - EigenScalar::From(*in_mean_ious[i]); - } - - auto in_wrongs = ctx.MultiInput("InWrongs"); - for (size_t i = 0; i < in_wrongs.size(); ++i) { - out_wrong_t.device(place) += EigenTensor::From(*in_wrongs[i]); - } - auto in_corrects = ctx.MultiInput("InCorrects"); - for (size_t i = 0; i < in_corrects.size(); ++i) { - out_correct_t.device(place) += EigenTensor::From(*in_corrects[i]); - } - - // compute - for (int64_t i = 0; i < predictions->numel(); ++i) { - if (predictions_data[i] == labels_data[i]) { - out_correct_data[predictions_data[i]] += 1; - } else { - out_wrong_data[labels_data[i]] += 1; - out_wrong_data[predictions_data[i]] += 1; - } - } - - denominator_t = out_wrong_t + out_correct_t; - valid_count_t = - (denominator_t > denominator_t.constant(0.0f)).cast().sum(); - - for (int i = 0; i < num_classes; ++i) { - if (denominator_data[i] == 0) { - denominator_data[i] = 1; - } - } - - iou_sum_t = - (out_correct_t.cast() / denominator_t.cast()).sum(); - out_mean_iou_data[0] += (iou_sum_data[0] / valid_count_data[0]); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc 
b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc deleted file mode 100644 index 3261e96cbbeca..0000000000000 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc +++ /dev/null @@ -1,137 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/optimizers/proximal_adagrad_op.h" - -namespace paddle { -namespace operators { - -class ProximalAdagradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("Param"), "Input", "Param", "ProximalAdagradOp"); - OP_INOUT_CHECK( - ctx->HasInput("Moment"), "Input", "Moment", "ProximalAdagradOp"); - OP_INOUT_CHECK(ctx->HasInput("Grad"), "Input", "Grad", "ProximalAdagradOp"); - OP_INOUT_CHECK(ctx->HasInput("LearningRate"), - "Input", - "LearningRate", - "ProximalAdagradOp"); - - OP_INOUT_CHECK( - ctx->HasOutput("ParamOut"), "Output", "ParamOut", "ProximalAdagradOp"); - OP_INOUT_CHECK(ctx->HasOutput("MomentOut"), - "Output", - "MomentOut", - "ProximalAdagradOp"); - - auto param_dim = ctx->GetInputDim("Param"); - PADDLE_ENFORCE_EQ(param_dim, - ctx->GetInputDim("Grad"), - platform::errors::InvalidArgument( - "The shape of Intput(Param) should be equal to the " - "Input(Grad) of ProximalAdagrad Op. But received " - "Input(Param).dimensions=[%s], " - "Input(Grad).dimensions=[%s]", - param_dim, - ctx->GetInputDim("Grad"))); - - PADDLE_ENFORCE_EQ(param_dim, - ctx->GetInputDim("Moment"), - platform::errors::InvalidArgument( - "The shape of Intput(Param) should be equal to the " - "Input(Moment) of ProximalAdagrad Op. But received " - "Input(Param).dimensions=[%s], " - "Input(Moment).dimensions=[%s]", - param_dim, - ctx->GetInputDim("Moment"))); - - auto lr_dim = ctx->GetInputDim("LearningRate"); - PADDLE_ENFORCE_EQ( - phi::product(lr_dim), - 1, - platform::errors::InvalidArgument( - "Learning Rate should be a scalar. 
But received dimension[%s]", - lr_dim)); - - ctx->SetOutputDim("ParamOut", param_dim); - ctx->SetOutputDim("MomentOut", param_dim); - } - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "Param"), - ctx.GetPlace()); - } -}; - -class ProximalAdagradOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Param", - "(Tensor, default Tensor) " - "Input parameter that has to be updated."); - AddInput("Moment", - "(Tensor, default Tensor) " - "Moment parameter that has to be updated."); - AddInput("Grad", - "(Tensor, default Tensor) " - "Input gradient of the parameter."); - AddInput("LearningRate", - "(Tensor, default Tensor) " - "The learning rate should be a tensor of size 1."); - - AddOutput("ParamOut", "(Tensor) Output updated parameter value."); - AddOutput("MomentOut", "(Tensor) Output updated moment value."); - - AddAttr("l1", - "(float, default 0.0) " - "L1 regularization strength.") - .SetDefault(0.0f); - AddAttr("l2", - "(float, default 0.0) " - "L2 regularization strength.") - .SetDefault(0.0f); - AddComment(R"DOC( -Proximal Adagrad Optimizer. - -Optimizer that implements the proximal adagrad algorithm: - -$$ -moment = moment + grad * grad \\ -prox\_param = param - learning\_rate * grad * (1 / \sqrt{moment}) \\ -param = sign(prox\_param) / (1 + learning\_rate * l2) * - \max(|prox\_param| - learning\_rate * l1 , 0) -$$ - -The paper that proposed Proximal GD: -(http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf) -Here, we use the adagrad learning rate as specified here: -(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) - -)DOC"); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(proximal_adagrad, - ops::ProximalAdagradOp, - ops::ProximalAdagradOpMaker); -PD_REGISTER_STRUCT_KERNEL( - proximal_adagrad, CPU, ALL_LAYOUT, ops::ProximalAdagradOpKernel, float) {} diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu deleted file mode 100644 index 0a79dcd425f12..0000000000000 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu +++ /dev/null @@ -1,17 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -You may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software distributed -under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -CONDITIONS OF ANY KIND, either express or implied. See the License for the -specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/optimizers/proximal_adagrad_op.h" - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL( - proximal_adagrad, GPU, ALL_LAYOUT, ops::ProximalAdagradOpKernel, float) {} diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.h b/paddle/fluid/operators/optimizers/proximal_adagrad_op.h deleted file mode 100644 index 973d870d14f31..0000000000000 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
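For reference, the update rule in the DOC block above reduces, per parameter
element, to the following sketch (illustrative only; the helper name and
signature are invented here, not part of the deleted operator):

#include <algorithm>  // std::max
#include <cmath>      // std::sqrt, std::fabs

// One proximal-adagrad step: accumulate the squared gradient, take an
// adagrad step, then apply L1 soft-thresholding and L2 shrinkage.
inline void ProximalAdagradStep(
    float& param, float& moment, float grad, float lr, float l1, float l2) {
  moment += grad * grad;
  const float prox = param - lr * grad / std::sqrt(moment);
  const float sign = (prox > 0.0f) ? 1.0f : -1.0f;
  param = sign * std::max(std::fabs(prox) - lr * l1, 0.0f) / (1.0f + lr * l2);
}

With l1 == 0 this collapses to prox / (1 + lr * l2), matching the else branch
of the Eigen implementation in proximal_adagrad_op.h below.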
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class ProximalAdagradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* param_out = ctx.Output("ParamOut"); - auto* moment_out = ctx.Output("MomentOut"); - - param_out->mutable_data(ctx.GetPlace()); - moment_out->mutable_data(ctx.GetPlace()); - - auto l1 = static_cast(ctx.Attr("l1")); - auto l2 = static_cast(ctx.Attr("l2")); - - auto grad = ctx.Input("Grad"); - auto p = framework::EigenVector::Flatten( - *ctx.Input("Param")); - auto m = framework::EigenVector::Flatten( - *ctx.Input("Moment")); - auto g = framework::EigenVector::Flatten(*grad); - auto lr = framework::EigenVector::Flatten( - *ctx.Input("LearningRate")); - - auto p_out = framework::EigenVector::Flatten(*param_out); - auto m_out = framework::EigenVector::Flatten(*moment_out); - auto* place = ctx.template device_context().eigen_device(); - - Eigen::DSizes grad_dsize(grad->numel()); - - m_out.device(*place) = m + g * g; - auto prox_param = p - lr.broadcast(grad_dsize) * g / m_out.sqrt(); - if (l1 > static_cast(0)) { - p_out.device(*place) = - prox_param.sign() * - (((prox_param.abs() - (lr * l1).broadcast(grad_dsize)) - .cwiseMax(static_cast(0.0))) / - (static_cast(1.0) + (lr * l2).broadcast(grad_dsize))); - } else { - p_out.device(*place) = - prox_param / (static_cast(1.0) + (lr * l2).broadcast(grad_dsize)); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/unity_build_rule.cmake b/paddle/fluid/operators/optimizers/unity_build_rule.cmake index 4485465ddf9eb..dae61e7cfcf26 100644 --- a/paddle/fluid/operators/optimizers/unity_build_rule.cmake +++ b/paddle/fluid/operators/optimizers/unity_build_rule.cmake @@ -8,7 +8,6 @@ register_unity_group( cc ftrl_op.cc lars_momentum_op.cc - proximal_adagrad_op.cc proximal_gd_op.cc decayed_adagrad_op.cc adadelta_op.cc @@ -19,7 +18,6 @@ register_unity_group( lars_momentum_op.cu momentum_op.cu sgd_op.cu - proximal_adagrad_op.cu adagrad_op.cu decayed_adagrad_op.cu adadelta_op.cu diff --git a/paddle/fluid/operators/sample_logits_op.cc b/paddle/fluid/operators/sample_logits_op.cc deleted file mode 100644 index 1e2e27f460871..0000000000000 --- a/paddle/fluid/operators/sample_logits_op.cc +++ /dev/null @@ -1,282 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/sample_logits_op.h" - -#include - -#include "paddle/fluid/operators/math/sample_prob.h" - -namespace paddle { -namespace operators { - -class SampleLogitsOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Logits", - "(Tensor, default: Tensor), The unscaled log probabilities " - "which is a 2-D tensor with shape [N x K]. N is the batch_size, " - "and K is the class number."); - AddInput("Labels", - "(Tensor) The ground truth which is a 2-D tensor. Labels is a " - "Tensor with shape [N x NT], where NT is the number of" - "true labels for each example."); - AddInput("CustomizedSamples", - "(Tensor, default: Tensor), A 2-D tensor with shape [N, " - "NT + S]," - " where N is the batch size, NT is the number of true labels " - "and S is the number of negtive sample for each example." - "The first NT elements of each row should be the same with true " - "labels, " - "followed by S custom negtive samples. This tensor" - "is only used when use_customized_samples is true.") - .AsDispensable(); - AddInput( - "CustomizedProbabilities", - "(Tensor, default: Tensor), A 2-D tensor with shape [N, NT + S]." - "The tensor has the same shape with CustomSamples," - "and each element represents probability of element in CustomSamples. " - "This " - "tensor is only used when use_customized_samples is true.") - .AsDispensable(); - AddOutput("Samples", - "(Tensor, default: Tensor), A 2-D tensor with shape [N, " - "NT + S]." - "The outputs value of sampler, including NT true lables and S " - "negetive samples " - "for each example. This will be used in" - "backward calculation.") - .AsIntermediate(); - AddOutput( - "Probabilities", - "(Tensor, default: Tensor), A 2-D tensor with shape [N, NT + S]." - "The probabilities of sampled positive and negtive labels.") - .AsIntermediate(); - AddOutput("LogitsDim", "Store dim information of Logits for gradient op") - .AsIntermediate(); - AddOutput("LabelsDim", "Store dim information of Logits for gradient op") - .AsIntermediate(); - AddOutput("SampledLogits", - "(Tensor, default: Tensor), A 2-D tensor with shape" - "[N, NT + S]. The outputs value of sampled logits, which will be" - "used in backward propagation.") - .AsIntermediate(); - AddOutput( - "SampledLabels", - "(Tensor, default: Tensor), A 2-D tensor. The sampled labels" - "with shape [N, NT]. The tonsor contains hard labels as input to " - " softmax op, that is 0, 1, ..., NT-1 because of the first NT elements" - " of Sampels are positive lables."); - AddAttr( - "use_customized_samples", - "An indicator whether to use customized samples with probabilities, if " - "True" - "the operator will use customized samples and customized probabilities" - "otherwise, the operator will generate them by itself.") - .SetDefault(false); - AddAttr( - "uniq", - "An indicator whether to sample non-repetitive negtive labels, if True" - "the operator will sample negtive labels without replacement." 
- "Otherwise, the operator will sample negtive labels with replacement.") - .SetDefault(true); - AddAttr( - "remove_accidental_hits", - "An indicator whether to remove accidental hits when samples hits true" - "labels, the removal is implemented by subtracting the corresponding" - "logits by float_max to subpress their softmax to be zero.") - .SetDefault(true); - AddAttr("num_samples", "The number of negative samples."); - AddAttr("seed", "Random seed for generating samples").SetDefault(0); - - AddComment(R"DOC( - """ - Computes sampled output training logits and labels suitable for implementing - sampled softmax. - """ - -)DOC"); - } -}; - -class SampleLogitsOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("Labels"), "Input", "Logits", "SampleLogitsOp"); - OP_INOUT_CHECK( - ctx->HasInput("Labels"), "Input", "Logits", "SampleLogitsOp"); - - OP_INOUT_CHECK( - ctx->HasOutput("Samples"), "Output", "Samples", "SampleLogitsOp"); - OP_INOUT_CHECK(ctx->HasOutput("Probabilities"), - "Output", - "Probabilities", - "SampleLogitsOp"); - OP_INOUT_CHECK(ctx->HasOutput("SampledLogits"), - "Output", - "SampledLogits", - "SampleLogitsOp"); - OP_INOUT_CHECK(ctx->HasOutput("SampledLabels"), - "Output", - "SampledLabels", - "SampleLogitsOp"); - OP_INOUT_CHECK( - ctx->HasOutput("LogitsDim"), "Output", "LogitsDim", "SampleLogitsOp"); - OP_INOUT_CHECK( - ctx->HasOutput("LabelsDim"), "Output", "LabelsDim", "SampleLogitsOp"); - - auto logits_dims = ctx->GetInputDim("Logits"); - auto labels_dims = ctx->GetInputDim("Labels"); - - PADDLE_ENFORCE_EQ(logits_dims.size(), - 2UL, - platform::errors::InvalidArgument( - "Input(Logits) of SampleLogitsOp should be 2D. " - "But received shape = [%s] and dimension is %d.", - logits_dims, - logits_dims.size())); - PADDLE_ENFORCE_EQ(labels_dims.size(), - 2UL, - platform::errors::InvalidArgument( - "Input(Labels) of SampleLogitsOp should be 2D. 
" - "But received shape = [%s] and dimension is %d.", - labels_dims, - labels_dims.size())); - - const int num_samples = ctx->Attrs().Get("num_samples"); - int num_sampled_classes = static_cast(labels_dims[1] + num_samples); - if ((!ctx->IsRuntime()) && labels_dims[1] <= 0) { - num_sampled_classes = -1; - } - ctx->SetOutputDim("Samples", {logits_dims[0], num_sampled_classes}); - ctx->SetOutputDim("Probabilities", {logits_dims[0], num_sampled_classes}); - ctx->SetOutputDim("SampledLogits", {logits_dims[0], num_sampled_classes}); - ctx->SetOutputDim("SampledLabels", {logits_dims[0], labels_dims[1]}); - - // append 0 to shape variable to avoid optimized by memory optimize pass - auto logits_dim_vec = phi::vectorize(logits_dims); - logits_dim_vec.push_back(0); - ctx->SetOutputDim("LogitsDim", phi::make_ddim(logits_dim_vec)); - - auto labels_dim_vec = phi::vectorize(labels_dims); - labels_dim_vec.push_back(0); - ctx->SetOutputDim("LabelsDim", phi::make_ddim(labels_dim_vec)); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Logits"); - return phi::KernelKey(data_type, ctx.GetPlace()); - } -}; - -// UNDERSTAND: InferShape for Grad -class SampleLogitsOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("LogitsDim"), "Input", "LogitsDim", "SampleLogitsOpGrad"); - OP_INOUT_CHECK( - ctx->HasInput("LabelsDim"), "Input", "LabelsDim", "SampleLogitsOpGrad"); - OP_INOUT_CHECK(ctx->HasInput("Samples"), - "Input", - "SamplesabelsDim", - "SampleLogitsOpGrad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("SampledLogits")), - "Input", - "SampledLogits@GRAD", - "SampleLogitsOpGrad"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Logits")), - "Output", - "Logits@GRAD", - "SampleLogitsOpGrad"); - - auto logits_dims = ctx->GetInputDim("LogitsDim"); - logits_dims = framework::DDim(logits_dims.Get(), logits_dims.size() - 1); - auto labels_dims = ctx->GetInputDim("LabelsDim"); - labels_dims = framework::DDim(labels_dims.Get(), labels_dims.size() - 1); - PADDLE_ENFORCE_EQ( - logits_dims.size(), - 2UL, - platform::errors::InvalidArgument( - "Input(LogitsDim) of SampleLogitsOpGrad should be 2D. " - "But received shape = [%s] and dimension is %d.", - logits_dims, - logits_dims.size())); - PADDLE_ENFORCE_EQ( - labels_dims.size(), - 2UL, - platform::errors::InvalidArgument( - "Input(LabelsDim) of SampleLogitsOpGrad should be 2D. 
" - "But received shape = [%s] and dimension is %d.", - labels_dims, - labels_dims.size())); - - ctx->SetOutputDim(framework::GradVarName("Logits"), logits_dims); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("SampledLogits")); - return phi::KernelKey(data_type, ctx.GetPlace()); - } -}; - -// UNDERSTAND: what's the rule for making a GradMaker TODO - -template -class SampleLogitsGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("sample_logits_grad"); - grad_op->SetInput("LogitsDim", this->Output("LogitsDim")); - grad_op->SetInput("LabelsDim", this->Output("LabelsDim")); - grad_op->SetInput("Samples", this->Output("Samples")); - grad_op->SetInput(framework::GradVarName("SampledLogits"), - this->OutputGrad("SampledLogits")); - grad_op->SetOutput(framework::GradVarName("Logits"), - this->InputGrad("Logits")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(sample_logits, - ops::SampleLogitsOp, - ops::SampleLogitsOpMaker, - ops::SampleLogitsGradMaker, - ops::SampleLogitsGradMaker); -REGISTER_OPERATOR(sample_logits_grad, ops::SampleLogitsOpGrad); -PD_REGISTER_STRUCT_KERNEL( - sample_logits, CPU, ALL_LAYOUT, ops::SampleLogitsKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL(sample_logits_grad, - CPU, - ALL_LAYOUT, - ops::SampleLogitsGradKernel, - float, - double) {} diff --git a/paddle/fluid/operators/sample_logits_op.cu b/paddle/fluid/operators/sample_logits_op.cu deleted file mode 100644 index 6a853f71e6f32..0000000000000 --- a/paddle/fluid/operators/sample_logits_op.cu +++ /dev/null @@ -1,301 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/sample_prob.h" -#include "paddle/fluid/operators/sample_logits_op.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/softmax.h" - -namespace paddle { -namespace operators { - -// UNDERSTAND: something like take_along_axis in numpy. 
-template <typename T>
-__global__ void GPUTakeAlongD1(size_t size,
-                               const int batch_size,
-                               const int array_slice_size,
-                               const int idx_slice_size,
-                               const T* p_array,
-                               const int64_t* p_index,
-                               T* p_value) {
-  const auto value_slice_size = idx_slice_size;
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  int step_size = blockDim.x * gridDim.x;
-
-  for (; idx < size; idx += step_size) {
-    int i = idx / idx_slice_size;
-    auto array_index = p_index[idx];
-    p_value[idx] = p_array[i * array_slice_size + array_index];
-  }
-}
-
-// UNDERSTAND: something like put_along_axis in numpy, but if there are
-// duplicate indices the scatter is done in a += way.
-template <typename T>
-__global__ void GPUPutAlongD1(size_t size,
-                              const int batch_size,
-                              const int array_slice_size,
-                              const int idx_slice_size,
-                              T* p_array,
-                              const int64_t* p_index,
-                              const T* p_value) {
-  const auto value_slice_size = idx_slice_size;
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  int step_size = blockDim.x * gridDim.x;
-
-  // size == batch_size
-  for (; idx < size; idx += step_size) {
-    int i = idx;
-    for (int j = 0; j < idx_slice_size; ++j) {
-      auto array_index = p_index[i * idx_slice_size + j];
-      p_array[i * array_slice_size + array_index] +=
-          p_value[i * idx_slice_size + j];
-    }
-  }
-}
-
-// UNDERSTAND: set label as 0,1,...,num_true-1
-template <typename T>
-__global__ void GPUSetLabel(size_t size, const int num_true, int64_t* p_array) {
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  int step_size = blockDim.x * gridDim.x;
-
-  for (; idx < size; idx += step_size) {
-    p_array[idx] = idx % num_true;
-  }
-}
-
-// UNDERSTAND: compute accidental hits from samples and subtract the
-// corresponding logits by a float max, here 1e20
-template <typename T>
-__global__ void gpu_compute_remove_accidental_hits(const int size,
-                                                   const int num_true,
-                                                   const int idx_slice_size,
-                                                   const int64_t* p_index,
-                                                   T* p_value) {
-  const auto value_slice_size = idx_slice_size;
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  int step_size = blockDim.x * gridDim.x;
-
-  for (; idx < size; idx += step_size) {
-    int i = idx / idx_slice_size;
-    if (idx % idx_slice_size < num_true) continue;
-    for (int j = 0; j < num_true; ++j) {
-      const auto true_idx = i * idx_slice_size + j;
-      if (p_index[true_idx] == p_index[idx]) {
-        p_value[idx] -= 1e20;
-        break;
-      }
-    }
-  }
-}
-
-template <typename T, typename DeviceContext>
-class SampleLogitsCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    // get necessary inputs
-    const phi::DenseTensor* logits = context.Input<phi::DenseTensor>("Logits");
-    const phi::DenseTensor* labels = context.Input<phi::DenseTensor>("Labels");
-    VLOG(3) << "Enter SampleLogitsCUDAKernel";
-
-    // get necessary outputs
-    phi::DenseTensor* samples = context.Output<phi::DenseTensor>("Samples");
-    phi::DenseTensor* probabilities =
-        context.Output<phi::DenseTensor>("Probabilities");
-    phi::DenseTensor* sampled_logits =
-        context.Output<phi::DenseTensor>("SampledLogits");
-    phi::DenseTensor* sampled_labels =
-        context.Output<phi::DenseTensor>("SampledLabels");
-
-    // shapes
-    const auto batch_size = logits->dims()[0];
-    const auto num_classes = logits->dims()[1];
-    const auto labels_dim = labels->dims();
-    const auto num_true = labels_dim[1];
-    const auto samples_dim = samples->dims();
-
-    // attrs
-    const auto num_samples = context.Attr<int>("num_samples");
-    const bool use_customized_samples =
-        context.Attr<bool>("use_customized_samples");
-    const bool uniq = context.Attr<bool>("uniq");
-    const bool remove_accidental_hits =
-        context.Attr<bool>("remove_accidental_hits");
-
-    // device contexts
-    auto& dev_ctx = context.cuda_device_context();
-
-    // UNDERSTAND: 
allocate memories for temporaries - sampled_logits->mutable_data(samples_dim, context.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, sampled_logits, static_cast(0)); - - auto sampled_labels_data = - sampled_labels->mutable_data(labels_dim, context.GetPlace()); - int threads = 512; - size_t size = batch_size * num_true; - int grid = (size + threads - 1) / threads; - GPUSetLabel - <<>>( - size, num_true, sampled_labels_data); - - if (use_customized_samples) { - const phi::DenseTensor* customized_samples = - context.Input("CustomizedSamples"); - const phi::DenseTensor* customized_probabilities = - context.Input("CustomizedProbabilities"); - PADDLE_ENFORCE_EQ( - customized_samples, - samples, - platform::errors::InvalidArgument( - "CustomizedSamples must be the same phi::DenseTensor with " - "Samples when use_customized_samples = True")); - PADDLE_ENFORCE_EQ( - customized_probabilities, - probabilities, - platform::errors::InvalidArgument( - "CustomizedProbabilities must be the same phi::DenseTensor with " - "Probabilities when use_customized_samples = True")); - } else { - samples->mutable_data(context.GetPlace()); - probabilities->mutable_data(samples_dim, context.GetPlace()); - // UNDERSTAND: sampling - const auto seed = context.Attr("seed"); - auto sampler_with_prob = math::GPUSampleWithProb(); - sampler_with_prob(context.cuda_device_context(), - seed, - num_classes, - uniq, - num_samples, - labels, - samples, - probabilities); - } - - // UNDERSTAND: gather sampled logits and remove accidental hits if needed - const auto num_take = samples->dims()[1]; - const auto array_dims = logits->dims(); - const auto idx_dims = samples->dims(); - - const T* p_array = logits->data(); - const int64_t* p_index = samples->data(); - T* p_value = sampled_logits->data(); - - // src slice size - const auto array_slice_size = array_dims[1]; - // index slice size - const auto idx_slice_size = idx_dims[1]; - - size = batch_size * num_take; - grid = (size + threads - 1) / threads; - GPUTakeAlongD1 - <<>>( - size, - batch_size, - array_slice_size, - idx_slice_size, - p_array, - p_index, - p_value); - - if (remove_accidental_hits) { - const size_t size = batch_size * (num_true + num_samples); - int grid = (size + threads - 1) / threads; - gpu_compute_remove_accidental_hits - <<>>( - size, num_true, idx_slice_size, p_index, p_value); - } - - // subtracted sampled logits with logQ(y|x) - auto probs = EigenMatrix::From(*probabilities); - auto smp_logits = EigenMatrix::From(*sampled_logits); - smp_logits.device(*dev_ctx.eigen_device()) = - (smp_logits - probs.log().unaryExpr(TolerableValue())) - .unaryExpr(TolerableValue()); - } -}; - -template -class SampleLogitsGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto logits_grad = - context.Output(framework::GradVarName("Logits")); - const phi::DenseTensor* samples = - context.Input("Samples"); - const phi::DenseTensor* sampled_logits_grad = - context.Input( - framework::GradVarName("SampledLogits")); - logits_grad->mutable_data(context.GetPlace()); - - auto& dev_ctx = context.cuda_device_context(); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, logits_grad, static_cast(0)); - - // UNDERSTAND: scatter it back to logit_grad - const auto batch_size = samples->dims()[0]; - const auto num_put = samples->dims()[1]; - const auto array_dims = logits_grad->dims(); - const auto idx_dims = samples->dims(); - - T* p_array = logits_grad->data(); - const int64_t* 
p_index = samples->data(); - const T* p_value = sampled_logits_grad->data(); - - // src slice size - const auto array_slice_size = array_dims[1]; - // index slice size - const auto idx_slice_size = idx_dims[1]; - - int threads = 128; - const size_t size = batch_size; - int grid = (size + threads - 1) / threads; - - GPUPutAlongD1 - <<>>( - size, - batch_size, - array_slice_size, - idx_slice_size, - p_array, - p_index, - p_value); - } -}; - -} // namespace operators -} // namespace paddle -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL(sample_logits, - GPU, - ALL_LAYOUT, - ops::SampleLogitsCUDAKernel, - float, - double) {} -PD_REGISTER_STRUCT_KERNEL(sample_logits_grad, - GPU, - ALL_LAYOUT, - ops::SampleLogitsGradCUDAKernel, - float, - double) {} diff --git a/paddle/fluid/operators/sample_logits_op.h b/paddle/fluid/operators/sample_logits_op.h deleted file mode 100644 index bf58a054dad2d..0000000000000 --- a/paddle/fluid/operators/sample_logits_op.h +++ /dev/null @@ -1,330 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/sample_prob.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/softmax.h" - -namespace paddle { -namespace operators { - -template -using EigenMatrix = framework::EigenMatrix; - -template -struct TolerableValue { - HOSTDEVICE T operator()(const T& x) const { - PADDLE_ENFORCE(std::is_floating_point::value, - "TolerableValue should be float in sample_logits_op."); - const T kApproInf = 1e20; - if (x == INFINITY) return kApproInf; - if (x == -INFINITY) return -kApproInf; - return x; - } -}; - -// UNDERSTAND: something like take_along_axis in numpy. -template -static void CPUTakeAlongD1(const platform::DeviceContext& ctx, - const phi::DenseTensor& array, - const phi::DenseTensor& index, - phi::DenseTensor* value) { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), - true, - platform::errors::InvalidArgument("This kernel only runs on CPU.")); - // UNDERSTAND: check shape src(B, C), index(B, K), out should also be (B, K) - const auto batch_size = index.dims()[0]; - const auto num_take = index.dims()[1]; - const auto array_dims = array.dims(); - const auto idx_dims = index.dims(); - PADDLE_ENFORCE_EQ(idx_dims.size(), - 2, - platform::errors::InvalidArgument( - "index of CPUTakeAlongD1 should be 2D. " - "But received shape = [%s] and dimension is %d.", - idx_dims, - idx_dims.size())); - PADDLE_ENFORCE_EQ(array_dims.size(), - 2, - platform::errors::InvalidArgument( - "array of CPUTakeAlongD1 should be 2D. 
" - "But received shape = [%s] and dimension is %d.", - array_dims, - array_dims.size())); - PADDLE_ENFORCE_EQ(idx_dims[0], - array_dims[0], - platform::errors::InvalidArgument( - "The first dimension of index and array of " - "CPUTakeAlongD1 should be equal. " - "But received index shape = [%s], array shape = [%s], " - "and the first dimensions are %d and %d.", - idx_dims, - array_dims, - idx_dims[0], - array_dims[0])); - PADDLE_ENFORCE_EQ( - idx_dims, - value->dims(), - platform::errors::InvalidArgument( - "index and array of CPUTakeAlongD1 should have the same shape. " - "But received index shape = [%s], array shape = [%s].", - idx_dims, - value->dims())); - - // UNDERSTAND: no allocations here - const T* p_array = array.data(); - const int64_t* p_index = index.data(); - T* p_value = value->data(); - - // src slice size - const auto array_slice_size = array_dims[1]; - - // index slice size - const auto idx_slice_size = idx_dims[1]; - const auto value_slice_size = idx_slice_size; - - for (int i = 0; i < batch_size; ++i) { - for (int j = 0; j < num_take; ++j) { - auto array_index = p_index[i * idx_slice_size + j]; - p_value[i * value_slice_size + j] = - p_array[i * array_slice_size + array_index]; - } - } -} - -// UNDERSTAND: something like put_along_axis in numpy but if there is duplicate -// indices, scatter is done in += way. -template -static void CPUPutAlongD1(const platform::DeviceContext& ctx, - phi::DenseTensor* array, - const phi::DenseTensor& index, - const phi::DenseTensor& value) { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), - true, - platform::errors::InvalidArgument("This kernel only runs on CPU.")); - // UNDERSTAND: check shape src(B, C), index(B, K), out should also be (B, K) - const auto batch_size = index.dims()[0]; - const auto num_put = index.dims()[1]; - auto array_dims = array->dims(); - auto idx_dims = index.dims(); - PADDLE_ENFORCE_EQ(idx_dims.size(), - 2, - platform::errors::InvalidArgument( - "index of CPUPutAlongD1 should be 2D. " - "But received shape = [%s] and dimension is %d.", - idx_dims, - idx_dims.size())); - PADDLE_ENFORCE_EQ(array_dims.size(), - 2, - platform::errors::InvalidArgument( - "array of CPUPutAlongD1 should be 2D. " - "But received shape = [%s] and dimension is %d.", - array_dims, - array_dims.size())); - PADDLE_ENFORCE_EQ(idx_dims[0], - array_dims[0], - platform::errors::InvalidArgument( - "The first dimension of index and array of " - "CPUPutAlongD1 should be equal. " - "But received index shape = [%s], array shape = [%s], " - "and the first dimensions are %d and %d.", - idx_dims, - array_dims, - idx_dims[0], - array_dims[0])); - PADDLE_ENFORCE_EQ( - idx_dims, - value.dims(), - platform::errors::InvalidArgument( - "index and array of CPUPutAlongD1 should have the same shape. 
" - "But received index shape = [%s], array shape = [%s].", - idx_dims, - value.dims())); - - // UNDERSTAND: no allocations here - T* p_array = array->data(); - const int64_t* p_index = index.data(); - const T* p_value = value.data(); - - // slice sizes - const auto array_slice_size = array_dims[1]; - const auto idx_slice_size = idx_dims[1]; - const auto value_slice_size = idx_slice_size; - - for (int i = 0; i < batch_size; ++i) { - for (int j = 0; j < num_put; ++j) { - auto array_index = p_index[i * idx_slice_size + j]; - p_array[i * array_slice_size + array_index] += - p_value[i * value_slice_size + j]; - } - } -} - -// UNDERSTAND: compute accidentdal hits from samples and minus corresponding -// logits by a float max, here 1e20 -template -static void compute_remove_accidental_hits(const platform::DeviceContext& ctx, - phi::DenseTensor* sampled_logits, - const phi::DenseTensor& samples, - const int num_true) { - const auto batch_size = sampled_logits->dims()[0]; - const auto num_sampled_classes = sampled_logits->dims()[1]; - T* sampled_logits_data = sampled_logits->data(); - const auto samples_data = samples.data(); - - std::unordered_set tmp_true_labels; - for (int i = 0; i < batch_size; ++i) { - tmp_true_labels.clear(); - tmp_true_labels.insert(samples_data + i * num_sampled_classes, - samples_data + i * num_sampled_classes + num_true); - for (int j = num_true; j < num_sampled_classes; ++j) { - const auto idx = i * num_sampled_classes + j; - if (tmp_true_labels.find(samples_data[idx]) != tmp_true_labels.end()) - sampled_logits_data[idx] -= 1e20; - } - } -} - -template -class SampleLogitsKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(context.GetPlace()), - true, - platform::errors::InvalidArgument("this kernel only runs on cpu.")); - VLOG(3) << "Enter SampleLogitsKernel"; - // get necessary inputs - const phi::DenseTensor* logits = context.Input("Logits"); - const phi::DenseTensor* labels = context.Input("Labels"); - - // get necessary outputs - phi::DenseTensor* samples = context.Output("Samples"); - phi::DenseTensor* probabilities = - context.Output("Probabilities"); - phi::DenseTensor* sampled_logits = - context.Output("SampledLogits"); - phi::DenseTensor* sampled_labels = - context.Output("SampledLabels"); - - // shapes - const auto batch_size = logits->dims()[0]; - const auto num_classes = logits->dims()[1]; - const auto labels_dim = labels->dims(); - const auto num_true = labels_dim[1]; - const auto samples_dim = samples->dims(); - - // attrs - const auto num_samples = context.Attr("num_samples"); - const bool use_customized_samples = - context.Attr("use_customized_samples"); - const bool remove_accidental_hits = - context.Attr("remove_accidental_hits"); - - // device contexts - auto& dev_ctx = context.template device_context(); - - // UNDERSTAND: allocate memories for temporaries - sampled_logits->mutable_data(samples_dim, context.GetPlace()); - auto sampled_labels_data = - sampled_labels->mutable_data(labels_dim, context.GetPlace()); - for (int i = 0; i < batch_size; ++i) { - for (int j = 0; j < num_true; ++j) { - sampled_labels_data[i * num_true + j] = j; - } - } - - if (use_customized_samples) { - const phi::DenseTensor* customized_samples = - context.Input("CustomizedSamples"); - const phi::DenseTensor* customized_probabilities = - context.Input("CustomizedProbabilities"); - PADDLE_ENFORCE_EQ( - customized_samples, - samples, - 
platform::errors::InvalidArgument( - "CustomizedSamples must be the same phi::DenseTensor with " - "Samples when use_customized_samples = True")); - PADDLE_ENFORCE_EQ( - customized_probabilities, - probabilities, - platform::errors::InvalidArgument( - "CustomizedProbabilities must be the same phi::DenseTensor with " - "Probabilities when use_customized_samples = True")); - } else { - samples->mutable_data(context.GetPlace()); - probabilities->mutable_data(samples_dim, context.GetPlace()); - // UNDERSTAND: sampling - const auto seed = context.Attr("seed"); - auto sampler_with_prob = math::SampleWithProb(); - sampler_with_prob(dev_ctx, - math::LogUniformSampler(num_classes, seed), - num_samples, - labels, - samples, - probabilities); - } - - // UNDERSTAND: gather sampled logits and remove accidental hits if needed - CPUTakeAlongD1(dev_ctx, *logits, *samples, sampled_logits); - if (remove_accidental_hits) { - compute_remove_accidental_hits( - dev_ctx, sampled_logits, *samples, num_true); - } - - // subtracted sampled logits with logQ(y|x) - auto probs = EigenMatrix::From(*probabilities); - auto smp_logits = EigenMatrix::From(*sampled_logits); - smp_logits.device(*dev_ctx.eigen_device()) = - (smp_logits - probs.log().unaryExpr(TolerableValue())) - .unaryExpr(TolerableValue()); - } -}; - -template -class SampleLogitsGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto logits_grad = - context.Output(framework::GradVarName("Logits")); - const phi::DenseTensor* samples = - context.Input("Samples"); - const phi::DenseTensor* sampled_logits_grad = - context.Input( - framework::GradVarName("SampledLogits")); - logits_grad->mutable_data(context.GetPlace()); - - auto& dev_ctx = context.template device_context(); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, logits_grad, static_cast(0)); - - // UNDERSTAND: scatter it back to logit_grad - CPUPutAlongD1(dev_ctx, logits_grad, *samples, *sampled_logits_grad); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake index 4c8bff3a899f8..8f4b64125479a 100644 --- a/paddle/fluid/operators/unity_build_rule.cmake +++ b/paddle/fluid/operators/unity_build_rule.cmake @@ -173,7 +173,6 @@ register_unity_group( matmul_v2_op.cc) register_unity_group( cc - mean_iou_op.cc mean_op.cc minus_op.cc mish_op.cc @@ -223,7 +222,6 @@ register_unity_group( roi_align_op.cc roll_op.cc run_program_op.cc - sample_logits_op.cc sampling_id_op.cc save_combine_op.cc save_op.cc @@ -440,7 +438,6 @@ register_unity_group( softmax_with_cross_entropy_op.cu) register_unity_group( cu - mean_iou_op.cu mean_op.cu minus_op.cu mish_op.cu @@ -468,7 +465,6 @@ register_unity_group( cu roi_align_op.cu roll_op.cu - sample_logits_op.cu sampling_id_op.cu save_combine_op.cu save_op.cu diff --git a/test/legacy_test/test_mean_iou.py b/test/legacy_test/test_mean_iou.py deleted file mode 100644 index f50a8beb010f9..0000000000000 --- a/test/legacy_test/test_mean_iou.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-from op_test import OpTest
-
-
-def compute_mean_iou(
-    predictions, labels, num_classes, in_wrongs, in_corrects, in_mean_ious
-):
-    assert predictions.shape == labels.shape
-    predictions = predictions.flatten()
-    labels = labels.flatten()
-
-    out_wrong = np.zeros([num_classes]).astype("int32")
-    for _, wrong in in_wrongs:
-        out_wrong += wrong
-    out_correct = np.zeros([num_classes]).astype("int32")
-    for _, correct in in_corrects:
-        out_correct += correct
-
-    for pred, label in zip(predictions, labels):
-        if pred == label:
-            out_correct[pred] += 1
-        else:
-            out_wrong[pred] += 1
-            out_wrong[label] += 1
-
-    denominator = out_wrong + out_correct
-    valid_count = (denominator != 0).sum()
-    denominator = np.where(
-        denominator > 0, denominator, np.ones(denominator.shape)
-    )
-    mean_iou = (out_correct / denominator).sum() / valid_count
-
-    for _, in_mean_iou in in_mean_ious:
-        mean_iou += float(in_mean_iou)
-    return mean_iou, out_wrong, out_correct
-
-
-class TestMeanIOUOp(OpTest):
-    def setUp(self):
-        self.config()
-        self.op_type = "mean_iou"
-        predictions = np.random.randint(
-            0, self.num_classes, self.image_size
-        ).astype("int32")
-        labels = np.random.randint(0, self.num_classes, self.image_size).astype(
-            "int32"
-        )
-
-        in_wrongs = []
-        for i in range(self.in_wrong_num):
-            in_wrongs.append(
-                (
-                    "in_wrong_%d" % i,
-                    np.random.randint(0, 10, [self.num_classes]).astype(
-                        "int32"
-                    ),
-                )
-            )
-
-        in_corrects = []
-        for i in range(self.in_correct_num):
-            in_corrects.append(
-                (
-                    "in_correct_%d" % i,
-                    np.random.randint(0, 10, [self.num_classes]).astype(
-                        "int32"
-                    ),
-                )
-            )
-
-        self.inputs = {
-            'Predictions': predictions,
-            'Labels': labels,
-            'InWrongs': in_wrongs,
-            'InCorrects': in_corrects,
-            'InMeanIou': self.in_mean_ious,
-        }
-        self.attrs = {'num_classes': int(self.num_classes)}
-        mean_iou, out_wrong, out_correct = compute_mean_iou(
-            predictions,
-            labels,
-            self.num_classes,
-            in_wrongs,
-            in_corrects,
-            self.in_mean_ious,
-        )
-        self.outputs = {
-            'OutMeanIou': mean_iou,
-            'OutWrong': out_wrong,
-            'OutCorrect': out_correct,
-        }
-
-    def config(self):
-        self.num_classes = 10
-        self.image_size = [128, 128]
-        self.in_wrong_num = 0
-        self.in_correct_num = 0
-        self.in_mean_ious = []
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestCase1(TestMeanIOUOp):
-    def config(self):
-        self.num_classes = 5
-        self.image_size = [100, 128]
-        self.in_wrong_num = 2
-        self.in_correct_num = 2
-        self.in_mean_ious = []
-        for i in range(2):
-            self.in_mean_ious.append(
-                (
-                    "in_mean_iou_%d" % i,
-                    np.random.uniform(0, 1, []).astype("float32"),
-                )
-            )
-
-    # NOTE(dev): Skip check_dygraph because the Python API doesn't expose
-    # the in_wrong_num/in_correct_num/in_mean_iou_num arguments
-    def test_check_output(self):
-        self.check_output(check_dygraph=False)
-
-
-class TestCase2(TestCase1):
-    def config(self):
-        self.num_classes = 5
-        self.image_size = [100, 128]
-        self.in_wrong_num = 2
-        self.in_correct_num = 2
-        self.in_mean_ious = []
-        for i in range(2):
-            self.in_mean_ious.append(
-                (
-                    "in_mean_iou_%d" % i,
-                    np.random.uniform(0, 1, 
[1]).astype("float32"), - ) - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_proximal_adagrad_op.py b/test/legacy_test/test_proximal_adagrad_op.py deleted file mode 100644 index 45d25d3a21350..0000000000000 --- a/test/legacy_test/test_proximal_adagrad_op.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest - - -class TestProximalAdagradOp(OpTest): - def setUp(self): - self.op_type = "proximal_adagrad" - w = np.random.random((102, 105)).astype("float32") - m = np.random.random((102, 105)).astype("float32") - g = np.random.random((102, 105)).astype("float32") - lr = np.array([0.1]).astype("float32") - l1 = 0.1 - l2 = 0.2 - - self.inputs = {'Param': w, 'Grad': g, 'Moment': m, 'LearningRate': lr} - self.attrs = {'l1': l1, 'l2': l2} - param_out = 0.0 - - moment_out = m + g * g - prox_param = w - lr * g / np.sqrt(moment_out) - if l1 > 0.0: - x = np.abs(prox_param) - lr * l1 - x[x < 0] = 0 - param_out = np.sign(prox_param) * (x / (1.0 + lr * l2)) - else: - param_out = prox_param / (1.0 + lr * l2) - - self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out} - - def test_check_output(self): - self.check_output() - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_sample_logits_op.py b/test/legacy_test/test_sample_logits_op.py deleted file mode 100644 index 64c70b5a8a07c..0000000000000 --- a/test/legacy_test/test_sample_logits_op.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
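(Context for the deleted test below, not part of the patch: sample_logits gathers the logits of the true and sampled classes and subtracts log Q(y|x), the log of each sample's proposal probability. A minimal numpy sketch with assumed shapes and made-up values:)

    import numpy as np
    bs, K, NT, S = 2, 20, 10, 5                      # batch, classes, true labels, negative samples
    logits = np.random.rand(bs, K)
    samples = np.random.randint(0, K, (bs, NT + S))  # NT true label ids followed by S sampled ids
    q = np.random.uniform(0.1, 1.0, (bs, NT + S))    # sampling probabilities Q(y|x)
    sampled_logits = np.take_along_axis(logits, samples, axis=1) - np.log(q)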
- -import collections -import unittest - -import numpy as np -from op_test import OpTest - - -class TestSampleLogitsOp(OpTest): - def setUp(self): - self.op_type = "sample_logits" - self.dtype = np.float64 - self.use_mkldnn = False - bs = 2 - K = 20 - NT = 10 - S = 5 - - Samples = np.random.random([bs, NT + S]).astype('int64') - Probabilities = np.random.random([bs, NT + S]).astype('float64') - LogitsDim = np.array([bs, K], dtype=np.int64) - LabelsDim = np.array([bs, NT], dtype=np.int64) - SampledLogits = np.random.random([bs, NT + S]).astype('float64') - SampledLabels = np.random.random([bs, NT]).astype('int64') - - self.bs = bs - self.K = K - self.NT = NT - self.S = S - Labels = np.array(list(range(self.NT)) * self.bs).astype('int64') - self.Labels = Labels.reshape(self.bs, -1) - self.Logits = np.random.random([self.bs, self.K]).astype('float64') - - self.inputs = {"Logits": self.Logits, "Labels": self.Labels} - self.fetch_list = [ - 'Samples', - 'Probabilities', - 'SampledLogits', - 'SampledLabels', - ] - self.outputs = collections.OrderedDict( - ( - ('Samples', Samples), - ('Probabilities', Probabilities), - ('LogitsDim', LogitsDim), - ('LabelsDim', LabelsDim), - ('SampledLogits', SampledLogits), - ('SampledLabels', SampledLabels), - ) - ) - - self.attrs = {'num_samples': self.S} - - def test_check_output(self): - places = self._get_places() - for p in places: - (Samples, Probabilities, SampledLogits, SampledLabels) = ( - np.array(o) for o in self.calc_output(p) - ) - - assert ( - Samples.dtype == np.int64 - ), f"Samples dtype is {Samples.dtype}, not int64" - assert ( - Probabilities.dtype == np.float64 - ), f"Probabilities dtype is {Probabilities.dtype}, not float64" - assert ( - SampledLogits.dtype == np.float64 - ), f"SampledLogits dtype is {SampledLogits.dtype}, not float64" - assert ( - SampledLabels.dtype == np.int64 - ), f"SampledLabels dtype is {SampledLabels.dtype}, not int64" - - assert Samples.shape == (self.bs, self.NT + self.S) - assert Probabilities.shape == (self.bs, self.NT + self.S) - assert SampledLogits.shape == (self.bs, self.NT + self.S) - assert SampledLabels.shape == (self.bs, self.NT) - - assert (SampledLabels == self.Labels).all() - sampled_logits = self.Logits[:, Samples[0][: self.NT]] - sampled_logits -= np.log(Probabilities[:, : self.NT]) - np.testing.assert_almost_equal( - sampled_logits, SampledLogits[:, : self.NT] - ) - - def test_check_grad(self): - self._check_grad_helper() - for p in self._get_places(): - grads = self._get_gradient(['Logits'], p, ['SampledLogits'], []) - np.testing.assert_almost_equal(grads[0].sum(), np.array([1.0])) - - -class TestSampleLogitsOpNoUniq(TestSampleLogitsOp): - def setUp(self): - super().setUp() - self.attrs = {'num_samples': self.S, 'uniq': False} - - -class TestSampleLogitsOpWithAccidentalHits(TestSampleLogitsOp): - def setUp(self): - super().setUp() - self.attrs = {'num_samples': self.S, 'remove_accidental_hits': False} - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_sigmoid_focal_loss_op.py b/test/legacy_test/test_sigmoid_focal_loss_op.py deleted file mode 100644 index efe1922165fb4..0000000000000 --- a/test/legacy_test/test_sigmoid_focal_loss_op.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import math -import unittest - -import numpy as np -from op_test import OpTest - -import paddle - - -def sigmoid_focal_loss_forward( - x_data, label_data, fg_num_data, gamma, alpha, num_classes -): - x_data_t = copy.deepcopy(x_data) - out_data = copy.deepcopy(x_data) - x_width = len(x_data) - x_height = len(x_data[0, :]) - x_data_t = x_data_t.flatten() - out_data = out_data.flatten() - for idx in range(len(x_data_t)): - x = x_data_t[idx] - a = int(idx / num_classes) - d = int(idx % num_classes) - label = label_data[a] - c_pos = float(int(label) == int(d + 1)) - c_neg = float((int(label) != -1) & (int(label) != (d + 1))) - fg_num = max(fg_num_data, 1) - z_neg = (1.0 - alpha) / fg_num - z_pos = alpha / fg_num - - p = 1.0 / (1.0 + math.exp(-x)) - FLT_MIN = 1.175494351e-38 - term_pos = math.pow((1.0 - p), gamma) * math.log(max(FLT_MIN, p)) - term_neg = math.pow(p, gamma) * ( - -1.0 * x * (x >= 0) - - math.log(1.0 + math.exp(x - 2.0 * x * (x >= 0))) - ) - out_data[idx] = 0.0 - out_data[idx] += -c_pos * term_pos * z_pos - out_data[idx] += -c_neg * term_neg * z_neg - - out_data = out_data.reshape(x_width, x_height) - return out_data - - -class TestSigmoidFocalLossOp1(OpTest): - def set_argument(self): - self.num_anchors = 10 - self.num_classes = 10 - self.gamma = 2.0 - self.alpha = 0.25 - - def setUp(self): - self.set_argument() - - dims = (self.num_anchors, self.num_classes) - X = np.random.standard_normal(dims).astype("float64") - L = np.random.randint(0, self.num_classes + 1, (dims[0], 1)).astype( - "int32" - ) - F = np.zeros(1) - F[0] = len(np.where(L > 0)[0]) - F = F.astype("int32") - - self.op_type = "sigmoid_focal_loss" - self.inputs = { - 'X': X, - 'Label': L, - 'FgNum': F, - } - self.attrs = { - 'gamma': self.gamma, - 'alpha': self.alpha, - } - loss = sigmoid_focal_loss_forward( - self.inputs['X'], - self.inputs['Label'], - self.inputs['FgNum'], - self.gamma, - self.alpha, - self.num_classes, - ) - self.outputs = {'Out': loss.astype('float64')} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - -@unittest.skipIf( - not paddle.is_compiled_with_cuda(), "core is not compiled with CUDA" -) -class TestSigmoidFocalLossOp2(TestSigmoidFocalLossOp1): - def test_check_output(self): - place = paddle.CUDAPlace(0) - self.check_output_with_place(place, atol=2e-3) - - def test_check_grad(self): - place = paddle.CUDAPlace(0) - self.check_grad_with_place( - place, ['X'], 'Out', max_relative_error=0.002 - ) - - -class TestSigmoidFocalLossOp3(TestSigmoidFocalLossOp1): - def set_argument(self): - self.num_anchors = 200 - self.num_classes = 10 - self.gamma = 1.0 - self.alpha = 0.5 - - -@unittest.skipIf( - not paddle.is_compiled_with_cuda(), "core is not compiled with CUDA" -) -class TestSigmoidFocalLossOp4(TestSigmoidFocalLossOp3): - def test_check_output(self): - place = paddle.CUDAPlace(0) - self.check_output_with_place(place, atol=2e-3) - - def test_check_grad(self): - place = paddle.CUDAPlace(0) - self.check_grad_with_place( - place, ['X'], 'Out', max_relative_error=0.002 - ) - - -if __name__ == '__main__': - 
unittest.main() diff --git a/test/legacy_test/test_target_assign_op.py b/test/legacy_test/test_target_assign_op.py deleted file mode 100644 index 98369d62247df..0000000000000 --- a/test/legacy_test/test_target_assign_op.py +++ /dev/null @@ -1,195 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random -import unittest - -import numpy as np -from op_test import OpTest - - -def gen_match_and_neg_indices(num_prior, gt_lod, neg_lod): - if len(gt_lod) != len(neg_lod): - raise AssertionError("The input arguments are illegal.") - - batch_size = len(gt_lod) - - match_indices = -1 * np.ones((batch_size, num_prior)).astype('int32') - neg_indices = np.zeros((sum(neg_lod), 1)).astype('int32') - - offset = 0 - for n in range(batch_size): - gt_num = gt_lod[n] - ids = random.sample(list(range(num_prior)), gt_num) - match_indices[n, ids] = list(range(gt_num)) - - ret_ids = set(range(num_prior)) - set(ids) - l = neg_lod[n] - neg_ids = random.sample(ret_ids, l) - neg_indices[offset : offset + neg_lod[n], :] = ( - np.array(neg_ids).astype('int32').reshape(l, 1) - ) - offset += neg_lod[n] - - return match_indices, neg_indices - - -def target_assign( - encoded_box, - gt_label, - match_indices, - neg_indices, - gt_lod, - neg_lod, - mismatch_value, -): - batch_size, num_prior = match_indices.shape - - # init target bbox - trg_box = np.zeros((batch_size, num_prior, 4)).astype('float32') - # init weight for target bbox - trg_box_wt = np.zeros((batch_size, num_prior, 1)).astype('float32') - # init target label - trg_label = np.ones((batch_size, num_prior, 1)).astype('int32') - trg_label = trg_label * mismatch_value - # init weight for target label - trg_label_wt = np.zeros((batch_size, num_prior, 1)).astype('float32') - - gt_offset = 0 - neg_offset = 0 - for i in range(batch_size): - cur_indices = match_indices[i] - col_ids = np.where(cur_indices > -1) - col_val = cur_indices[col_ids] - - # target bbox - for v, c in zip(col_val + gt_offset, col_ids[0].tolist()): - trg_box[i][c][:] = encoded_box[v][c][:] - # weight for target bbox - trg_box_wt[i][col_ids] = 1.0 - - trg_label[i][col_ids] = gt_label[col_val + gt_offset] - trg_label_wt[i][col_ids] = 1.0 - # set target label weight to 1.0 for the negative samples - if neg_indices is not None: - neg_ids = neg_indices[neg_offset : neg_offset + neg_lod[i]] - trg_label_wt[i][neg_ids] = 1.0 - # update offset - gt_offset += gt_lod[i] - neg_offset += neg_lod[i] - - return trg_box, trg_box_wt, trg_label, trg_label_wt - - -class TestTargetAssginFloatType(OpTest): - def setUp(self): - self.op_type = "target_assign" - num_prior = 120 - num_class = 21 - gt_lod = [5, 6, 12] - neg_lod = [4, 3, 6] - mismatch_value = 0 - batch_size = len(gt_lod) - num_gt = sum(gt_lod) - - encoded_box = np.random.random((num_gt, num_prior, 4)).astype('float32') - gt_label = np.random.randint(num_class, size=(num_gt, 1)).astype( - 'int32' - ) - - match_indices, neg_indices = gen_match_and_neg_indices( - num_prior, gt_lod, 
neg_lod
-        )
-
-        out, out_wt, _, _ = target_assign(
-            encoded_box,
-            gt_label,
-            match_indices,
-            neg_indices,
-            gt_lod,
-            neg_lod,
-            mismatch_value,
-        )
-
-        # assign regression targets
-        x = encoded_box
-        self.inputs = {
-            'X': (x, [gt_lod]),
-            'MatchIndices': match_indices,
-        }
-        self.attrs = {'mismatch_value': mismatch_value}
-        self.outputs = {
-            'Out': out,
-            'OutWeight': out_wt,
-        }
-
-    def test_check_output(self):
-        # NOTE(yjjiang11): This op will be deprecated.
-        self.check_output(check_dygraph=False)
-
-
-class TestTargetAssginIntType(OpTest):
-    def setUp(self):
-        self.op_type = "target_assign"
-        num_prior = 120
-        num_class = 21
-        gt_lod = [5, 6, 12]
-        neg_lod = [4, 3, 6]
-        mismatch_value = 0
-        batch_size = len(gt_lod)
-        num_gt = sum(gt_lod)
-
-        encoded_box = np.random.random((num_gt, num_prior, 4)).astype('float32')
-        gt_label = np.random.randint(num_class, size=(num_gt, 1)).astype(
-            'int32'
-        )
-
-        match_indices, neg_indices = gen_match_and_neg_indices(
-            num_prior, gt_lod, neg_lod
-        )
-
-        (
-            _,
-            _,
-            out,
-            out_wt,
-        ) = target_assign(
-            encoded_box,
-            gt_label,
-            match_indices,
-            neg_indices,
-            gt_lod,
-            neg_lod,
-            mismatch_value,
-        )
-
-        # assign classification targets
-        x = np.reshape(gt_label, (num_gt, 1, 1))
-        self.inputs = {
-            'X': (x, [gt_lod]),
-            'MatchIndices': match_indices,
-            'NegIndices': (neg_indices, [neg_lod]),
-        }
-        self.attrs = {'mismatch_value': mismatch_value}
-        self.outputs = {
-            'Out': out,
-            'OutWeight': out_wt,
-        }
-
-    def test_check_output(self):
-        self.check_output(check_dygraph=False)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py
index bce8ec2ef55c0..dbfdb276cf01a 100755
--- a/tools/parallel_UT_rule.py
+++ b/tools/parallel_UT_rule.py
@@ -819,7 +819,6 @@
     'test_fit_a_line',
     'test_mish_op',
     'test_transpose_op',
-    'test_mean_iou',
     'test_conv3d_transpose_op',
     'test_jit_save_load',
     'test_unsqueeze2_op',
@@ -891,7 +890,6 @@
     'test_cross_op',
     'concat_test',
     'test_ast_util',
-    'test_proximal_adagrad_op',
     'test_pairwise_distance',
     'test_imperative_mnist',
     'test_beam_search_decoder',
@@ -997,7 +995,6 @@
     'test_unstack_op',
     'test_increment',
     'strided_memcpy_test',
-    'test_target_assign_op',
     'test_trt_dynamic_shape_transformer_prune',
     'test_box_decoder_and_assign_op',
     'test_trt_dynamic_shape',
@@ -1188,7 +1185,6 @@
     'test_imperative_optimizer',
     'test_subtract_op',
     'test_conv_transpose_nn_grad',
-    'test_sigmoid_focal_loss_op',
     'test_cuda_stream_event',
     'test_sequence_pad_op',
     'test_rnn_cells',
@@ -1456,7 +1452,6 @@
     'test_generate_proposals_v2_op',
     'test_graph',
     'test_gelu_op',
-    'test_sample_logits_op',
     'test_weight_normalization',
     'test_activation_bf16_mkldnn_op',
     'trt_dynamic_shape_test',
@@ -2581,7 +2576,6 @@
     'test_logical_op',
     'test_imperative_deepcf',
     'test_cholesky_op',
-    'test_sample_logits_op',
     'test_ir_fc_fuse_pass',
     'test_fleet_base_single',
     'test_multiprocess_dataloader_iterable_dataset_dynamic',
@@ -2644,7 +2638,6 @@
     'test_auc_single_pred_op',
     'test_instance_norm_op_v2',
     'test_softmax_bf16_mkldnn_op',
-    'test_mean_iou',
     'test_sequence_slice_op',
     'test_polygon_box_transform',
     'test_sequence_pad_op',
@@ -2689,7 +2682,6 @@
     'test_bicubic_interp_op',
     'test_spp_op',
     'test_callbacks',
-    'test_sigmoid_focal_loss_op',
     'test_sequence_unpad_op',
     'test_conv1d_transpose_layer',
     'test_sequence_pool',
@@ -2846,7 +2838,6 @@
     'test_diag',
     'test_strided_slice_op',
     'test_switch_case',
-    'test_target_assign_op',
     'test_isfinite_op',
     'test_conv_elementwise_add_act_fuse_pass',
     'test_unbind_op',
@@ -2888,7 
+2879,6 @@ 'test_dequantize_log_op', 'test_mkldnn_batch_norm_act_fuse_pass', 'test_imperative_skip_op', - 'test_proximal_adagrad_op', 'test_conv2d_transpose_mkldnn_op', 'test_imperative_optimizer', 'test_assign_value_op', diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 80d14655af003..d60c9255f69a7 100755 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -317,7 +317,6 @@ 'test_matmul_op', 'test_matmul_v2_op', 'test_matrix_nms_op', - 'test_mean_iou', 'test_memory_reuse_exclude_feed_var', 'test_memory_usage', 'test_merge_ids_op', @@ -389,7 +388,6 @@ 'test_program_prune_backward', 'test_program_to_string', 'test_protobuf_descs', - 'test_proximal_adagrad_op', 'test_proximal_gd_op', 'test_prroi_pool_op', 'test_prune', @@ -432,7 +430,6 @@ 'test_rpn_target_assign_op', 'test_run_program_op', 'test_runtime_and_compiletime_exception', - 'test_sample_logits_op', 'test_save_model_without_var', 'test_scale_op', 'test_scale_mkldnn_op', @@ -450,7 +447,6 @@ 'test_shuffle_batch_op', 'test_shuffle_channel_op', 'test_sigmoid_cross_entropy_with_logits_op', - 'test_sigmoid_focal_loss_op', 'test_sign_op', 'test_similarity_focus_op', 'test_size_op', @@ -472,7 +468,6 @@ 'test_sum_op', 'test_switch', 'test_switch_case', - 'test_target_assign_op', 'test_tdm_child_op', 'test_tdm_sampler_op', 'test_teacher_student_sigmoid_loss_op', From 1e3212fb4a7cb2fabaf5a5fd2539567c553b4825 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Tue, 10 Oct 2023 09:50:01 +0800 Subject: [PATCH 47/62] [PIR] Support if op exe (#57801) * add * add * fix * fix * refine * delete sub_blocks * refine * refien * add ut * fix --- .../new_executor/instruction/CMakeLists.txt | 2 +- .../instruction/cond_instruction.cc | 173 +++------ .../instruction/cond_instruction.h | 28 +- .../instruction/instruction_base.cc | 32 +- .../instruction/instruction_base.h | 10 +- .../instruction/instruction_util.cc | 73 ++-- .../instruction/instruction_util.h | 11 +- .../instruction/legacy_kernel_instruction.cc | 38 +- .../instruction/legacy_kernel_instruction.h | 18 +- .../instruction/phi_kernel_instruction.cc | 41 +- .../instruction/phi_kernel_instruction.h | 18 +- .../interpreter/interpreter_util.cc | 90 ----- .../interpreter/interpreter_util.h | 9 - .../new_executor/new_ir_interpreter.cc | 43 +-- .../new_executor/new_ir_interpreter.h | 6 +- .../pir_adaptor/pir_adaptor_util.cc | 359 ++++++++++++------ .../pir_adaptor/pir_adaptor_util.h | 175 +++++---- .../pir/transforms/pd_op_to_kernel_pass.cc | 68 +++- test/legacy_test/test_cond.py | 9 + 19 files changed, 575 insertions(+), 628 deletions(-) diff --git a/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt b/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt index 0623499975b6f..64c14374162c6 100644 --- a/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt @@ -2,7 +2,7 @@ cc_library( instruction_base SRCS instruction_base.cc phi_kernel_instruction.cc legacy_kernel_instruction.cc cond_instruction.cc instruction_util.cc - DEPS phi framework_proto) + DEPS pir_adaptor phi framework_proto) if(WITH_CINN AND NOT CINN_ONLY) cc_library( diff --git a/paddle/fluid/framework/new_executor/instruction/cond_instruction.cc b/paddle/fluid/framework/new_executor/instruction/cond_instruction.cc index 8c89800dd2d95..7a037f0983c64 100644 --- a/paddle/fluid/framework/new_executor/instruction/cond_instruction.cc +++ 
b/paddle/fluid/framework/new_executor/instruction/cond_instruction.cc @@ -48,33 +48,23 @@ std::vector GetYiedOpInputs(pir::Block* block) { } } } - return vec_res; } -void GetInputIds( - pir::Operation* op, - Scope* inner_scope, - const std::unordered_map<::pir::Value, std::string>& value_2_var_name, - const std::map& var_name_2_id, - const std::unordered_map& - variable_2_var_name, - std::unordered_map>* input_ids) { +void GetInputIds(pir::Operation* op, + const ValueExecutionInfo& value_exec_info, + std::unordered_map>* input_ids) { for (size_t i = 0; i < op->num_operands(); i++) { pir::Value value = op->operand_source(i); - if (value) { - PADDLE_ENFORCE_NE( - value_2_var_name.find(value), - value_2_var_name.end(), + if (value && value.type()) { + PADDLE_ENFORCE_EQ( + value_exec_info.HasValue(value), + true, phi::errors::PreconditionNotMet( "input should in name map, [%d] 'th input of [%s] op", i, "if op")); - std::vector inputs_id = GetValueIds(value, - inner_scope, - value_2_var_name, - var_name_2_id, - variable_2_var_name); + std::vector inputs_id = GetValueIds(value, value_exec_info); input_ids->emplace(value, inputs_id); } } @@ -82,11 +72,7 @@ void GetInputIds( void GetOutsideOpInputs( pir::Block* block, - Scope* inner_scope, - const std::unordered_map<::pir::Value, std::string>& value_2_var_name, - const std::map& var_name_2_id, - const std::unordered_map& - variable_2_var_name, + const ValueExecutionInfo& value_exec_info, std::unordered_map>* input_ids) { std::unordered_set inner_outputs; for (auto op : (*block)) { @@ -99,18 +85,14 @@ void GetOutsideOpInputs( for (size_t i = 0; i < op->num_operands(); ++i) { pir::Value value = op->operand_source(i); if (value && (!inner_outputs.count(value))) { - PADDLE_ENFORCE_NE( - value_2_var_name.find(value), - value_2_var_name.end(), + PADDLE_ENFORCE_EQ( + value_exec_info.HasValue(value), + true, phi::errors::PreconditionNotMet( "input should in name map, [%d] 'th input of [%s] op", i, - "if op")); - std::vector inputs_id = GetValueIds(value, - inner_scope, - value_2_var_name, - var_name_2_id, - variable_2_var_name); + op->name())); + std::vector inputs_id = GetValueIds(value, value_exec_info); input_ids->emplace(value, inputs_id); } @@ -118,142 +100,113 @@ void GetOutsideOpInputs( } } -CondInstruction::CondInstruction( - size_t id, - const platform::Place& place, - pir::Operation* op, - Scope* scope, - Scope* local_scope, - ValueExecutionInfo* parent_exe_info, - const std::map& sub_blocks) +CondInstruction::CondInstruction(size_t id, + const platform::Place& place, + pir::Operation* op, + ValueExecutionInfo* value_exec_info) : InstructionBase(id, place) { - op_ = op; - VLOG(6) << "finish process dist attributes"; - - SetKernelType(AnalyseOpFuncType(op, place)); - VLOG(6) << "finish process analyse kernel type"; - - Scope* inner_scope = local_scope == nullptr ? 
scope : local_scope; - - VLOG(6) << "finish process inputs outputs index"; - PADDLE_ENFORCE( op->isa(), phi::errors::PreconditionNotMet("Cond instruction only support if op")); - auto if_op = op->dyn_cast(); + op_ = op; - for (size_t i = 0; i < if_op.num_results(); ++i) { - if_op_outputs_.push_back(inner_scope->GetVar( - parent_exe_info->GetValue2VarName().at(if_op.result(i)))); - } + SetKernelType(AnalyseOpFuncType(op, place)); + VLOG(6) << "finish process analyse kernel type"; auto cond_value = if_op.operand_source(0); - auto var_name = parent_exe_info->GetValue2VarName().at(cond_value); - cond_var = inner_scope->FindVar(var_name); + cond_var_ = value_exec_info->GetScope()->FindVar( + value_exec_info->GetValue2VarName().at(cond_value)); + for (size_t i = 0; i < if_op.num_results(); ++i) { + output_vars_.push_back(value_exec_info->GetScope()->GetVar( + value_exec_info->GetValue2VarName().at(if_op.result(i)))); + } + VLOG(6) << "finish process cond_var and output_vars"; auto true_branch_block = if_op.true_block(); - auto false_branch_block = if_op.false_block(); - auto true_branch_yied_inputs = GetYiedOpInputs(true_branch_block); - auto false_branch_yied_inputs = GetYiedOpInputs(false_branch_block); - - auto true_scope = sub_blocks.at(true_branch_block); - true_branch_inter = + Scope* true_scope = &(value_exec_info->GetScope()->NewScope()); + true_branch_inter_ = new NewIRInterpreter(place, {}, true_branch_block, true_scope, - parent_exe_info->NewChild(true_scope), + value_exec_info->NewChild(true_scope), {}); std::set true_skip_gc_names_set; for (auto value : true_branch_yied_inputs) { - true_skip_gc_names_.push_back(true_branch_inter->GetNameByValue(value)); - true_skip_gc_names_set.insert(true_branch_inter->GetNameByValue(value)); + true_skip_gc_names_.push_back(true_branch_inter_->GetNameByValue(value)); + true_skip_gc_names_set.insert(true_branch_inter_->GetNameByValue(value)); } - true_branch_inter->SetSkipGcVars(true_skip_gc_names_set); + true_branch_inter_->SetSkipGcVars(true_skip_gc_names_set); + VLOG(6) << "finish process true branch interpreter"; - auto false_scope = sub_blocks.at(false_branch_block); - false_branch_inter = + auto false_branch_block = if_op.false_block(); + auto false_branch_yied_inputs = GetYiedOpInputs(false_branch_block); + Scope* false_scope = &(value_exec_info->GetScope()->NewScope()); + false_branch_inter_ = new NewIRInterpreter(place, {}, false_branch_block, false_scope, - parent_exe_info->NewChild(false_scope), + value_exec_info->NewChild(false_scope), {}); std::set false_skip_gc_names_set; for (auto value : false_branch_yied_inputs) { - false_skip_gc_names_.push_back(false_branch_inter->GetNameByValue(value)); - false_skip_gc_names_set.insert(false_branch_inter->GetNameByValue(value)); + false_skip_gc_names_.push_back(false_branch_inter_->GetNameByValue(value)); + false_skip_gc_names_set.insert(false_branch_inter_->GetNameByValue(value)); } - false_branch_inter->SetSkipGcVars(false_skip_gc_names_set); - - // the true branch and false branch input will be the if_op inputs + false_branch_inter_->SetSkipGcVars(false_skip_gc_names_set); + VLOG(6) << "finish process false branch interpreter"; + // NOTE(zhangbo): IfOp sub_block's inputs include two kind of value: one is + // OpOperand of IfOp, and the other is external Values used in true_block or + // false_block. 
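  // Reading aid, not part of the patch: the three calls below gather this
  // instruction's inputs as the union of the IfOp's own operands and every
  // value defined outside the two sub-blocks but consumed inside them.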
std::unordered_map> inputs; - GetInputIds(op, - inner_scope, - parent_exe_info->GetValue2VarName(), - parent_exe_info->GetVarName2Id(), - parent_exe_info->GetVar2VarName(), - &inputs); - GetOutsideOpInputs(true_branch_block, - inner_scope, - parent_exe_info->GetValue2VarName(), - parent_exe_info->GetVarName2Id(), - parent_exe_info->GetVar2VarName(), - &inputs); - - GetOutsideOpInputs(false_branch_block, - inner_scope, - parent_exe_info->GetValue2VarName(), - parent_exe_info->GetVarName2Id(), - parent_exe_info->GetVar2VarName(), - &inputs); + GetInputIds(op, *value_exec_info, &inputs); + GetOutsideOpInputs(true_branch_block, *value_exec_info, &inputs); + GetOutsideOpInputs(false_branch_block, *value_exec_info, &inputs); SetInputs(inputs); std::unordered_map> outputs; for (size_t i = 0; i < op->num_results(); i++) { pir::Value value = op->result(i); if (value && value.type()) { - PADDLE_ENFORCE_NE( - parent_exe_info->GetValue2VarName().find(value), - parent_exe_info->GetValue2VarName().end(), + PADDLE_ENFORCE_EQ( + value_exec_info->HasValue(value), + true, phi::errors::PreconditionNotMet( "input should in name map, [%d] 'th input of [%s] op", i, "if op")); - std::vector outputs_id = - GetValueIds(value, - inner_scope, - parent_exe_info->GetValue2VarName(), - parent_exe_info->GetVarName2Id(), - parent_exe_info->GetVar2VarName()); + std::vector outputs_id = GetValueIds(value, *value_exec_info); outputs.emplace(value, outputs_id); } } SetOutputs(outputs); + VLOG(6) << "finish process inputs outputs index"; } void CondInstruction::CopyBranchOutput( const std::vector& var_names, const NewIRInterpreter* inter) { for (size_t i = 0; i < var_names.size(); ++i) { - auto* inner_var = inter->local_scope()->GetVar(var_names[i]); + auto* inner_var = inter->InnerScope()->GetVar(var_names[i]); - if_op_outputs_[i]->GetMutable()->ShareDataWith( + output_vars_[i]->GetMutable()->ShareDataWith( inner_var->Get()); } } void CondInstruction::Run() { - if (cond_var->Get().data()[0]) { - true_branch_inter->Run({}, false); - CopyBranchOutput(true_skip_gc_names_, true_branch_inter); + DeviceContext().Wait(); + if (cond_var_->Get().data()[0]) { + true_branch_inter_->Run({}, false); + CopyBranchOutput(true_skip_gc_names_, true_branch_inter_); } else { - false_branch_inter->Run({}, false); - CopyBranchOutput(false_skip_gc_names_, false_branch_inter); + false_branch_inter_->Run({}, false); + CopyBranchOutput(false_skip_gc_names_, false_branch_inter_); } // copy ouptut diff --git a/paddle/fluid/framework/new_executor/instruction/cond_instruction.h b/paddle/fluid/framework/new_executor/instruction/cond_instruction.h index 75eb7d0ece04f..974784dbe982a 100644 --- a/paddle/fluid/framework/new_executor/instruction/cond_instruction.h +++ b/paddle/fluid/framework/new_executor/instruction/cond_instruction.h @@ -29,14 +29,10 @@ class ValueExecutionInfo; class CondInstruction : public InstructionBase { public: - CondInstruction( - size_t id, - const platform::Place& place, - ::pir::Operation* op, - Scope* scope, - Scope* local_scope, - ValueExecutionInfo* parent_exe_info, - const std::map& sub_blocks); + CondInstruction(size_t id, + const platform::Place& place, + ::pir::Operation* op, + ValueExecutionInfo* value_exe_info); void Run() override; @@ -48,19 +44,23 @@ class CondInstruction : public InstructionBase { void CopyBranchOutput(const std::vector& var_names, const NewIRInterpreter* inter); + ::pir::Operation* op_; + std::string cond_name_{"cond_instruction"}; - Variable* cond_var; + Variable* cond_var_; + + std::vector 
output_vars_; - std::vector if_op_outputs_; + NewIRInterpreter* true_branch_inter_; - NewIRInterpreter* true_branch_inter; - NewIRInterpreter* false_branch_inter; + NewIRInterpreter* false_branch_inter_; + // TODO(zhangbo): Currently, only the output of IfOp is included. In the + // future, need to consider how to support IfGradOp using IfOp value. std::vector true_skip_gc_names_; - std::vector false_skip_gc_names_; - ::pir::Operation* op_; + std::vector false_skip_gc_names_; }; } // namespace framework diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc index a6d2f5a201b38..0b494c29dea86 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/framework/new_executor/instruction/instruction_util.h" #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" +#include "paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h" @@ -214,12 +215,7 @@ void InstructionBase::SetOutputs( } void InstructionBase::InitInputsOutputsIds( - ::pir::Operation* op, - Scope* inner_scope, - const std::unordered_map& value_2_var_name, - const std::map& var_name_2_id, - const std::unordered_map& - variable_2_var_name) { + ::pir::Operation* op, const ValueExecutionInfo& value_exec_info) { auto op_attributes = op->attributes(); auto op_name = op_attributes.at("op_name").dyn_cast().AsString(); @@ -227,18 +223,14 @@ void InstructionBase::InitInputsOutputsIds( for (size_t i = 0; i < op->num_operands(); i++) { pir::Value value = op->operand_source(i); if (value) { - PADDLE_ENFORCE_NE( - value_2_var_name.find(value), - value_2_var_name.end(), + PADDLE_ENFORCE_EQ( + value_exec_info.HasValue(value), + true, phi::errors::PreconditionNotMet( "input should in name map, [%d] 'th input of [%s] op", i, op_name)); - std::vector inputs_id = GetValueIds(value, - inner_scope, - value_2_var_name, - var_name_2_id, - variable_2_var_name); + std::vector inputs_id = GetValueIds(value, value_exec_info); inputs.emplace(value, inputs_id); } } @@ -248,18 +240,14 @@ void InstructionBase::InitInputsOutputsIds( for (size_t i = 0; i < op->num_results(); i++) { pir::Value value = op->result(i); if (value && value.type()) { - PADDLE_ENFORCE_NE( - value_2_var_name.find(value), - value_2_var_name.end(), + PADDLE_ENFORCE_EQ( + value_exec_info.HasValue(value), + true, phi::errors::PreconditionNotMet( "input should in name map, [%d] 'th input of [%s] op", i, op_name)); - std::vector outputs_id = GetValueIds(value, - inner_scope, - value_2_var_name, - var_name_2_id, - variable_2_var_name); + std::vector outputs_id = GetValueIds(value, value_exec_info); outputs.emplace(value, outputs_id); } } diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.h b/paddle/fluid/framework/new_executor/instruction/instruction_base.h index 7a77e8e8fae85..6079742611915 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.h +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.h @@ -28,6 +28,7 @@ class Value; namespace paddle { namespace framework { +class ValueExecutionInfo; using SchedulingPriority = int64_t; @@ -139,13 +140,8 @@ class InstructionBase { virtual ::pir::Operation* Operation() const = 0; - void 
InitInputsOutputsIds( - ::pir::Operation* op, - Scope* inner_scope, - const std::unordered_map<::pir::Value, std::string>& value_2_var_name, - const std::map& var_name_2_id, - const std::unordered_map& - variable_2_var_name); + void InitInputsOutputsIds(::pir::Operation* op, + const ValueExecutionInfo& value_exec_info); // if scope is not null, also show dimensions of arguments virtual std::string DebugStringEx( diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_util.cc b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc index dfafd44281537..a9791fefdbc38 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_util.cc +++ b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc @@ -40,22 +40,17 @@ PHI_DECLARE_bool(dynamic_static_unified_comm); namespace paddle { namespace framework { -std::vector GetValueIds( - pir::Value value, - Scope* inner_scope, - const std::unordered_map& value_2_var_name, - const std::map& var_name_2_id, - const std::unordered_map& - variable_2_var_name) { +std::vector GetValueIds(pir::Value value, + const ValueExecutionInfo& value_exec_info) { std::vector ids; - auto& var_name = value_2_var_name.at(value); - ids.push_back(var_name_2_id.at(var_name)); + ids.push_back(value_exec_info.GetVarId(value)); // NOTE(zhangbo): Value maybe a VariableRefArray - auto var = inner_scope->FindVar(var_name); + auto var = + value_exec_info.GetScope()->FindVar(value_exec_info.GetVarName(value)); if (var->IsType()) { auto& var_array = var->Get(); for (auto item : var_array) { - ids.push_back(var_name_2_id.at(variable_2_var_name.at(item))); + ids.push_back(value_exec_info.GetVarId(item)); } } return ids; @@ -147,42 +142,48 @@ OpFuncType AnalyseOpFuncType(pir::Operation* op, const platform::Place& place) { return OpFuncType::kCpuSync; } - auto kernel_key = op->attributes() - .at("kernel_key") - .dyn_cast() - .data(); - if (phi::TransToPhiPlace(kernel_key.backend()).GetType() == - phi::AllocationType::CPU) { - return OpFuncType::kCpuSync; - } - PADDLE_ENFORCE_EQ(interpreter::IsSupportedHeterPlace(place), true, phi::errors::Fatal("Unsupported current place %s", place)); + auto& op_attributes = op->attributes(); + + if ((op->dialect()->name() == "pd_kernel") && + (op_attributes.count("kernel_key") > 0)) { + auto kernel_key = op_attributes.at("kernel_key") + .dyn_cast() + .data(); + if (phi::TransToPhiPlace(kernel_key.backend()).GetType() == + phi::AllocationType::CPU) { + return OpFuncType::kCpuSync; + } + } + // Some GPU OPs do not launch CUDA Kernel, but spend a lot of time on CPU // computing. They execute serially in device thread and block CUDA kernel // launching in other GPU OPs. To improve performance, set them as kGpuSync // and so that they would be dispatched to host thread. 
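// NOTE: the rewritten branch below repeats one defensive pattern: check the
// dialect name and attributes().count() before any dyn_cast, so ops that come
// from other dialects can no longer trigger a bad cast. A minimal standalone
// sketch of that pattern, assuming only pir::Operation and pir::StrAttribute
// from pir/core (GetKernelOpName is an illustrative helper, not an existing
// Paddle API):
//
//   std::string GetKernelOpName(pir::Operation* op) {
//     const auto& attrs = op->attributes();
//     if (op->dialect()->name() == "pd_kernel" && attrs.count("op_name") > 0) {
//       return attrs.at("op_name").dyn_cast<pir::StrAttribute>().AsString();
//     }
//     return op->name();  // fall back to the raw op name for other dialects
//   }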
- auto& op_attributes = op->attributes(); - auto op_name = - op_attributes.at("op_name").dyn_cast().AsString(); - if (op_name == "pd_op.coalesce_tensor" && - (!platform::is_xpu_place(place) || - op->attribute("persist_output").data() == false) && - op->attribute("set_constant").data() == false && - op->attribute("copy_data").data() == false) { - return OpFuncType::kGpuSync; - } + if ((op->dialect()->name() == "pd_kernel") && + (op_attributes.count("op_name") > 0)) { + auto op_name = + op_attributes.at("op_name").dyn_cast().AsString(); + if (op_name == "pd_op.coalesce_tensor" && + (!platform::is_xpu_place(place) || + op->attribute("persist_output").data() == false) && + op->attribute("set_constant").data() == false && + op->attribute("copy_data").data() == false) { + return OpFuncType::kGpuSync; + } - // for memcpy explicitly called by user - if (platform::is_gpu_place(place) && op_name == "pd_op.memcpy_d2h") { - return OpFuncType::kGpuSync; - } + if (platform::is_gpu_place(place) && op_name == "pd_op.memcpy_d2h") { + return OpFuncType::kGpuSync; + } - if (op_name == "pd_op.shape") { - return OpFuncType::kGpuSync; + if (op_name == "pd_op.shape") { + return OpFuncType::kGpuSync; + } } + return OpFuncType::kGpuAsync; } diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_util.h b/paddle/fluid/framework/new_executor/instruction/instruction_util.h index c555a101d8366..dd1b98fa3dc15 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_util.h +++ b/paddle/fluid/framework/new_executor/instruction/instruction_util.h @@ -28,13 +28,10 @@ namespace paddle { namespace framework { -std::vector GetValueIds( - pir::Value value, - Scope* inner_scope, - const std::unordered_map<::pir::Value, std::string>& value_2_var_name, - const std::map& var_name_2_id, - const std::unordered_map& - variable_2_var_name); +class ValueExecutionInfo; + +std::vector GetValueIds(pir::Value value, + const ValueExecutionInfo& value_exec_info); platform::DeviceContext* ParseDeviceContext( pir::Operation* op, diff --git a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc index 748c7e603f7d7..97bda34777008 100644 --- a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc @@ -36,13 +36,8 @@ LegacyKernelInstruction::LegacyKernelInstruction( size_t id, const platform::Place& place, pir::Operation* op, - Scope* scope, - Scope* local_scope, - const std::unordered_map& value_2_var_name, - const std::map& var_name_2_id, - const std::unordered_map& - variable_2_var_name) - : InstructionBase(id, place) { + const ValueExecutionInfo& value_exec_info) + : InstructionBase(id, place), value_exec_info_(value_exec_info) { auto& op_attributes = op->attributes(); auto op_name = op_attributes.at("op_name").dyn_cast().AsString(); @@ -99,18 +94,13 @@ LegacyKernelInstruction::LegacyKernelInstruction( VLOG(6) << "finish process yaml_info_parser"; if (infer_meta_interface_) { - pir::BuildPhiContext< + BuildPhiContext< phi::InferMetaContext, phi::MetaTensor, phi::MetaTensor, paddle::small_vector, paddle::small_vector, - false>(op, - value_2_var_name, - scope, - local_scope, - yaml_info_parser, - &infer_meta_context_); + false>(op, value_exec_info_, yaml_info_parser, &infer_meta_context_); } VLOG(6) << "finish process infer meta context"; @@ -126,10 +116,10 @@ LegacyKernelInstruction::LegacyKernelInstruction( 
phi_kernel_->IsValid(), true, "not found kernel for [%s]", kernel_name); VLOG(6) << "finish process select kernel: " << kernel_name; - Scope* inner_scope = local_scope == nullptr ? scope : local_scope; + const Scope* inner_scope = value_exec_info_.GetScope(); + + operator_base_ = BuildOperatorBase(op, value_exec_info_, yaml_info_parser); - operator_base_ = pir::BuildOperatorBase( - op, value_2_var_name, yaml_info_parser, variable_2_var_name, inner_scope); paddle::framework::VariableValueMap in_map; paddle::framework::VariableValueMap out_map; auto dev_ctx = phi::DeviceContextPool::Instance().Get( @@ -137,14 +127,11 @@ LegacyKernelInstruction::LegacyKernelInstruction( runtime_context_ = std::make_shared( paddle::framework::RuntimeContext(in_map, out_map)); - pir::BuildRuntimeContext(op, - value_2_var_name, - scope, - local_scope, - yaml_info_parser, - runtime_context_.get()); + BuildRuntimeContext( + op, value_exec_info, yaml_info_parser, runtime_context_.get()); + kernel_context_ = new paddle::framework::ExecutionContext( - *operator_base_, *local_scope, *dev_ctx, *(runtime_context_.get())); + *operator_base_, *inner_scope, *dev_ctx, *(runtime_context_.get())); VLOG(6) << "finish process kernel context"; SetDeviceContext( @@ -156,8 +143,7 @@ LegacyKernelInstruction::LegacyKernelInstruction( GetStreamPriority())); VLOG(6) << "finish process device context"; - InitInputsOutputsIds( - op, inner_scope, value_2_var_name, var_name_2_id, variable_2_var_name); + InitInputsOutputsIds(op, value_exec_info); VLOG(6) << "finish process inputs outputs index"; auto& no_need_buffer_ids = yaml_info_parser.NoNeedBufferIds(); diff --git a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h index 9c6fbd9b7d807..1ccbc8ebc0158 100644 --- a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h +++ b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h @@ -18,25 +18,19 @@ namespace pir { class Operation; -class Value; } // namespace pir namespace paddle { namespace framework { class Scope; +class ValueExecutionInfo; class LegacyKernelInstruction : public InstructionBase { public: - LegacyKernelInstruction( - size_t id, - const platform::Place& place, - ::pir::Operation* op, - Scope* scope, - Scope* local_scope, - const std::unordered_map<::pir::Value, std::string>& value_2_var_name, - const std::map& var_name_2_id, - const std::unordered_map& - variable_2_var_name); + LegacyKernelInstruction(size_t id, + const platform::Place& place, + ::pir::Operation* op, + const ValueExecutionInfo& value_exec_info); ~LegacyKernelInstruction(); phi::Kernel* PhiKernel() const { return phi_kernel_; } @@ -70,6 +64,8 @@ class LegacyKernelInstruction : public InstructionBase { phi::Kernel* phi_kernel_{nullptr}; // not owned ::pir::Operation* op_{nullptr}; // not owned + + const ValueExecutionInfo& value_exec_info_; // not owned }; } // namespace framework diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc index e779fb52f26e4..3f93161a363fa 100644 --- a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc @@ -40,13 +40,8 @@ PhiKernelInstruction::PhiKernelInstruction( size_t id, const platform::Place& place, pir::Operation* op, - Scope* scope, - Scope* local_scope, - const 
std::unordered_map& value_2_var_name, - const std::map& var_name_2_id, - const std::unordered_map& - variable_2_var_name) - : InstructionBase(id, place) { + const ValueExecutionInfo& value_exec_info) + : InstructionBase(id, place), value_exec_info_(value_exec_info) { auto op_attributes = op->attributes(); auto op_name = op_attributes.at("op_name").dyn_cast().AsString(); @@ -103,18 +98,13 @@ PhiKernelInstruction::PhiKernelInstruction( VLOG(6) << "finish process yaml_info_parser"; if (infer_meta_interface_) { - pir::BuildPhiContext< + BuildPhiContext< phi::InferMetaContext, phi::MetaTensor, phi::MetaTensor, paddle::small_vector, paddle::small_vector, - false>(op, - value_2_var_name, - scope, - local_scope, - yaml_info_parser, - &infer_meta_context_); + false>(op, value_exec_info_, yaml_info_parser, &infer_meta_context_); } VLOG(6) << "finish process infer meta context"; @@ -130,17 +120,14 @@ PhiKernelInstruction::PhiKernelInstruction( phi_kernel_->IsValid(), true, "not found kernel for [%s]", kernel_name); VLOG(6) << "finish process select kernel"; - pir::BuildPhiContext, - paddle::small_vector, - true>(op, - value_2_var_name, - scope, - local_scope, - yaml_info_parser, - &kernel_context_); + BuildPhiContext, + paddle::small_vector, + true>( + op, value_exec_info_, yaml_info_parser, &kernel_context_); + kernel_context_.SetDeviceContext(phi::DeviceContextPool::Instance().Get( phi::TransToPhiPlace(kernel_key.backend()))); VLOG(6) << "finish process kernel context"; @@ -154,9 +141,7 @@ PhiKernelInstruction::PhiKernelInstruction( GetStreamPriority())); VLOG(6) << "finish process device context"; - Scope* inner_scope = local_scope == nullptr ? scope : local_scope; - InitInputsOutputsIds( - op, inner_scope, value_2_var_name, var_name_2_id, variable_2_var_name); + InitInputsOutputsIds(op, value_exec_info); VLOG(6) << "finish process inputs outputs index"; auto& no_need_buffer_ids = yaml_info_parser.NoNeedBufferIds(); diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h index 96484f435a9f7..41539300c4503 100644 --- a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h +++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h @@ -23,20 +23,14 @@ class Operation; namespace paddle { namespace framework { class Scope; -class Value; +class ValueExecutionInfo; class PhiKernelInstruction : public InstructionBase { public: - PhiKernelInstruction( - size_t id, - const platform::Place& place, - ::pir::Operation* op, - Scope* scope, - Scope* local_scope, - const std::unordered_map<::pir::Value, std::string>& value_2_var_name, - const std::map& var_name_2_id, - const std::unordered_map& - variable_2_var_name); + PhiKernelInstruction(size_t id, + const platform::Place& place, + ::pir::Operation* op, + const ValueExecutionInfo& value_exec_info); ~PhiKernelInstruction(); @@ -71,6 +65,8 @@ class PhiKernelInstruction : public InstructionBase { std::string phi_op_name_; ::pir::Operation* op_{nullptr}; // not owned + + const ValueExecutionInfo& value_exec_info_; // not owned }; } // namespace framework diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 480d445017c21..3d142acdc1c7a 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -1027,96 +1027,6 @@ void 
BuildOpFuncList(const platform::Place& place, delete garbages; } -void BuildOpFuncList( - const platform::Place& place, - pir::Block* block, - std::vector* vec_func_list, - framework::Scope* scope, - framework::Scope* local_scope, - const std::unordered_map& value_2_name_map, - const ExecutionConfig& execution_config) { - vec_func_list->reserve(block->size()); - pir::IrContext* ctx = pir::IrContext::Instance(); - - ctx->GetOrRegisterDialect(); - - for (auto op : *block) { - OpFuncNode op_func_node; - auto attr_map = op->attributes(); - - auto op_name = - attr_map.at("op_name").dyn_cast().AsString(); - op_func_node.phi_op_name_ = op_name; - - if (GetSpecialOpNames().count(op_name)) { - VLOG(6) << "skip process " << op_name; - continue; - } - - pir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_name); - - auto impl = - op_info.GetInterfaceImpl(); - - op_func_node.infer_meta_interface_ = - op_info.GetInterfaceImpl(); - - VLOG(6) << "op name" << op_func_node.phi_op_name_; - dialect::OpYamlInfoParser op_yaml_info_parser(impl->get_op_info_()); - if (op_func_node.infer_meta_interface_) { - pir::BuildPhiContext< - phi::InferMetaContext, - phi::MetaTensor, - phi::MetaTensor, - paddle::small_vector, - paddle::small_vector, - false>(op, - value_2_name_map, - scope, - local_scope, - op_yaml_info_parser, - &(op_func_node.infer_meta_context_)); - } - - auto kernel_name = - attr_map.at("kernel_name").dyn_cast().AsString(); - auto kernel_key = attr_map.at("kernel_key") - .dyn_cast() - .data(); - - VLOG(6) << "finish process infer meta context"; - auto t1 = phi::KernelFactory::Instance().SelectKernelOrThrowError( - kernel_name, kernel_key); - op_func_node.phi_kernel_ = new phi::Kernel(t1.kernel); - - PADDLE_ENFORCE_EQ(op_func_node.phi_kernel_->IsValid(), - true, - "not found kernel for [%s]", - kernel_name); - - pir::BuildPhiContext, - paddle::small_vector, - true>(op, - value_2_name_map, - scope, - local_scope, - op_yaml_info_parser, - &(op_func_node.kernel_context_)); - - VLOG(6) << "finish process kernel context"; - op_func_node.kernel_context_.SetDeviceContext( - phi::DeviceContextPool::Instance().Get( - phi::TransToPhiPlace(kernel_key.backend()))); - op_func_node.dev_ctx_ = phi::DeviceContextPool::Instance().Get( - phi::TransToPhiPlace(kernel_key.backend())); - - vec_func_list->emplace_back(op_func_node); - } -} - void BuildVariableScope(const framework::BlockDesc& block, const ExecutionConfig& execution_config, VariableScope* var_scope) { diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h index 010d11318b432..49bcd8de0b4b1 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h @@ -104,15 +104,6 @@ void BuildOpFuncList(const platform::Place& place, bool use_local_scope = true, bool static_build = false); -void BuildOpFuncList( - const platform::Place& place, - ::pir::Block* block, - std::vector* vec_func_list, - framework::Scope* scope, - framework::Scope* local_scope, - const std::unordered_map<::pir::Value, std::string>& value_2_name_map, - const ExecutionConfig& execution_config); - void BuildVariableScope(const framework::BlockDesc& block, const ExecutionConfig& execution_config, VariableScope* var_scope); diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index ac9b826e6a500..b6b2be142a0dd 100644 --- 
a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -47,6 +47,8 @@ #include "paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_op.h" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h" +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" +#include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" #include "paddle/fluid/pir/dialect/operator/utils/utils.h" #include "paddle/pir/core/builtin_attribute.h" @@ -114,7 +116,7 @@ NewIRInterpreter::NewIRInterpreter( std::stringstream ss; ss << this; - ::pir::BuildScope(*ir_block_, ss.str(), &sub_blocks_, value_exe_info_.get()); + BuildScope(*ir_block_, ss.str(), value_exe_info_.get()); } NewIRInterpreter::NewIRInterpreter( @@ -176,7 +178,7 @@ NewIRInterpreter::NewIRInterpreter( std::stringstream ss; ss << this; - ::pir::BuildScope(*ir_block_, ss.str(), &sub_blocks_, value_exe_info_.get()); + BuildScope(*ir_block_, ss.str(), value_exe_info_.get()); } NewIRInterpreter::~NewIRInterpreter() { @@ -379,7 +381,7 @@ std::string NewIRInterpreter::GetDepsString() const { bool NewIRInterpreter::HasLocalScope() const { return local_scope_ != nullptr; } -Scope* NewIRInterpreter::InnerScope() { +Scope* NewIRInterpreter::InnerScope() const { return local_scope_ != nullptr ? local_scope_ : scope_; } @@ -558,20 +560,15 @@ void NewIRInterpreter::BuildInstruction() { VLOG(6) << "Build Instruction for op: " << op_idx; if (op->dialect()->name() == "builtin") { if (interpreter::GetSpecialOpNames().count(op->name())) { - VLOG(6) << "skip process " << op->name(); + VLOG(6) << "skip process builtin dialect op: " << op->name(); continue; } } else if (op->dialect()->name() == "cf") { + VLOG(6) << "skip process cf dialect op: " << op->name(); continue; - } else if (op->dialect()->name() == "pd_op") { - vec_instruction_base_.emplace_back( - std::make_unique(op_idx++, - place_, - op, - scope_, - local_scope_, - value_exe_info_.get(), - sub_blocks_)); + } else if (op->isa()) { + vec_instruction_base_.emplace_back(std::make_unique( + op_idx++, place_, op, value_exe_info_.get())); } else if (op->dialect()->name() == "pd_kernel") { auto op_name = op->attributes() .at("op_name") @@ -583,28 +580,14 @@ void NewIRInterpreter::BuildInstruction() { } VLOG(6) << "process " << op_name; - if (op->name().compare(paddle::dialect::LegacyKernelOp::name()) == 0) { + if (op->isa()) { vec_instruction_base_.emplace_back( std::make_unique( - op_idx++, - place_, - op, - scope_, - local_scope_, - value_exe_info_->GetValue2VarName(), - value_exe_info_->GetVarName2Id(), - value_exe_info_->GetVar2VarName())); + op_idx++, place_, op, *(value_exe_info_.get()))); } else { vec_instruction_base_.emplace_back( std::make_unique( - op_idx++, - place_, - op, - scope_, - local_scope_, - value_exe_info_->GetValue2VarName(), - value_exe_info_->GetVarName2Id(), - value_exe_info_->GetVar2VarName())); + op_idx++, place_, op, *(value_exe_info_.get()))); } #ifdef PADDLE_WITH_CINN } else if (op->dialect()->name() == "cinn_runtime") { diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.h b/paddle/fluid/framework/new_executor/new_ir_interpreter.h index 04a149bb6d692..3a128791cdfce 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.h +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.h @@ -78,6 +78,8 @@ class NewIRInterpreter : public InterpreterBaseImpl { const Scope* local_scope() const override; + Scope* InnerScope() const; + const 
platform::Place& GetPlace() const override { return place_; } void SetOutputHooks(const std::vector& hookfuncs) override { @@ -115,8 +117,6 @@ class NewIRInterpreter : public InterpreterBaseImpl { // scope bool HasLocalScope() const; - Scope* InnerScope(); - // For log and debug std::string GetDepsString() const; @@ -216,8 +216,6 @@ class NewIRInterpreter : public InterpreterBaseImpl { // value execution info std::shared_ptr value_exe_info_; - std::map sub_blocks_; - std::vector var_ref_count_; interpreter::NewIrDependencyBuilder ir_dependency_builder_; diff --git a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc index f8400b1c289a5..4142b3fe872f1 100644 --- a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc +++ b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc @@ -13,38 +13,36 @@ // limitations under the License. #include "paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h" -#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" -#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" -#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" -#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h" -#include "paddle/fluid/pir/dialect/operator/utils/utils.h" -#include "paddle/phi/core/meta_tensor.h" -#include "paddle/pir/core/builtin_attribute.h" -#include "paddle/pir/core/builtin_op.h" -#include "paddle/pir/core/ir_context.h" -#include "paddle/pir/core/program.h" -#include "paddle/pir/core/utils.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_ref_array.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable_helper.h" -#include "paddle/phi/core/kernel_context.h" - -#include "paddle/fluid/framework/string_array.h" -#include "paddle/fluid/framework/tensor_ref_array.h" #include "paddle/fluid/ir_adaptor/translator/op_compat_info.h" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.h" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h" +#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" +#include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" #include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h" +#include "paddle/fluid/pir/dialect/operator/utils/utils.h" #include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_context.h" +#include "paddle/phi/core/meta_tensor.h" +#include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/builtin_op.h" +#include "paddle/pir/core/ir_context.h" +#include "paddle/pir/core/program.h" +#include "paddle/pir/core/utils.h" #include "glog/logging.h" -#include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" -#include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" namespace paddle { namespace framework { @@ -126,10 +124,159 @@ std::string ValueExecutionInfo::GetNameById(int id) const { } return ""; } -} // 
namespace framework -} // namespace paddle -namespace pir { +const std::unordered_map<::pir::Value, std::string>& +ValueExecutionInfo::GetValue2VarName() const { + return value_2_var_name_; +} + +void ValueExecutionInfo::AddValue2VarName(::pir::Value value, + const std::string& var_name) { + value_2_var_name_.emplace(value, var_name); +} + +const std::unordered_map& +ValueExecutionInfo::GetVar2VarName() const { + return var_2_var_name_; +} + +const std::map& ValueExecutionInfo::GetVarName2Id() const { + return var_name_2_id_; +} + +const std::unordered_map& ValueExecutionInfo::GetId2VarName() + const { + return id_2_var_name_; +} + +const std::vector& ValueExecutionInfo::GetVarList() const { + return var_list_; +} + +void ValueExecutionInfo::ResetVarList(int id, Variable* var) { + var_list_[id] = var; +} + +bool ValueExecutionInfo::HasValue(::pir::Value value) const { + return HasValueInternal(value); +} + +bool ValueExecutionInfo::HasLocalValue(::pir::Value value) const { + return HasValueLocally(value); +} + +std::string ValueExecutionInfo::GetVarName(::pir::Value value) const { + return GetVarNameInternal(value); +} + +std::string ValueExecutionInfo::GetVarName(const Variable* var) const { + return GetVarNameInternal(var); +} + +std::string ValueExecutionInfo::GetLocalVarName(::pir::Value value) const { + return GetVarNameLocally(value); +} + +std::string ValueExecutionInfo::GetLocalVarName(const Variable* var) const { + return GetVarNameLocally(var); +} + +int ValueExecutionInfo::GetVarId(::pir::Value value) const { + return GetVarIdInternal(value); +} + +int ValueExecutionInfo::GetVarId(const Variable* var) const { + return GetVarIdInternal(var); +} + +int ValueExecutionInfo::GetLocalVarId(::pir::Value value) const { + return GetVarIdLocally(value); +} + +int ValueExecutionInfo::GetLocalVarId(const Variable* var) const { + return GetVarIdLocally(var); +} + +bool ValueExecutionInfo::HasValueInternal(::pir::Value value) const { + if (HasValueLocally(value)) { + return true; + } + return (parent_ == nullptr) ? false : parent_->HasValueInternal(value); +} + +bool ValueExecutionInfo::HasValueLocally(::pir::Value value) const { + auto it = value_2_var_name_.find(value); + if (it != value_2_var_name_.end()) { + return true; + } + return false; +} + +std::string ValueExecutionInfo::GetVarNameInternal(::pir::Value value) const { + auto name = GetVarNameLocally(value); + if (name != "") { + return name; + } + return (parent_ == nullptr) ? "" : parent_->GetVarNameInternal(value); +} + +std::string ValueExecutionInfo::GetVarNameLocally(::pir::Value value) const { + auto it = value_2_var_name_.find(value); + if (it != value_2_var_name_.end()) { + return it->second; + } + return ""; +} + +std::string ValueExecutionInfo::GetVarNameInternal(const Variable* var) const { + auto name = GetVarNameLocally(var); + if (name != "") { + return name; + } + return (parent_ == nullptr) ? "" : parent_->GetVarNameInternal(var); +} + +std::string ValueExecutionInfo::GetVarNameLocally(const Variable* var) const { + auto it = var_2_var_name_.find(var); + if (it != var_2_var_name_.end()) { + return it->second; + } + return ""; +} + +int ValueExecutionInfo::GetVarIdInternal(::pir::Value value) const { + auto id = GetVarIdLocally(value); + if (id != -1) { + return id; + } + return (parent_ == nullptr) ? 
-1 : parent_->GetVarIdInternal(value); +} + +int ValueExecutionInfo::GetVarIdLocally(::pir::Value value) const { + auto var_name = GetVarNameLocally(value); + auto it = var_name_2_id_.find(var_name); + if (it != var_name_2_id_.end()) { + return it->second; + } + return -1; +} + +int ValueExecutionInfo::GetVarIdInternal(const Variable* var) const { + auto id = GetVarIdLocally(var); + if (id != -1) { + return id; + } + return (parent_ == nullptr) ? -1 : parent_->GetVarIdInternal(var); +} + +int ValueExecutionInfo::GetVarIdLocally(const Variable* var) const { + auto var_name = GetVarNameLocally(var); + auto it = var_name_2_id_.find(var_name); + if (it != var_name_2_id_.end()) { + return it->second; + } + return -1; +} const std::unordered_set SpecialOps = {"pd_op.feed", "pd_op.fetch", @@ -142,31 +289,24 @@ const std::unordered_set SpecialOps = {"pd_op.feed", "pd_op.shadow_output", "pd_op.if"}; -using VariableNameMap = - std::unordered_map; - -paddle::framework::Variable* CreateVar( - pir::Value value, - const std::string& var_name_prefix, - bool force_persisable, - paddle::framework::ValueExecutionInfo* value_exe_info) { - Operation* def_op = value.dyn_cast().owner(); +Variable* CreateVar(pir::Value value, + const std::string& var_name_prefix, + bool force_persisable, + ValueExecutionInfo* value_exe_info) { + pir::Operation* def_op = value.dyn_cast().owner(); bool is_persisable = false; if (def_op->isa<::pir::SetParameterOp>()) { is_persisable = true; } - paddle::framework::Variable* var = nullptr; - + Variable* var = nullptr; std::string name = var_name_prefix + "_inner_var_" + std::to_string(value_exe_info->GetVar2VarName().size()); if (force_persisable || is_persisable) { VLOG(6) << "Create var: " << name << " in scope " << value_exe_info->GetScope()->root(); - var = const_cast( - value_exe_info->GetScope()->root()) - ->Var(name); + var = const_cast(value_exe_info->GetScope()->root())->Var(name); } else { VLOG(6) << "Create var: " << name << " in scope " << value_exe_info->GetScope(); @@ -178,20 +318,19 @@ paddle::framework::Variable* CreateVar( return var; } -void CheckInputVars( - pir::Operation* op, - const std::string& op_name, - const std::unordered_map& value_2_var_name) { +void CheckInputVars(pir::Operation* op, + const std::string& op_name, + ValueExecutionInfo* execution_info) { size_t input_num = op->num_operands(); if (input_num > 0) { for (size_t i = 0; i < input_num; ++i) { auto value = op->operand_source(i); if (IsInvalid(value)) { - PADDLE_ENFORCE_NE( - value_2_var_name.find(value), - value_2_var_name.end(), + PADDLE_ENFORCE_EQ( + execution_info->HasValue(value), + true, phi::errors::PreconditionNotMet( - "input should in name map, [%d] 'th input of [%s] op", + "input should in execution_info, [%d] 'th input of [%s] op", i, op_name)); } @@ -201,13 +340,13 @@ void CheckInputVars( void BuildValue(pir::Value value, const std::string& var_name_prefix, - paddle::framework::ValueExecutionInfo* value_exe_info) { + ValueExecutionInfo* value_exe_info) { if (!IsInvalid(value)) { VLOG(8) << "Value is not invalid, so skip build a variable."; return; } - paddle::framework::Variable* var = nullptr; + Variable* var = nullptr; auto& value_2_var_name = value_exe_info->GetValue2VarName(); if (value_2_var_name.find(value) != value_2_var_name.end()) { var = value_exe_info->GetScope()->FindVar(value_2_var_name.at(value)); @@ -221,7 +360,7 @@ void BuildValue(pir::Value value, } else if (value.type().isa()) { var->GetMutable(); } else if (value.type().isa()) { - auto tensor_array = 
var->GetMutable(); + auto tensor_array = var->GetMutable(); for (size_t i = 0; i < value.type().dyn_cast().size(); i++) { PADDLE_ENFORCE(value.type() @@ -236,16 +375,15 @@ void BuildValue(pir::Value value, tensor_array->emplace_back(var_i); } } else { - PADDLE_THROW(phi::errors::PreconditionNotMet( - "Output only support DenseTensorType or VectorType")); + PADDLE_THROW( + phi::errors::PreconditionNotMet("Output only support DenseTensorType " + "or SelectedRowsType or VectorType")); } } -void HandleForSpecialOp( - pir::Operation* op, - const std::string& var_name_prefix, - std::map* sub_blocks, - paddle::framework::ValueExecutionInfo* value_exe_info) { +void HandleForSpecialOp(pir::Operation* op, + const std::string& var_name_prefix, + ValueExecutionInfo* value_exe_info) { std::string op_name = op->name(); if (op->attributes().count("op_name")) { op_name = @@ -258,8 +396,7 @@ void HandleForSpecialOp( op->attributes().at("name").dyn_cast().AsString(); auto fetch_var_name = fetch_src_name + "@fetch"; - auto* var = const_cast( - value_exe_info->GetScope()->root()) + auto* var = const_cast(value_exe_info->GetScope()->root()) ->Var(fetch_var_name); var->GetMutable(); auto value = op->result(0); @@ -275,7 +412,7 @@ void HandleForSpecialOp( std::string name = op->attributes().at("name").dyn_cast().AsString(); - paddle::framework::Variable* var = value_exe_info->GetScope()->Var(name); + Variable* var = value_exe_info->GetScope()->Var(name); PADDLE_ENFORCE(var, paddle::platform::errors::InvalidArgument( "The variable %s shoud exist", name)); @@ -286,7 +423,7 @@ void HandleForSpecialOp( if (op_name == "builtin.combine") { auto out_value = op->result(0); - paddle::framework::Variable* var = nullptr; + Variable* var = nullptr; auto& value_2_var_name = value_exe_info->GetValue2VarName(); if (value_2_var_name.find(out_value) != value_2_var_name.end()) { var = value_exe_info->GetScope()->FindVar(value_2_var_name.at(out_value)); @@ -294,7 +431,7 @@ void HandleForSpecialOp( var = CreateVar(out_value, var_name_prefix, false, value_exe_info); } - auto tensor_array = var->GetMutable(); + auto tensor_array = var->GetMutable(); // clear tensor array tensor_array->clear(); size_t input_num = op->num_operands(); @@ -327,7 +464,7 @@ void HandleForSpecialOp( "SetParamer param name should not equal with var name")); if (value_exe_info->GetScope()->root()->FindVar(param_name) == nullptr) { - const_cast(value_exe_info->GetScope()->root()) + const_cast(value_exe_info->GetScope()->root()) ->Rename(orig_name, param_name); VLOG(6) << "set_parameter rename var: " << orig_name << " -> " << param_name; @@ -346,7 +483,7 @@ void HandleForSpecialOp( auto orig_name = value_exe_info->GetValue2VarName().at(value); if (value_exe_info->GetScope()->root()->FindVar(var_name) == nullptr) { - const_cast(value_exe_info->GetScope()->root()) + const_cast(value_exe_info->GetScope()->root()) ->Rename(orig_name, var_name); } @@ -377,7 +514,7 @@ void HandleForSpecialOp( op->attributes().at("index").dyn_cast().data(); auto in_var = value_exe_info->GetScope()->FindVar( value_exe_info->GetValue2VarName().at(in_value)); - auto variable_array = in_var->Get(); + auto variable_array = in_var->Get(); PADDLE_ENFORCE_EQ( value_exe_info->GetVar2VarName().count(variable_array[index]), @@ -401,7 +538,7 @@ void HandleForSpecialOp( auto in_var = value_exe_info->GetScope()->FindVar( value_exe_info->GetValue2VarName().at(in_value)); - auto variable_array = in_var->Get(); + auto variable_array = in_var->Get(); for (uint64_t idx = 0; idx < variable_array.size(); 
++idx) { auto out_value = op->result(idx); @@ -419,20 +556,8 @@ void HandleForSpecialOp( if (op_name == "pd_op.if") { auto if_op = op->dyn_cast(); - - auto true_block = if_op.true_block(); - - auto false_block = if_op.false_block(); - - auto& true_branch_scope = value_exe_info->GetScope()->NewScope(); - sub_blocks->emplace(true_block, &true_branch_scope); - - auto& false_branch_scope = value_exe_info->GetScope()->NewScope(); - sub_blocks->emplace(false_block, &false_branch_scope); - for (size_t i = 0; i < if_op->num_results(); ++i) { // auto true_value = true_yeid_op->operand_source(i); - auto if_op_out_value = if_op->result(i); BuildValue(if_op_out_value, var_name_prefix, value_exe_info); } @@ -441,7 +566,7 @@ void HandleForSpecialOp( void HandleForInplaceOp(pir::Operation* op, const std::string& var_name_prefix, - paddle::framework::ValueExecutionInfo* value_exe_info) { + ValueExecutionInfo* value_exe_info) { if (op->num_results() < 1) return; pir::IrContext* ctx = pir::IrContext::Instance(); std::string op_name = op->name(); @@ -492,13 +617,11 @@ void HandleForInplaceOp(pir::Operation* op, // is created in inner_scope. void BuildScope(const pir::Block& block, const std::string& var_name_prefix, - std::map* sub_blocks, - paddle::framework::ValueExecutionInfo* value_exe_info) { + ValueExecutionInfo* value_exe_info) { VLOG(4) << "***** [before build] scope" << "(" << value_exe_info->GetScope() << ") ******\n" - << paddle::framework::GenScopeTreeDebugInfo( - const_cast( - value_exe_info->GetScope()->root())); + << GenScopeTreeDebugInfo( + const_cast(value_exe_info->GetScope()->root())); for (auto op : block) { std::string op_name = op->name(); @@ -510,11 +633,11 @@ void BuildScope(const pir::Block& block, } VLOG(4) << "build op:" << op_name; if (SpecialOps.count(op_name)) { - HandleForSpecialOp(op, var_name_prefix, sub_blocks, value_exe_info); + HandleForSpecialOp(op, var_name_prefix, value_exe_info); continue; } - CheckInputVars(op, op_name, value_exe_info->GetValue2VarName()); + CheckInputVars(op, op_name, value_exe_info); if (op->num_results() < 1) continue; if (op->attributes().count("is_inplace") != 0 && @@ -533,22 +656,16 @@ void BuildScope(const pir::Block& block, VLOG(4) << "***** [after build] scope" << "(" << value_exe_info->GetScope() << ") ******\n" - << paddle::framework::GenScopeTreeDebugInfo( - const_cast( - value_exe_info->GetScope()->root())); + << GenScopeTreeDebugInfo( + const_cast(value_exe_info->GetScope()->root())); } -void BuildRuntimeContext( - pir::Operation* op, - const std::unordered_map& name_map, - paddle::framework::Scope* scope, - paddle::framework::Scope* local_scope, - const paddle::dialect::OpYamlInfoParser& op_yaml_info, - paddle::framework::RuntimeContext* runtime_ctx) { - paddle::framework::Scope* inner_scope = - local_scope != nullptr ? 
local_scope : scope; - VLOG(6) << "BuildPhiContext in scope[" << scope << "] inner_scope[" - << inner_scope << "]"; +void BuildRuntimeContext(pir::Operation* op, + const ValueExecutionInfo& value_exec_info, + const paddle::dialect::OpYamlInfoParser& op_yaml_info, + RuntimeContext* runtime_ctx) { + const Scope* inner_scope = value_exec_info.GetScope(); + VLOG(6) << "BuildPhiContext in scope[" << inner_scope << "]"; auto& vec_kernel_fn_tensor_params = op_yaml_info.TensorParams(true); @@ -572,7 +689,7 @@ void BuildRuntimeContext( } auto legacy_attr_name = op_normalizer.GetLegacyArgName(fluid_op_name, name); - auto in_var_name = name_map.at(ptr); + auto in_var_name = value_exec_info.GetVarName(ptr); VLOG(6) << "ctx->EmplaceBackInput: " << name << "\t" << in_var_name; PADDLE_ENFORCE_NOT_NULL(inner_scope->FindVar(in_var_name), phi::errors::PreconditionNotMet( @@ -592,7 +709,7 @@ void BuildRuntimeContext( continue; } - auto in_var_name = name_map.at(ptr); + auto in_var_name = value_exec_info.GetVarName(ptr); VLOG(6) << "ctx->EmplaceBackOutput: " << name << "\t" << in_var_name; PADDLE_ENFORCE_NOT_NULL(inner_scope->FindVar(in_var_name), @@ -606,11 +723,11 @@ void BuildRuntimeContext( type.isa()) { runtime_ctx->outputs[legacy_arg_name] = {var}; } else if (type.isa()) { - auto var_ref = var->Get(); - std::vector vec_tmp; + auto var_ref = var->Get(); + std::vector vec_tmp; vec_tmp.reserve(var_ref.size()); for (size_t k = 0; k < var_ref.size(); ++k) { - vec_tmp.push_back(const_cast(var_ref[k])); + vec_tmp.push_back(const_cast(var_ref[k])); } runtime_ctx->outputs[legacy_arg_name] = vec_tmp; } else { @@ -621,13 +738,10 @@ void BuildRuntimeContext( } } -std::shared_ptr BuildOperatorBase( +std::shared_ptr BuildOperatorBase( pir::Operation* op, - const std::unordered_map& name_map, - const paddle::dialect::OpYamlInfoParser& op_yaml_info, - const std::unordered_map& - variable_2_var_name, - const paddle::framework::Scope* scope) { + const ValueExecutionInfo& value_exec_info, + const paddle::dialect::OpYamlInfoParser& op_yaml_info) { paddle::framework::VariableNameMap in_name_map; paddle::framework::VariableNameMap out_name_map; paddle::framework::AttributeMap attr_map; @@ -639,6 +753,8 @@ std::shared_ptr BuildOperatorBase( auto& op_normalizer = paddle::translator::OpNameNormalizer::instance(); + auto scope = value_exec_info.GetScope(); + // build inputs for (auto& name : vec_kernel_fn_tensor_params) { PADDLE_ENFORCE_EQ( @@ -654,8 +770,9 @@ std::shared_ptr BuildOperatorBase( << name; continue; } - VLOG(6) << "Push back inputs to VariableNameMap : " << name_map.at(ptr); - in_name_map[legacy_attr_name].push_back(name_map.at(ptr)); + VLOG(6) << "Push back inputs to VariableNameMap : " + << value_exec_info.GetVarName(ptr); + in_name_map[legacy_attr_name].push_back(value_exec_info.GetVarName(ptr)); } // build attribute @@ -746,18 +863,17 @@ std::shared_ptr BuildOperatorBase( if (ptr.type().isa() || ptr.type().isa()) { - out_name_map[legacy_arg_name].push_back(name_map.at(ptr)); - VLOG(6) << "Push back outputs to VariableNameMap : " << name_map.at(ptr); + out_name_map[legacy_arg_name].push_back(value_exec_info.GetVarName(ptr)); + VLOG(6) << "Push back outputs to VariableNameMap : " + << value_exec_info.GetVarName(ptr); } else if (ptr.type().isa()) { - auto var = scope->FindVar(name_map.at(ptr)); - auto var_ref = var->Get(); + auto var = scope->FindVar(value_exec_info.GetVarName(ptr)); + auto var_ref = var->Get(); for (size_t k = 0; k < var_ref.size(); ++k) { - PADDLE_ENFORCE(variable_2_var_name.count(var_ref[k]), - 
"Variable MUST in variable_2_var_name map"); out_name_map[legacy_arg_name].push_back( - variable_2_var_name.at(var_ref[k])); + value_exec_info.GetVarName(var_ref[k])); VLOG(6) << "Push back outputs to VariableNameMap : " - << variable_2_var_name.at(var_ref[k]); + << value_exec_info.GetVarName(var_ref[k]); } } else { PADDLE_THROW(phi::errors::Unimplemented( @@ -766,12 +882,13 @@ std::shared_ptr BuildOperatorBase( } } - auto& op_info = paddle::framework::OpInfoMap::Instance().Get(fluid_op_name); + auto& op_info = OpInfoMap::Instance().Get(fluid_op_name); auto ptr = op_info.Creator()(fluid_op_name, in_name_map, out_name_map, attr_map); - std::shared_ptr res(ptr); + std::shared_ptr res(ptr); return res; } -} // namespace pir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h index 821f21ebddae8..87603f2f14b15 100644 --- a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h +++ b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h @@ -14,31 +14,29 @@ #pragma once -#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" -#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" -#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" -#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h" -#include "paddle/fluid/pir/dialect/operator/utils/utils.h" -#include "paddle/phi/core/meta_tensor.h" -#include "paddle/pir/core/builtin_attribute.h" -#include "paddle/pir/core/ir_context.h" -#include "paddle/pir/core/program.h" -#include "paddle/pir/core/utils.h" - #include "paddle/fluid/framework/new_executor/interpreter/execution_config.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable_helper.h" -#include "paddle/phi/core/kernel_context.h" - -#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.h" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h" +#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" #include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h" +#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h" +#include "paddle/fluid/pir/dialect/operator/utils/utils.h" #include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/core/kernel_context.h" +#include "paddle/phi/core/meta_tensor.h" +#include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/ir_context.h" +#include "paddle/pir/core/program.h" #include "paddle/pir/core/type_name.h" +#include "paddle/pir/core/utils.h" #include "glog/logging.h" @@ -48,11 +46,13 @@ namespace framework { class CondInstruction; class ValueExecutionInfo { public: + friend class CondInstruction; + explicit ValueExecutionInfo(Scope* scope) : scope_(scope) {} const ValueExecutionInfo* Parent() const { return parent_; } - Scope* GetScope() { return scope_; } + Scope* GetScope() const { return scope_; } void Add(::pir::Value value, std::string var_name); @@ -62,35 +62,64 @@ class ValueExecutionInfo { std::string GetNameById(int id) const; - const std::unordered_map<::pir::Value, 
std::string>& GetValue2VarName() - const { - return value_2_var_name_; - } + const std::unordered_map<::pir::Value, std::string>& GetValue2VarName() const; - void AddValue2VarName(::pir::Value value, const std::string& var_name) { - value_2_var_name_.emplace(value, var_name); - } + void AddValue2VarName(::pir::Value value, const std::string& var_name); const std::unordered_map& - GetVar2VarName() const { - return var_2_var_name_; - } + GetVar2VarName() const; - const std::map& GetVarName2Id() const { - return var_name_2_id_; - } + const std::map& GetVarName2Id() const; - const std::unordered_map& GetId2VarName() const { - return id_2_var_name_; - } + const std::unordered_map& GetId2VarName() const; - const std::vector& GetVarList() const { return var_list_; } + const std::vector& GetVarList() const; - void ResetVarList(int id, Variable* var) { var_list_[id] = var; } + void ResetVarList(int id, Variable* var); - friend class CondInstruction; + /// Check a value exist in the ValueExecutionInfo or any of its ancestors. + bool HasValue(::pir::Value value) const; + + /// Check a value exist in the ValueExecutionInfo. + bool HasLocalValue(::pir::Value value) const; + + std::string GetVarName(::pir::Value value) const; + + std::string GetVarName(const Variable* var) const; + + std::string GetLocalVarName(::pir::Value value) const; + + std::string GetLocalVarName(const Variable* var) const; + + int GetVarId(::pir::Value value) const; + + int GetVarId(const Variable* var) const; + + int GetLocalVarId(::pir::Value value) const; + + int GetLocalVarId(const Variable* var) const; private: + bool HasValueInternal(::pir::Value value) const; + + bool HasValueLocally(::pir::Value value) const; + + std::string GetVarNameInternal(::pir::Value value) const; + + std::string GetVarNameLocally(::pir::Value value) const; + + std::string GetVarNameInternal(const Variable* var) const; + + std::string GetVarNameLocally(const Variable* var) const; + + int GetVarIdInternal(::pir::Value value) const; + + int GetVarIdLocally(::pir::Value value) const; + + int GetVarIdInternal(const Variable* var) const; + + int GetVarIdLocally(const Variable* var) const; + std::shared_ptr NewChild(Scope* scope); ValueExecutionInfo* parent_{nullptr}; // not owned @@ -99,8 +128,7 @@ class ValueExecutionInfo { std::unordered_map<::pir::Value, std::string> value_2_var_name_; - std::unordered_map - var_2_var_name_; + std::unordered_map var_2_var_name_; std::map var_name_2_id_; @@ -109,11 +137,6 @@ class ValueExecutionInfo { std::vector var_list_; }; -} // namespace framework -} // namespace paddle - -namespace pir { - // NOTE(zhangbo): Some operators of Paddle support optional inputs or outputs, // representing whether the input or output exists. 
In the Pir, whether the // value itself is empty or the type it holds is empty is used to indicate @@ -125,27 +148,19 @@ inline bool IsInvalid(pir::Value value) { return true; } -void BuildScope( - const pir::Block& block, - const std::string& var_name_prefix, - std::map* sub_blocks, - paddle::framework::ValueExecutionInfo* value_exe_info = nullptr); +void BuildScope(const pir::Block& block, + const std::string& var_name_prefix, + ValueExecutionInfo* value_exe_info = nullptr); -void BuildRuntimeContext( - pir::Operation* op, - const std::unordered_map& name_map, - paddle::framework::Scope* scope, - paddle::framework::Scope* local_scope, - const paddle::dialect::OpYamlInfoParser& op_yaml_info, - paddle::framework::RuntimeContext* runtime_ctx); +void BuildRuntimeContext(pir::Operation* op, + const ValueExecutionInfo& value_exec_info, + const paddle::dialect::OpYamlInfoParser& op_yaml_info, + RuntimeContext* runtime_ctx); -std::shared_ptr BuildOperatorBase( +std::shared_ptr BuildOperatorBase( pir::Operation* op, - const std::unordered_map& name_map, - const paddle::dialect::OpYamlInfoParser& op_yaml_info, - const std::unordered_map& - variable_2_var_name, - const paddle::framework::Scope* scope); + const ValueExecutionInfo& value_exec_info, + const paddle::dialect::OpYamlInfoParser& op_yaml_info); template -void BuildPhiContext( - pir::Operation* op, - const std::unordered_map& name_map, - paddle::framework::Scope* scope, - paddle::framework::Scope* local_scope, - const paddle::dialect::OpYamlInfoParser& op_yaml_info, - Context* ctx) { - paddle::framework::Scope* inner_scope = - local_scope != nullptr ? local_scope : scope; - VLOG(6) << "Build " << get_type_name() << " in scope[" << scope - << "] inner_scope[" << inner_scope << "]"; +void BuildPhiContext(pir::Operation* op, + const ValueExecutionInfo& value_exec_info, + const paddle::dialect::OpYamlInfoParser& op_yaml_info, + Context* ctx) { + Scope* inner_scope = value_exec_info.GetScope(); + VLOG(6) << "Build " << pir::get_type_name() << "] inner_scope[" + << inner_scope << "]"; auto attr_map = op->attributes(); @@ -192,7 +203,7 @@ void BuildPhiContext( continue; } - auto in_var_name = name_map.at(ptr); + auto in_var_name = value_exec_info.GetVarName(ptr); VLOG(6) << "ctx->EmplaceBackInput: " << t << "\t" << in_var_name; PADDLE_ENFORCE_NOT_NULL(inner_scope->FindVar(in_var_name), @@ -202,9 +213,9 @@ void BuildPhiContext( if (var->IsType()) { const phi::TensorBase* tensor_in = &(var->Get()); ctx->EmplaceBackInput(InType(tensor_in)); - } else if (var->IsType()) { + } else if (var->IsType()) { InListType inputs; - auto& variable_array = var->Get(); + auto& variable_array = var->Get(); for (size_t i = 0; i < variable_array.size(); ++i) { if (variable_array[i]->IsType()) { inputs.emplace_back(InType(const_cast( @@ -233,7 +244,7 @@ void BuildPhiContext( // tensor attribute, get information from input pir::Value ptr = op->operand_source(name2id.at(t)); - auto in_var_name = name_map.at(ptr); + auto in_var_name = value_exec_info.GetVarName(ptr); auto& tensor_attr_type = op_yaml_info.TensorAttrTypeName(t); VLOG(6) << "ctx->EmplaceBack mutable attr: " << t << "\t" << in_var_name; @@ -243,8 +254,8 @@ void BuildPhiContext( &(inner_scope->FindVar(in_var_name)->Get())); ctx->EmplaceBackAttr(attr); } else if (ptr.type().isa()) { - auto& tensor_array = inner_scope->FindVar(in_var_name) - ->Get(); + auto& tensor_array = + inner_scope->FindVar(in_var_name)->Get(); if (tensor_array.size() == 1) { phi::Attribute attr = phi::TensorRef(&(tensor_array[0]->Get())); 
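// NOTE: with this refactor every variable lookup in BuildPhiContext funnels
// through ValueExecutionInfo instead of the former name_map/scope/local_scope
// trio. A minimal sketch of the equivalent lookup, assuming only the
// GetScope()/GetVarName() accessors declared earlier in this patch
// (FindVarByValue is an illustrative name, not part of the patch):
//
//   Variable* FindVarByValue(const ValueExecutionInfo& info, pir::Value v) {
//     // GetVarName falls back to parent ValueExecutionInfos, so values
//     // defined in an enclosing block resolve transitively.
//     return info.GetScope()->FindVar(info.GetVarName(v));
//   }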
@@ -407,17 +418,18 @@ void BuildPhiContext( if (out_ptr.type().isa()) { ctx->EmplaceBackOutput(OutType(const_cast( - &(inner_scope->FindVar(name_map.at(out_ptr)) + &(inner_scope->FindVar(value_exec_info.GetVarName(out_ptr)) ->Get())))); } else if (out_ptr.type() .isa()) { ctx->EmplaceBackOutput(OutType(const_cast( - &(inner_scope->FindVar(name_map.at(out_ptr)) + &(inner_scope->FindVar(value_exec_info.GetVarName(out_ptr)) ->Get())))); } else if (out_ptr.type().isa()) { OutListType outputs; - auto& variable_array = inner_scope->FindVar(name_map.at(out_ptr)) - ->Get(); + auto& variable_array = + inner_scope->FindVar(value_exec_info.GetVarName(out_ptr)) + ->Get(); for (size_t i = 0; i < variable_array.size(); ++i) { if (variable_array[i]->IsType()) { outputs.emplace_back(OutType(const_cast( @@ -442,4 +454,5 @@ void BuildPhiContext( VLOG(6) << "Done build phi context"; } -} // namespace pir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc index 2eaed09881907..eae6f20a34eaa 100644 --- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc +++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc @@ -230,9 +230,9 @@ pir::OpResult AddPlaceTransferOp(pir::OpResult in, const phi::KernelKey& kernel_key, pir::Block* block) { pir::IrContext* ctx = pir::IrContext::Instance(); - std::string op_name = paddle::dialect::PhiKernelOp::name(); - pir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_name); + pir::OpInfo kernel_op_info = + ctx->GetRegisteredOpInfo(paddle::dialect::PhiKernelOp::name()); if ((src_place.GetType() == phi::AllocationType::CPU) && (dst_place.GetType() == phi::AllocationType::GPU)) { @@ -245,7 +245,7 @@ pir::OpResult AddPlaceTransferOp(pir::OpResult in, {"dst_place_type", pir::Int32Attribute::get(ctx, 1)}}; pir::Operation* op = - pir::Operation::Create({in}, op_attribute, {out_type}, op_info); + pir::Operation::Create({in}, op_attribute, {out_type}, kernel_op_info); if (in.owner()->HasAttribute(kAttrIsPersisable)) { op->set_attribute(kAttrIsPersisable, @@ -267,7 +267,7 @@ pir::OpResult AddPlaceTransferOp(pir::OpResult in, {"dst_place_type", pir::Int32Attribute::get(ctx, 0)}}; pir::Operation* op = - pir::Operation::Create({in}, op_attribute, {out_type}, op_info); + pir::Operation::Create({in}, op_attribute, {out_type}, kernel_op_info); block->push_back(op); @@ -676,45 +676,73 @@ void HandleForIfOp( pir::IrContext* ctx, std::unordered_map* map_op_pair, std::unordered_map* map_value_pair) { - auto cur_in = op_item->operand_source(0); - + auto old_cond = op_item->operand_source(0); PADDLE_ENFORCE_EQ( - map_value_pair->count(cur_in), + map_value_pair->count(old_cond), true, phi::errors::PreconditionNotMet( "[%d]'s input of [%s] op MUST in map pair", 0, op_item->name())); - auto new_in = map_value_pair->at(cur_in); + auto new_cond = map_value_pair->at(old_cond); + + // NOTE(zhangbo): IfOp's input cond should be a cpu type. 
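// (Why CPU: CondInstruction::Run() reads the flag on the host via
// cond_var_->Get<phi::DenseTensor>().data<bool>()[0], so a GPU-resident cond
// would require a blocking device-to-host copy at every execution; the
// lowering below materializes that copy once, as an explicit transfer op.)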
+ AllocatedDenseTensorType new_cond_type = + new_cond.type().dyn_cast(); + if (new_cond_type) { + if (new_cond_type.place().GetType() == phi::AllocationType::GPU) { + auto out_type = dialect::AllocatedDenseTensorType::get( + ctx, + phi::CPUPlace(), + old_cond.type().dyn_cast()); + phi::KernelKey kernel_key( + phi::Backend::GPU, phi::DataLayout::ALL_LAYOUT, phi::DataType::BOOL); + new_cond = AddPlaceTransferOp(new_cond, + out_type, + new_cond_type.place(), + phi::CPUPlace(), + kernel_key, + block); + } + } else { + PADDLE_THROW( + phi::errors::Unimplemented("IfOp only support DenseTensorType")); + } + // Create IfOp and insert to kernel dialect program pir::Builder builder(ctx, block); - - auto base_if_op = op_item->dyn_cast(); - std::vector op_output_types; - for (size_t i = 0; i < base_if_op.num_results(); ++i) { - op_output_types.push_back(paddle::dialect::AllocatedDenseTensorType::get( + auto old_ifop = op_item->dyn_cast(); + std::vector new_ifop_outputs; + for (size_t i = 0; i < old_ifop.num_results(); ++i) { + new_ifop_outputs.push_back(paddle::dialect::AllocatedDenseTensorType::get( ctx, place, - base_if_op.result(i).type().dyn_cast())); + old_ifop.result(i).type().dyn_cast())); } - auto new_if_op = - builder.Build(new_in, std::move(op_output_types)); + auto new_ifop = builder.Build( + new_cond, std::move(new_ifop_outputs)); // process true block - pir::Block* true_block = new_if_op.true_block(); + pir::Block* true_block = new_ifop.true_block(); ProcessBlock(place, - base_if_op.true_block(), + old_ifop.true_block(), true_block, ctx, map_op_pair, map_value_pair); // process false block - pir::Block* false_block = new_if_op.false_block(); + pir::Block* false_block = new_ifop.false_block(); ProcessBlock(place, - base_if_op.false_block(), + old_ifop.false_block(), false_block, ctx, map_op_pair, map_value_pair); + + // update map + (*map_op_pair)[op_item] = new_ifop; + for (size_t i = 0; i < op_item->num_results(); ++i) { + (*map_value_pair)[op_item->result(i)] = new_ifop->result(i); + } } pir::OpResult GetNewInput( diff --git a/test/legacy_test/test_cond.py b/test/legacy_test/test_cond.py index 55e6f8116cf33..b5ee90871f67f 100644 --- a/test/legacy_test/test_cond.py +++ b/test/legacy_test/test_cond.py @@ -12,11 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
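# NOTE: the decorators imported below run each test body twice (once under the
# legacy program IR, once under PIR) and compare the results. A minimal usage
# sketch, with an illustrative test class and body that are not part of this
# patch:
#
#   class TestCondSmoke(unittest.TestCase):
#       @test_and_compare_with_new_ir()
#       def test_branch_select(self):
#           paddle.enable_static()
#           main = paddle.static.Program()
#           with paddle.static.program_guard(main):
#               x = paddle.full((1,), 0.23, 'float32')
#               out = paddle.static.nn.cond(x < 1.0, lambda: 2 * x, lambda: x)
#           (ret,) = paddle.static.Executor().run(main, fetch_list=[out])
#           np.testing.assert_allclose(ret, np.full((1,), 0.46), rtol=1e-05)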
+import sys import unittest import numpy as np from simple_nets import batchnorm_fc_with_inputs, simple_fc_net_with_inputs +sys.path.append("../dygraph_to_static") +from dygraph_to_static_util import test_and_compare_with_new_ir + import paddle from paddle import base from paddle.base import core, framework @@ -27,6 +31,7 @@ class TestCondInputOutput(unittest.TestCase): + @test_and_compare_with_new_ir() def test_return_single_var(self): """ pseudocode: @@ -73,6 +78,7 @@ def false_func(): np.asarray(ret), np.full((3, 2), -1, np.int32), rtol=1e-05 ) + @test_and_compare_with_new_ir() def test_return_0d_tensor(self): """ pseudocode: @@ -110,6 +116,7 @@ def false_func(): np.testing.assert_allclose(np.asarray(ret), np.array(2), rtol=1e-05) self.assertEqual(ret.shape, ()) + @test_and_compare_with_new_ir() def test_0d_tensor_as_cond(self): """ pseudocode: @@ -210,6 +217,7 @@ def test_0d_tensor_dygraph(self): ) self.assertEqual(a.grad.shape, []) + @test_and_compare_with_new_ir() def test_return_var_tuple(self): """ pseudocode: @@ -347,6 +355,7 @@ def false_func(): self.assertIsNone(out2) self.assertIsNone(out3) + @test_and_compare_with_new_ir() def test_wrong_structure_exception(self): """ test returning different number of tensors cannot merge into output From 24701efea0cbdf3439e24db331d313bb7f018824 Mon Sep 17 00:00:00 2001 From: WangZhen <23097963+0x45f@users.noreply.github.com> Date: Tue, 10 Oct 2023 10:15:33 +0800 Subject: [PATCH 48/62] [PIR]Migrate maximum into pir (#57929) * [PIR]Migrate maximum into pir * Polish code --- python/paddle/pir_utils.py | 12 ++++++++++++ python/paddle/tensor/math.py | 2 +- test/legacy_test/test_maximum_op.py | 6 ++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/python/paddle/pir_utils.py b/python/paddle/pir_utils.py index 28d261b0155fc..f16d411262a22 100644 --- a/python/paddle/pir_utils.py +++ b/python/paddle/pir_utils.py @@ -13,6 +13,8 @@ # limitations under the License. +from functools import wraps + import paddle @@ -95,3 +97,13 @@ def _switch_to_old_ir(self): "IrGuard._switch_to_old_ir only work when paddle.framework.in_pir_mode() is false, \ please set FLAGS_enable_pir_api = false" ) + + +def test_with_pir_api(func): + @wraps(func) + def impl(*args, **kwargs): + func(*args, **kwargs) + with IrGuard(): + func(*args, **kwargs) + + return impl diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 5a60e6884b890..467c7f7ab88f1 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -1226,7 +1226,7 @@ def maximum(x, y, name=None): Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, [5. , 3. 
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index 5a60e6884b890..467c7f7ab88f1 100644
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -1226,7 +1226,7 @@ def maximum(x, y, name=None):
             Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True,
             [5. , 3. , inf.])
     """
-    if in_dynamic_mode():
+    if in_dynamic_or_pir_mode():
         return _C_ops.maximum(x, y)
     else:
         return _elementwise_op(LayerHelper('elementwise_max', **locals()))
diff --git a/test/legacy_test/test_maximum_op.py b/test/legacy_test/test_maximum_op.py
index 818bdb65fee68..a0e660112bd03 100644
--- a/test/legacy_test/test_maximum_op.py
+++ b/test/legacy_test/test_maximum_op.py
@@ -18,6 +18,7 @@

 import paddle
 from paddle.base import core
+from paddle.pir_utils import test_with_pir_api


 class ApiMaximumTest(unittest.TestCase):
@@ -39,6 +40,7 @@ def setUp(self):
         self.np_expected3 = np.maximum(self.input_a, self.input_c)
         self.np_expected4 = np.maximum(self.input_b, self.input_c)

+    @test_with_pir_api
     def test_static_api(self):
         paddle.enable_static()
         with paddle.static.program_guard(
@@ -119,3 +121,7 @@ def test_dynamic_api(self):
         res = paddle.maximum(b, c)
         res = res.numpy()
         np.testing.assert_allclose(res, self.np_expected4, rtol=1e-05)
+
+
+if __name__ == '__main__':
+    unittest.main()
From 9bea183aa515bf3bda5b923bb10cc0028d14f1d9 Mon Sep 17 00:00:00 2001
From: winter-wang <78149749+winter-wang@users.noreply.github.com>
Date: Tue, 10 Oct 2023 10:21:13 +0800
Subject: [PATCH 49/62] [PIR] support if_op build after subblock has been
 completed. (#57958)

---
 .../pir/dialect/operator/ir/CMakeLists.txt    |  2 +-
 .../dialect/operator/ir/control_flow_op.cc    | 46 +++++++++++++++++++
 .../pir/dialect/operator/ir/control_flow_op.h | 10 ++--
 .../pir/control_flow_dialect/if_op_test.cc    | 40 ++++++++++++++++
 4 files changed, 92 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/pir/dialect/operator/ir/CMakeLists.txt b/paddle/fluid/pir/dialect/operator/ir/CMakeLists.txt
index 3026da6200254..7954e000baf51 100644
--- a/paddle/fluid/pir/dialect/operator/ir/CMakeLists.txt
+++ b/paddle/fluid/pir/dialect/operator/ir/CMakeLists.txt
@@ -191,7 +191,7 @@ cc_library(
 cc_library(
   pd_op_dialect_op
   SRCS ${op_source_file} manual_op.cc control_flow_op.cc
-  DEPS pd_op_dialect_core)
+  DEPS pd_op_dialect_core pir_control_flow)
 cc_library(
   api_builder
   SRCS api_builder.cc
diff --git a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc
index 94ba9a2e2e37f..557f8c7106000 100644
--- a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc
+++ b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc
@@ -17,9 +17,11 @@ paddle::dialect::IfOp, paddle::dialect::WhileOp
 #else

 #include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h"
+#include "paddle/phi/core/enforce.h"
 #include "paddle/pir/core/builder.h"
 #include "paddle/pir/core/ir_printer.h"
 #include "paddle/pir/core/operation_utils.h"
+#include "paddle/pir/dialect/control_flow/ir/cf_ops.h"

 namespace paddle {
 namespace dialect {
@@ -33,6 +35,50 @@ void IfOp::Build(pir::Builder &builder,             // NOLINT
   argument.AddInput(cond);
   argument.output_types.swap(output_types);
 }
+
+void IfOp::Build(pir::Builder &builder,             // NOLINT
+                 pir::OperationArgument &argument,  // NOLINT
+                 pir::Value cond,
+                 std::unique_ptr<pir::Block> &&true_block,
+                 std::unique_ptr<pir::Block> &&false_block) {
+  VLOG(4) << "Start build IfOp";
+  if (true_block && !true_block->empty() &&
+      true_block->back()->isa<pir::YieldOp>()) {
+    auto *op = true_block->back();
+    for (size_t i = 0; i < op->num_operands(); ++i) {
+      argument.AddOutput(op->operand(i).type());
+    }
+  }
+  if (false_block && !false_block->empty() &&
+      false_block->back()->isa<pir::YieldOp>()) {
+    auto *op = false_block->back();
+    PADDLE_ENFORCE_EQ(op->num_operands(),
+                      argument.output_types.size(),
+                      phi::errors::PreconditionNotMet(
+                          "The output size of true block and false block must "
+                          "be equal, but they are %u and %u, respectively",
+                          argument.output_types.size(),
+                          op->num_operands()));
+    for (size_t i = 0; i < op->num_operands(); ++i) {
+      PADDLE_ENFORCE_EQ(
+          op->operand(i).type(),
+          argument.output_types[i],
+          phi::errors::PreconditionNotMet("The output[%d] type of true block "
+                                          "and false block must be equal.",
+                                          i));
+    }
+  } else {
+    PADDLE_ENFORCE(argument.output_types.empty(),
+                   phi::errors::PreconditionNotMet(
+                       "The output size of true block and false block must be "
+                       "equal, but they are %u and 0, respectively",
+                       argument.output_types.size()));
+  }
+  argument.AddRegion()->push_back(true_block.release());
+  argument.AddRegion()->push_back(false_block.release());
+  argument.AddInput(cond);
+}
+
 pir::Block *IfOp::true_block() {
   pir::Region &true_region = (*this)->region(0);
   if (true_region.empty()) true_region.emplace_back();
diff --git a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.h b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.h
index 3f93c51a534e9..99444f78da568 100644
--- a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.h
+++ b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.h
@@ -31,11 +31,11 @@ class IfOp : public pir::Op<IfOp> {
                     pir::Value cond,
                     std::vector<pir::Type> &&output_types);

-  // static void Build(pir::Builder &builder,             // NOLINT
-  //                   pir::OperationArgument &argument,  // NOLINT
-  //                   pir::Value cond,
-  //                   std::unique_ptr<pir::Block>&& true_block,
-  //                   std::unique_ptr<pir::Block>&& false_block);
+  static void Build(pir::Builder &builder,             // NOLINT
+                    pir::OperationArgument &argument,  // NOLINT
+                    pir::Value cond,
+                    std::unique_ptr<pir::Block> &&true_block,
+                    std::unique_ptr<pir::Block> &&false_block);

   pir::Value cond() { return operand_source(0); }
   pir::Block *true_block();
diff --git a/test/cpp/pir/control_flow_dialect/if_op_test.cc b/test/cpp/pir/control_flow_dialect/if_op_test.cc
index 218a67e1acc5b..02d4061a0d5f8 100644
--- a/test/cpp/pir/control_flow_dialect/if_op_test.cc
+++ b/test/cpp/pir/control_flow_dialect/if_op_test.cc
@@ -59,3 +59,43 @@ TEST(if_op_test, base) {

   LOG(INFO) << ss.str();
 }
+
+TEST(if_op_test, build_by_block) {
+  pir::IrContext* ctx = pir::IrContext::Instance();
+  ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>();
+  ctx->GetOrRegisterDialect<pir::ControlFlowDialect>();
+
+  pir::Program program(ctx);
+  pir::Block* block = program.block();
+  pir::Builder builder(ctx, block);
+  auto full_op = builder.Build<paddle::dialect::FullOp>(
+      std::vector<int64_t>{1}, true, phi::DataType::BOOL);
+
+  // construct true block
+  std::unique_ptr<pir::Block> true_block(new pir::Block());
+  builder.SetInsertionPointToStart(true_block.get());
+  auto full_op_1 = builder.Build<paddle::dialect::FullOp>(
+      std::vector<int64_t>{2}, true, phi::DataType::BOOL);
+  builder.Build<pir::YieldOp>(std::vector<pir::Value>{full_op_1.out()});
+
+  // construct false block
+  std::unique_ptr<pir::Block> false_block(new pir::Block());
+  builder.SetInsertionPointToStart(false_block.get());
+  auto full_op_2 = builder.Build<paddle::dialect::FullOp>(
+      std::vector<int64_t>{2}, true, phi::DataType::BOOL);
+  builder.Build<pir::YieldOp>(std::vector<pir::Value>{full_op_2.out()});
+
+  builder.SetInsertionPointToEnd(block);
+
+  builder.Build<paddle::dialect::IfOp>(
+      full_op.out(), std::move(true_block), std::move(false_block));
+
+  EXPECT_FALSE(true_block);
+  EXPECT_FALSE(false_block);
+  EXPECT_EQ(full_op_2->GetParentProgram(), &program);
+
+  std::stringstream ss;
+  program.Print(ss);
+
+  LOG(INFO) << ss.str();
+}
From b481a98fff7983e390d31077695a10dd5b02c033 Mon Sep 17 00:00:00 2001
From: Yiqun Liu
Date: Tue, 10 Oct 2023 10:23:49 +0800
Subject: [PATCH 50/62] Optimize the call of some elementwise kernel to decrease the static library size.
(#57838) --- paddle/phi/kernels/gpu/lerp_kernel.cu | 33 ++---- .../phi/kernels/gpu/viterbi_decode_kernel.cu | 85 +++++++-------- .../kernels/impl/elementwise_kernel_impl.h | 9 +- paddle/phi/kernels/kps/compare_kernel.cu | 61 ++++++----- paddle/phi/kernels/kps/elementwise_kernel.cu | 11 +- paddle/phi/kernels/legacy/compare_kernel.h | 63 +++++++++++ .../legacy/elementwise_divide_kernel.h | 29 +++++ .../phi/kernels/legacy/elementwise_kernel.h | 7 -- .../phi/kernels/legacy/kps/compare_kernel.cu | 66 ++++-------- .../kernels/legacy/kps/elementwise_kernel.cu | 101 +----------------- 10 files changed, 202 insertions(+), 263 deletions(-) create mode 100644 paddle/phi/kernels/legacy/compare_kernel.h create mode 100644 paddle/phi/kernels/legacy/elementwise_divide_kernel.h diff --git a/paddle/phi/kernels/gpu/lerp_kernel.cu b/paddle/phi/kernels/gpu/lerp_kernel.cu index 9f059f1b5d6fb..f9d8514a54ca2 100644 --- a/paddle/phi/kernels/gpu/lerp_kernel.cu +++ b/paddle/phi/kernels/gpu/lerp_kernel.cu @@ -17,15 +17,11 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/expand_kernel.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" namespace phi { -template -struct BroadcastMinElementWiseDirectCUDAFunctor { - HOSTDEVICE inline T operator()(const T min) const { return min; } -}; - template struct LerpElementWiseDirectCUDAFunctor { HOSTDEVICE inline T operator()(const T x, const T y, const T weight) const { @@ -87,36 +83,23 @@ void LerpKernel(const Context &ctx, DenseTensor b_min = phi::EmptyLike(ctx, *out); if (x.dims().size() != y.dims().size() && weight.dims().size() != y.dims().size()) { - std::vector broadcast_min_inputs; - broadcast_min_inputs.reserve(1); - std::vector broadcast_min_outputs = {&b_min}; - auto broadcast_min_functor = - BroadcastMinElementWiseDirectCUDAFunctor(); if (x.dims().size() < y.dims().size() && x.dims().size() < weight.dims().size()) { - broadcast_min_inputs.emplace_back(&x); - phi::funcs::BroadcastKernel(ctx, - broadcast_min_inputs, - &broadcast_min_outputs, - broadcast_min_functor); + // x broadcast to b_min + ExpandKernel(ctx, x, phi::vectorize(b_min.dims()), &b_min); inputs.emplace_back(&b_min); inputs.emplace_back(&y); inputs.emplace_back(&weight); } else if (y.dims().size() < weight.dims().size()) { - broadcast_min_inputs.emplace_back(&y); - phi::funcs::BroadcastKernel(ctx, - broadcast_min_inputs, - &broadcast_min_outputs, - broadcast_min_functor); + // y broadcast to b_min + ExpandKernel(ctx, y, phi::vectorize(b_min.dims()), &b_min); inputs.emplace_back(&x); inputs.emplace_back(&b_min); inputs.emplace_back(&weight); } else { - broadcast_min_inputs.emplace_back(&weight); - phi::funcs::BroadcastKernel(ctx, - broadcast_min_inputs, - &broadcast_min_outputs, - broadcast_min_functor); + // weight broadcast to b_min + ExpandKernel( + ctx, weight, phi::vectorize(b_min.dims()), &b_min); inputs.emplace_back(&x); inputs.emplace_back(&y); inputs.emplace_back(&b_min); diff --git a/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu b/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu index be630f85ce07d..b69c4a691d0e3 100644 --- a/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu +++ b/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu @@ -33,8 +33,10 @@ namespace cub = hipcub; #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/elementwise_add_kernel.h" 
+#include "paddle/phi/kernels/elementwise_multiply_kernel.h" +#include "paddle/phi/kernels/elementwise_subtract_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/compare_functors.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" @@ -80,21 +82,6 @@ int64_t ComputeBlockSize(int64_t col) { return 8; } -template - typename BinaryFunctor, - typename T> -struct BinaryOperation { - void operator()(const Context& dev_ctx, - const DenseTensor& lhs, - const DenseTensor& rhs, - DenseTensor* output) { - std::vector ins{&lhs, &rhs}; - std::vector outs{output}; - phi::funcs::BroadcastKernel(dev_ctx, ins, &outs, BinaryFunctor(), 0); - } -}; - template typename CompareFunctor, @@ -314,47 +301,46 @@ void ViterbiDecodeKernel(const Context& dev_ctx, start_trans.Resize({1, n_labels}); auto logit0 = input_exp.Slice(0, 1); logit0.Resize({batch_size, n_labels}); - BinaryOperation AddFloat; - BinaryOperation AddInt; - BinaryOperation MulFloat; - BinaryOperation MulInt; - BinaryOperation SubFloat; - BinaryOperation SubInt; if (include_bos_eos_tag) { - AddFloat(dev_ctx, logit0, start_trans, &alpha); + phi::AddKernel(dev_ctx, logit0, start_trans, &alpha); GetMask()( dev_ctx, left_length, one, &float_mask); - MulFloat(dev_ctx, stop_trans, float_mask, &alpha_nxt); - AddFloat(dev_ctx, alpha, alpha_nxt, &alpha); + phi::MultiplyKernel( + dev_ctx, stop_trans, float_mask, &alpha_nxt); + phi::AddKernel(dev_ctx, alpha, alpha_nxt, &alpha); } else { alpha = logit0; } - SubInt(dev_ctx, left_length, one, &left_length); + phi::SubtractKernel( + dev_ctx, left_length, one, &left_length); Argmax argmax; for (int64_t i = 1; i < max_seq_len; ++i) { DenseTensor logit = input_exp.Slice(i, i + 1); logit.Resize({batch_size, n_labels}); DenseTensor& alpha_exp = alpha.Resize({batch_size, n_labels, 1}); - AddFloat(dev_ctx, alpha_exp, trans_exp, &alpha_trn_sum); + phi::AddKernel(dev_ctx, alpha_exp, trans_exp, &alpha_trn_sum); auto alpha_argmax_temp = alpha_argmax_unbind[i - 1]; alpha_argmax_temp.Resize({batch_size, n_labels}); argmax(dev_ctx, alpha_trn_sum, &alpha_argmax_temp, &alpha_max, 1); historys.emplace_back(alpha_argmax_temp); - AddFloat(dev_ctx, alpha_max, logit, &alpha_nxt); + phi::AddKernel(dev_ctx, alpha_max, logit, &alpha_nxt); alpha.Resize({batch_size, n_labels}); GetMask()( dev_ctx, left_length, zero, &float_mask); - MulFloat(dev_ctx, alpha_nxt, float_mask, &alpha_nxt); - SubFloat(dev_ctx, float_one, float_mask, &float_mask); - MulFloat(dev_ctx, alpha, float_mask, &alpha); - AddFloat(dev_ctx, alpha, alpha_nxt, &alpha); + phi::MultiplyKernel(dev_ctx, alpha_nxt, float_mask, &alpha_nxt); + phi::SubtractKernel( + dev_ctx, float_one, float_mask, &float_mask); + phi::MultiplyKernel(dev_ctx, alpha, float_mask, &alpha); + phi::AddKernel(dev_ctx, alpha, alpha_nxt, &alpha); if (include_bos_eos_tag) { GetMask()( dev_ctx, left_length, one, &float_mask); - MulFloat(dev_ctx, stop_trans, float_mask, &alpha_nxt); - AddFloat(dev_ctx, alpha, alpha_nxt, &alpha); + phi::MultiplyKernel( + dev_ctx, stop_trans, float_mask, &alpha_nxt); + phi::AddKernel(dev_ctx, alpha, alpha_nxt, &alpha); } - SubInt(dev_ctx, left_length, one, &left_length); + phi::SubtractKernel( + dev_ctx, left_length, one, &left_length); } argmax(dev_ctx, alpha, &last_ids, scores, 1); left_length.Resize({batch_size}); @@ -363,7 +349,8 @@ void ViterbiDecodeKernel(const Context& dev_ctx, // last_ids_update = last_ids * 
tag_mask int last_ids_index = 1; int actual_len = (std::min)(seq_len, static_cast(max_seq_len)); - MulInt(dev_ctx, last_ids, int_mask, &batch_path[actual_len - last_ids_index]); + phi::MultiplyKernel( + dev_ctx, last_ids, int_mask, &batch_path[actual_len - last_ids_index]); // The algorithm below can refer to // https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/layers/crf.py#L438 ARange arange; @@ -371,24 +358,32 @@ void ViterbiDecodeKernel(const Context& dev_ctx, Gather gather; for (auto hist = historys.rbegin(); hist != historys.rend(); ++hist) { ++last_ids_index; - AddInt(dev_ctx, left_length, one, &left_length); - AddInt(dev_ctx, batch_offset, last_ids, &gather_idx); + phi::AddKernel(dev_ctx, left_length, one, &left_length); + phi::AddKernel( + dev_ctx, batch_offset, last_ids, &gather_idx); DenseTensor& last_ids_update = batch_path[actual_len - last_ids_index]; hist->Resize({batch_size * n_labels}); gather(dev_ctx, *hist, gather_idx, &last_ids_update); GetMask()( dev_ctx, left_length, zero, &int_mask); - MulInt(dev_ctx, last_ids_update, int_mask, &last_ids_update); + phi::MultiplyKernel( + dev_ctx, last_ids_update, int_mask, &last_ids_update); GetMask()( dev_ctx, left_length, zero, &zero_len_mask); - MulInt(dev_ctx, last_ids, zero_len_mask, &last_ids_tmp); - SubInt(dev_ctx, one, zero_len_mask, &zero_len_mask); - MulInt(dev_ctx, last_ids_update, zero_len_mask, &last_ids_update); - AddInt(dev_ctx, last_ids_update, last_ids_tmp, &last_ids_update); + phi::MultiplyKernel( + dev_ctx, last_ids, zero_len_mask, &last_ids_tmp); + phi::SubtractKernel( + dev_ctx, one, zero_len_mask, &zero_len_mask); + phi::MultiplyKernel( + dev_ctx, last_ids_update, zero_len_mask, &last_ids_update); + phi::AddKernel( + dev_ctx, last_ids_update, last_ids_tmp, &last_ids_update); GetMask()( dev_ctx, left_length, zero, &int_mask); - MulInt(dev_ctx, last_ids, int_mask, &last_ids); - AddInt(dev_ctx, last_ids_update, last_ids, &last_ids); + phi::MultiplyKernel( + dev_ctx, last_ids, int_mask, &last_ids); + phi::AddKernel( + dev_ctx, last_ids_update, last_ids, &last_ids); } TransposeKernel(dev_ctx, tpath, {1, 0}, path); } diff --git a/paddle/phi/kernels/impl/elementwise_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_kernel_impl.h index 0121f35b3cecb..137829b5193f2 100644 --- a/paddle/phi/kernels/impl/elementwise_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_kernel_impl.h @@ -54,13 +54,8 @@ namespace phi { const DenseTensor& y, \ int axis, \ DenseTensor* out) { \ - std::vector inputs; \ - inputs.reserve(2); \ - std::vector outputs; \ - outputs.reserve(1); \ - inputs.emplace_back(&x); \ - inputs.emplace_back(&y); \ - outputs.emplace_back(out); \ + std::vector inputs = {&x, &y}; \ + std::vector outputs = {out}; \ dev_ctx.template Alloc(out); \ funcs::BroadcastKernel( \ dev_ctx, inputs, &outputs, funcs::name##Functor(), axis); \ diff --git a/paddle/phi/kernels/kps/compare_kernel.cu b/paddle/phi/kernels/kps/compare_kernel.cu index 545a9df2961bf..14bb86b475320 100644 --- a/paddle/phi/kernels/kps/compare_kernel.cu +++ b/paddle/phi/kernels/kps/compare_kernel.cu @@ -14,7 +14,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" -#include "paddle/phi/kernels/impl/compare_kernel_impl.h" +#include "paddle/phi/kernels/funcs/compare_functors.h" #ifdef PADDLE_WITH_XPU_KP #include "paddle/phi/backends/xpu/xpu_context.h" @@ -27,6 +27,7 @@ #include "paddle/phi/kernels/compare_kernel.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" #include 
"paddle/phi/kernels/gpu/reduce.h" +#include "paddle/phi/kernels/legacy/compare_kernel.h" #include "paddle/phi/kernels/primitive/functor_primitives.h" #endif @@ -43,37 +44,27 @@ struct BitwiseAdd { } }; -template -inline void CompareKernelImpl(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out) { - ctx.template Alloc(out); - std::vector ins{&x, &y}; - std::vector outs{out}; - funcs::BroadcastKernel(ctx, ins, &outs, Functor(), axis); -} +#define DEFINE_CUDA_COMPARE_KERNEL(name) \ + template \ + void name##Kernel(const Context& ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out) { \ + if (out->IsSharedWith(x)) { \ + auto x_origin = x; \ + name##RawKernel(ctx, x_origin, y, -1, out); \ + } else { \ + name##RawKernel(ctx, x, y, -1, out); \ + } \ + } -template -inline void InplaceCompareKernelImpl(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out) { - auto x_origin = x; - ctx.template Alloc(out); - out->set_type(phi::DataType::BOOL); - std::vector ins{&x_origin, &y}; - std::vector outs{out}; - funcs::BroadcastKernel(ctx, ins, &outs, Functor(), axis); -} +DEFINE_CUDA_COMPARE_KERNEL(LessThan) +DEFINE_CUDA_COMPARE_KERNEL(LessEqual) +DEFINE_CUDA_COMPARE_KERNEL(GreaterThan) +DEFINE_CUDA_COMPARE_KERNEL(GreaterEqual) +DEFINE_CUDA_COMPARE_KERNEL(Equal) +DEFINE_CUDA_COMPARE_KERNEL(NotEqual) +#undef DEFINE_CUDA_COMPARE_KERNEL #ifndef PADDLE_WITH_XPU_KP template @@ -106,6 +97,14 @@ inline void CompareAllKernelImpl(const Context& ctx, funcs::ReduceKernel>( ctx, tmp, out, kps::IdentityFunctor(), reduce_dims); } + +template +void EqualAllKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + CompareAllKernelImpl>(ctx, x, y, out); +} #endif } // namespace phi diff --git a/paddle/phi/kernels/kps/elementwise_kernel.cu b/paddle/phi/kernels/kps/elementwise_kernel.cu index e2a33d76120f8..584e026241bde 100644 --- a/paddle/phi/kernels/kps/elementwise_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_kernel.cu @@ -18,9 +18,9 @@ #include "paddle/phi/common/float16.h" #endif #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/elementwise_add_kernel.h" #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" #include "paddle/phi/kernels/legacy/elementwise_add_kernel.h" +#include "paddle/phi/kernels/legacy/elementwise_divide_kernel.h" #include "paddle/phi/kernels/legacy/elementwise_kernel.h" #include "paddle/phi/kernels/legacy/elementwise_multipy_kernel.h" #include "paddle/phi/kernels/legacy/elementwise_subtract_kernel.h" @@ -146,13 +146,8 @@ void HeavisideKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, DenseTensor* out) { - std::vector inputs; - inputs.reserve(2); - std::vector outputs; - outputs.reserve(1); - inputs.emplace_back(&x); - inputs.emplace_back(&y); - outputs.emplace_back(out); + std::vector inputs = {&x, &y}; + std::vector outputs = {out}; dev_ctx.template Alloc(out); funcs::BroadcastKernel( dev_ctx, inputs, &outputs, funcs::ElementwiseHeavisideFunctor()); diff --git a/paddle/phi/kernels/legacy/compare_kernel.h b/paddle/phi/kernels/legacy/compare_kernel.h new file mode 100644 index 0000000000000..541ec10d244da --- /dev/null +++ b/paddle/phi/kernels/legacy/compare_kernel.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LessThanRawKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void LessEqualRawKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void GreaterThanRawKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void GreaterEqualRawKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void EqualRawKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void NotEqualRawKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/legacy/elementwise_divide_kernel.h b/paddle/phi/kernels/legacy/elementwise_divide_kernel.h new file mode 100644 index 0000000000000..b63bcaad11693 --- /dev/null +++ b/paddle/phi/kernels/legacy/elementwise_divide_kernel.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/binary.h" + +namespace phi { + +template +void DivideRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/legacy/elementwise_kernel.h b/paddle/phi/kernels/legacy/elementwise_kernel.h index 1d453ec790f7c..b51704da7a6d6 100644 --- a/paddle/phi/kernels/legacy/elementwise_kernel.h +++ b/paddle/phi/kernels/legacy/elementwise_kernel.h @@ -19,13 +19,6 @@ namespace phi { -template -void DivideRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - template void MaximumRawKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/legacy/kps/compare_kernel.cu b/paddle/phi/kernels/legacy/kps/compare_kernel.cu index 5ab9dd8fea2d3..67bd491738346 100644 --- a/paddle/phi/kernels/legacy/kps/compare_kernel.cu +++ b/paddle/phi/kernels/legacy/kps/compare_kernel.cu @@ -32,27 +32,14 @@ namespace phi { -template -struct BitwiseAdd { - // Bitwise add operator, returns a + b - inline T initial() { return static_cast(true); } - - __host__ __device__ __forceinline__ T operator()(const T& a, - const T& b) const { - return a & b; - } -}; - -template -inline void CompareCudaRawKernelImpl(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out) { +template +inline void CompareRawKernelImpl(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { ctx.template Alloc(out); + out->set_type(phi::DataType::BOOL); std::vector ins{&x, &y}; std::vector outs{out}; funcs::BroadcastKernel(ctx, ins, &outs, Functor(), axis); @@ -64,10 +51,8 @@ void LessThanRawKernel(const Context& ctx, const DenseTensor& y, int axis, DenseTensor* out) { - CompareCudaRawKernelImpl, - funcs::GreaterThanFunctor>(ctx, x, y, axis, out); + CompareRawKernelImpl>( + ctx, x, y, axis, out); } template @@ -76,10 +61,8 @@ void LessEqualRawKernel(const Context& ctx, const DenseTensor& y, int axis, DenseTensor* out) { - CompareCudaRawKernelImpl, - funcs::GreaterEqualFunctor>(ctx, x, y, axis, out); + CompareRawKernelImpl>( + ctx, x, y, axis, out); } template @@ -88,43 +71,38 @@ void GreaterThanRawKernel(const Context& ctx, const DenseTensor& y, int axis, DenseTensor* out) { - CompareCudaRawKernelImpl, - funcs::LessThanFunctor>(ctx, x, y, axis, out); + CompareRawKernelImpl>( + ctx, x, y, axis, out); } + template void GreaterEqualRawKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out) { - CompareCudaRawKernelImpl, - funcs::LessEqualFunctor>(ctx, x, y, axis, out); + CompareRawKernelImpl>( + ctx, x, y, axis, out); } + template void EqualRawKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out) { - CompareCudaRawKernelImpl, - funcs::EqualFunctor>(ctx, x, y, axis, out); + CompareRawKernelImpl>( + ctx, x, y, axis, out); } + template void NotEqualRawKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out) { - CompareCudaRawKernelImpl, - funcs::NotEqualFunctor>(ctx, x, y, axis, out); + CompareRawKernelImpl>( + ctx, x, y, axis, out); } } // namespace phi diff --git a/paddle/phi/kernels/legacy/kps/elementwise_kernel.cu b/paddle/phi/kernels/legacy/kps/elementwise_kernel.cu index 6cd7f2dc20a86..f07164bc16885 100644 --- 
a/paddle/phi/kernels/legacy/kps/elementwise_kernel.cu +++ b/paddle/phi/kernels/legacy/kps/elementwise_kernel.cu @@ -23,105 +23,14 @@ namespace phi { DEFINE_CUDA_ELEMENTWISE_OP(Add) - -// Create the definition of Divide DEFINE_CUDA_ELEMENTWISE_OP(Divide) - -// Create the definition of Multiply DEFINE_CUDA_ELEMENTWISE_OP(Multiply) - -// Create the definition of Subtract DEFINE_CUDA_ELEMENTWISE_OP(Subtract) - -template -void MaximumRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out) { - std::vector inputs; - inputs.reserve(2); - std::vector outputs; - outputs.reserve(1); - inputs.emplace_back(&x); - inputs.emplace_back(&y); - outputs.emplace_back(out); - dev_ctx.template Alloc(out); - funcs::BroadcastKernel( - dev_ctx, inputs, &outputs, funcs::MaximumFunctor(), axis); -} - -template -void MinimumRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out) { - std::vector inputs; - inputs.reserve(2); - std::vector outputs; - outputs.reserve(1); - inputs.emplace_back(&x); - inputs.emplace_back(&y); - outputs.emplace_back(out); - dev_ctx.template Alloc(out); - funcs::BroadcastKernel( - dev_ctx, inputs, &outputs, funcs::MinimumFunctor(), axis); -} - -template -void RemainderRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out) { - std::vector inputs; - inputs.reserve(2); - std::vector outputs; - outputs.reserve(1); - inputs.emplace_back(&x); - inputs.emplace_back(&y); - outputs.emplace_back(out); - dev_ctx.template Alloc(out); - funcs::BroadcastKernel( - dev_ctx, inputs, &outputs, funcs::RemainderFunctor(), axis); -} - -template -void FloorDivideRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out) { - std::vector inputs; - inputs.reserve(2); - std::vector outputs; - outputs.reserve(1); - inputs.emplace_back(&x); - inputs.emplace_back(&y); - outputs.emplace_back(out); - dev_ctx.template Alloc(out); - funcs::BroadcastKernel( - dev_ctx, inputs, &outputs, funcs::FloorDivideFunctor(), axis); -} - -template -void ElementwisePowRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out) { - std::vector inputs; - inputs.reserve(2); - std::vector outputs; - outputs.reserve(1); - inputs.emplace_back(&x); - inputs.emplace_back(&y); - outputs.emplace_back(out); - dev_ctx.template Alloc(out); - funcs::BroadcastKernel( - dev_ctx, inputs, &outputs, funcs::ElementwisePowFunctor(), axis); -} +DEFINE_CUDA_ELEMENTWISE_OP(Maximum) +DEFINE_CUDA_ELEMENTWISE_OP(Minimum) +DEFINE_CUDA_ELEMENTWISE_OP(Remainder) +DEFINE_CUDA_ELEMENTWISE_OP(FloorDivide) +DEFINE_CUDA_ELEMENTWISE_OP(ElementwisePow) } // namespace phi From c37f450a6453a98201f91eb7b92e938527eeb25c Mon Sep 17 00:00:00 2001 From: WangZhen <23097963+0x45f@users.noreply.github.com> Date: Tue, 10 Oct 2023 10:25:21 +0800 Subject: [PATCH 51/62] [PIR]Polish relu pir code (#57763) --- python/paddle/nn/functional/activation.py | 6 +----- test/dygraph_to_static/test_convert_call.py | 2 +- test/legacy_test/test_activation_op.py | 10 ++++++---- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index f15c9f280db61..7acafa290f7e0 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -764,13 +764,9 @@ def relu(x, name=None): [0., 0., 1.]) """ - 
if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.relu(x) else: - if paddle.framework.in_dynamic_or_pir_mode(): - # Below code will be removed after we can generate IR api automatically - return paddle._pir_ops.relu(x) - check_variable_and_dtype( x, 'x', ['float16', 'uint16', 'float32', 'float64'], 'relu' ) diff --git a/test/dygraph_to_static/test_convert_call.py b/test/dygraph_to_static/test_convert_call.py index fb6c69fc899fa..77ca5a88f012b 100644 --- a/test/dygraph_to_static/test_convert_call.py +++ b/test/dygraph_to_static/test_convert_call.py @@ -286,7 +286,7 @@ def test_functional_api(self): func = paddle.nn.functional.relu func = paddle.jit.to_static(func) self.assertNotIn("_jst.IfElse", func.code) - self.assertIn("if in_dynamic_mode()", func.code) + self.assertIn("if in_dynamic_or_pir_mode()", func.code) @ast_only_test def test_class_api(self): diff --git a/test/legacy_test/test_activation_op.py b/test/legacy_test/test_activation_op.py index 5689b0cb970fc..84ec122e7adc6 100644 --- a/test/legacy_test/test_activation_op.py +++ b/test/legacy_test/test_activation_op.py @@ -2272,10 +2272,10 @@ def setUp(self): def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out', check_prim=True) + self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) def test_check_output(self): - self.check_output(check_prim=True) + self.check_output(check_prim=True, check_new_ir=True) def if_enable_cinn(self): pass @@ -4583,7 +4583,9 @@ def test_check_grad(self): create_test_act_fp16_class(TestAsinh) create_test_act_fp16_class(TestAtanh) create_test_act_fp16_class(TestRound, grad_check=False) -create_test_act_fp16_class(TestRelu, check_prim=True, enable_cinn=True) +create_test_act_fp16_class( + TestRelu, check_prim=True, enable_cinn=True, check_new_ir=True +) create_test_act_fp16_class( TestGelu, check_prim=True, @@ -4725,7 +4727,7 @@ def test_check_grad(self): create_test_act_bf16_class(TestAsinh) create_test_act_bf16_class(TestAtanh) create_test_act_bf16_class(TestRound, grad_check=False) -create_test_act_bf16_class(TestRelu, check_prim=True) +create_test_act_bf16_class(TestRelu, check_prim=True, check_new_ir=True) create_test_act_bf16_class( TestGelu, check_prim=True, From 39c6270f887ae38c9cb822b30aa523278993aebf Mon Sep 17 00:00:00 2001 From: WangZhen <23097963+0x45f@users.noreply.github.com> Date: Tue, 10 Oct 2023 10:26:14 +0800 Subject: [PATCH 52/62] [PIR]Migrate abs into pir (#57765) --- python/paddle/tensor/layer_function_generator.py | 3 ++- test/legacy_test/test_activation_op.py | 11 ++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/python/paddle/tensor/layer_function_generator.py b/python/paddle/tensor/layer_function_generator.py index 02ab66eb1da2a..30574b93baf48 100644 --- a/python/paddle/tensor/layer_function_generator.py +++ b/python/paddle/tensor/layer_function_generator.py @@ -27,6 +27,7 @@ convert_np_dtype_to_dtype_, core, in_dynamic_mode, + in_dynamic_or_pir_mode, ) __all__ = [] @@ -266,7 +267,7 @@ def generate_activation_fn(op_type): op_proto = OpProtoHolder.instance().get_op_proto(op_type) def func(x, name=None): - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): if hasattr(_C_ops, op_type): op = getattr(_C_ops, op_type) return op(x) diff --git a/test/legacy_test/test_activation_op.py b/test/legacy_test/test_activation_op.py index 84ec122e7adc6..c5fec28242344 100644 --- a/test/legacy_test/test_activation_op.py +++ b/test/legacy_test/test_activation_op.py @@ -1699,10 +1699,13 @@ def init_shape(self): 
def if_enable_cinn(self): pass + def test_check_output(self): + self.check_output(check_new_ir=True) + def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out', check_prim=True) + self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) class TestAbs_ZeroDim(TestAbs): @@ -4566,7 +4569,9 @@ def test_check_grad(self): create_test_act_fp16_class( TestSqrtComp, check_prim=True, enable_cinn=True, check_new_ir=True ) -create_test_act_fp16_class(TestAbs, check_prim=True, enable_cinn=True) +create_test_act_fp16_class( + TestAbs, check_prim=True, enable_cinn=True, check_new_ir=True +) create_test_act_fp16_class(TestCeil, grad_check=False) create_test_act_fp16_class( TestFloor, check_prim=True, grad_check=False, enable_cinn=True @@ -4712,7 +4717,7 @@ def test_check_grad(self): create_test_act_bf16_class(TestSoftshrink) create_test_act_bf16_class(TestSqrt, check_prim=True, check_new_ir=True) create_test_act_bf16_class(TestSqrtComp, check_prim=True, check_new_ir=True) -create_test_act_bf16_class(TestAbs, check_prim=True) +create_test_act_bf16_class(TestAbs, check_prim=True, check_new_ir=True) create_test_act_bf16_class(TestCeil, grad_check=False) create_test_act_bf16_class(TestFloor, grad_check=False, check_prim=True) create_test_act_bf16_class(TestCos) From df88ea34600119e69b31cfbe51cdd5a1f88423cf Mon Sep 17 00:00:00 2001 From: xuxinyi389 <104957571+xuxinyi389@users.noreply.github.com> Date: Tue, 10 Oct 2023 10:31:35 +0800 Subject: [PATCH 53/62] remove non-public API (#57921) --- python/paddle/tensor/__init__.py | 31 ------------------- .../test_math_op_patch_var_base.py | 31 ------------------- 2 files changed, 62 deletions(-) diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index e16ef89ce8a47..61005132276d9 100644 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -714,37 +714,6 @@ 'acosh_', 'asinh_', 'diag', - 'eye', - 'linspace', - 'fill_constant', - 'ones', - 'ones_like', - 'zeros', - 'zeros_like', - 'arange', - 'full', - 'full_like', - 'meshgrid', - 'empty', - 'empty_like', - 'complex', - 'eigh', - 'standard_normal', - 'normal', - 'uniform', - 'randn', - 'rand', - 'randint', - 'randint_like', - 'randperm', - 'poisson', - 'searchsorted', - 'set_printoptions', - 'array_length', - 'array_read', - 'array_write', - 'create_array', - 'einsum', 'normal_', ] diff --git a/test/legacy_test/test_math_op_patch_var_base.py b/test/legacy_test/test_math_op_patch_var_base.py index a49c2000de92d..8dea69705751c 100644 --- a/test/legacy_test/test_math_op_patch_var_base.py +++ b/test/legacy_test/test_math_op_patch_var_base.py @@ -635,37 +635,6 @@ def test_tensor_patch_method(self): self.assertTrue(inspect.ismethod(x.acosh_)) self.assertTrue(inspect.ismethod(x.asinh_)) self.assertTrue(inspect.ismethod(x.diag)) - self.assertTrue(inspect.ismethod(x.eye)) - self.assertTrue(inspect.ismethod(x.linspace)) - self.assertTrue(inspect.ismethod(x.fill_constant)) - self.assertTrue(inspect.ismethod(x.ones)) - self.assertTrue(inspect.ismethod(x.ones_like)) - self.assertTrue(inspect.ismethod(x.zeros)) - self.assertTrue(inspect.ismethod(x.zeros_like)) - self.assertTrue(inspect.ismethod(x.arange)) - self.assertTrue(inspect.ismethod(x.full)) - self.assertTrue(inspect.ismethod(x.full_like)) - self.assertTrue(inspect.ismethod(x.meshgrid)) - self.assertTrue(inspect.ismethod(x.empty)) - self.assertTrue(inspect.ismethod(x.empty_like)) - self.assertTrue(inspect.ismethod(x.complex)) - self.assertTrue(inspect.ismethod(x.eigh)) - 
self.assertTrue(inspect.ismethod(x.standard_normal)) - self.assertTrue(inspect.ismethod(x.normal)) - self.assertTrue(inspect.ismethod(x.uniform)) - self.assertTrue(inspect.ismethod(x.randn)) - self.assertTrue(inspect.ismethod(x.rand)) - self.assertTrue(inspect.ismethod(x.randint)) - self.assertTrue(inspect.ismethod(x.randint_like)) - self.assertTrue(inspect.ismethod(x.randperm)) - self.assertTrue(inspect.ismethod(x.poisson)) - self.assertTrue(inspect.ismethod(x.searchsorted)) - self.assertTrue(inspect.ismethod(x.set_printoptions)) - self.assertTrue(inspect.ismethod(x.array_length)) - self.assertTrue(inspect.ismethod(x.array_read)) - self.assertTrue(inspect.ismethod(x.array_write)) - self.assertTrue(inspect.ismethod(x.create_array)) - self.assertTrue(inspect.ismethod(x.einsum)) def test_complex_scalar(self): a_np = np.random.random(self.shape).astype(self.dtype) From 8b27ef1ecf47c638a0c373678ec47b4666c4594c Mon Sep 17 00:00:00 2001 From: WangZhen <23097963+0x45f@users.noreply.github.com> Date: Tue, 10 Oct 2023 10:33:23 +0800 Subject: [PATCH 54/62] [PIR]Migrate log into pir (#57946) --- python/paddle/tensor/math.py | 2 +- test/legacy_test/test_activation_op.py | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 467c7f7ab88f1..811f28c1ba97b 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -171,7 +171,7 @@ def log(x, name=None): [[0.69314718, 1.09861231, 1.38629436], [1.94591010, 2.07944155, 2.19722462]]) """ - if in_dynamic_mode(): + if in_dynamic_or_pir_mode(): return _C_ops.log(x) else: check_variable_and_dtype( diff --git a/test/legacy_test/test_activation_op.py b/test/legacy_test/test_activation_op.py index c5fec28242344..5c4ed1a242c1c 100644 --- a/test/legacy_test/test_activation_op.py +++ b/test/legacy_test/test_activation_op.py @@ -3257,10 +3257,13 @@ def setUp(self): def if_enable_cinn(self): pass + def test_check_output(self): + self.check_output(check_new_ir=True) + def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out', check_prim=True) + self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) class Test_Log_Op_Fp16(unittest.TestCase): @@ -4608,7 +4611,7 @@ def test_check_grad(self): create_test_act_fp16_class(TestELU) create_test_act_fp16_class(TestCELU) create_test_act_fp16_class(TestReciprocal) -create_test_act_fp16_class(TestLog, check_prim=True) +create_test_act_fp16_class(TestLog, check_prim=True, check_new_ir=True) if core.is_compiled_with_rocm(): create_test_act_fp16_class(TestLog2) else: @@ -4748,7 +4751,7 @@ def test_check_grad(self): create_test_act_bf16_class(TestELU) create_test_act_bf16_class(TestCELU) create_test_act_bf16_class(TestReciprocal) -create_test_act_bf16_class(TestLog, check_prim=True) +create_test_act_bf16_class(TestLog, check_prim=True, check_new_ir=True) if core.is_compiled_with_rocm(): create_test_act_bf16_class(TestLog2) else: From 8d9af14eefb48189125f165d2d615c86521f46c2 Mon Sep 17 00:00:00 2001 From: Shijie <505749828@qq.com> Date: Tue, 10 Oct 2023 10:41:38 +0800 Subject: [PATCH 55/62] remove unnecessary warnings (#57819) --- python/paddle/base/data_feeder.py | 18 +----------------- python/paddle/base/executor.py | 8 +------- python/paddle/incubate/autograd/primapi.py | 6 ++---- 3 files changed, 4 insertions(+), 28 deletions(-) diff --git a/python/paddle/base/data_feeder.py b/python/paddle/base/data_feeder.py index 536f524c596e6..2449f456fdc66 100644 --- 
a/python/paddle/base/data_feeder.py +++ b/python/paddle/base/data_feeder.py @@ -13,7 +13,6 @@ # limitations under the License. import struct -import warnings import numpy as np @@ -196,22 +195,7 @@ def check_dtype( # See NOTE [ Why skip dynamic graph check ] if in_dygraph_mode(): return - if convert_dtype(input_dtype) in ['float16']: - warnings.warn( - "The data type of '{}' in {} only support float16 in GPU now. {}".format( - input_name, op_name, extra_message - ) - ) - if convert_dtype(input_dtype) in ['uint16'] and op_name not in [ - 'reshape', - 'lookup_table', - 'scale', - ]: - warnings.warn( - "The data type of '{}' in {} only support bfloat16 in OneDNN now. {}".format( - input_name, op_name, extra_message - ) - ) + if convert_dtype(input_dtype) not in expected_dtype: raise TypeError( "The data type of '{}' in {} must be {}, but received {}. {}".format( diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py index e5b513831afb8..b78c19ca6cd00 100755 --- a/python/paddle/base/executor.py +++ b/python/paddle/base/executor.py @@ -1894,13 +1894,7 @@ def _run_pir_impl( "Please ensure you create model correctly or you can pass " "the Program or the CompiledProgram manually." ) - else: - error_info = ( - "There are no operators in the program to be executed. " - "If you pass Program manually, please use base.program_guard " - "to ensure the current Program is being used." - ) - warnings.warn(error_info) + warnings.warn(error_info) if scope is None: scope = global_scope() diff --git a/python/paddle/incubate/autograd/primapi.py b/python/paddle/incubate/autograd/primapi.py index cc57f930de4a7..723c8b11d6fd8 100644 --- a/python/paddle/incubate/autograd/primapi.py +++ b/python/paddle/incubate/autograd/primapi.py @@ -266,7 +266,7 @@ def to_prim( blacklist = prim_config["forward_blacklist"] | blacklist with framework.program_guard(main_program): - print("Lowering composite forward ops begin...", flush=True) + logging.info("Lowering composite forward ops begin...") if len(blacklist) > 0 and len(whitelist) > 0: filter_ = lambda x: x.type in whitelist and x.type not in blacklist @@ -283,6 +283,4 @@ def to_prim( backward_length=backward_length, ) replace_ops = prim_config["composite_ops_record"] - print( - f"Lowering composite forward ops finish: {replace_ops}", flush=True - ) + logging.info(f"Lowering composite forward ops finish: {replace_ops}") From ab3f0f86d9ebef35eb5aa7d4001b5d927fa0a7b4 Mon Sep 17 00:00:00 2001 From: megemini Date: Tue, 10 Oct 2023 11:04:28 +0800 Subject: [PATCH 56/62] [Fix] fix float patch as string beginning (#57806) --- tools/sampcd_processor.py | 2 +- tools/test_sampcd_processor.py | 406 ++++++++++----------------------- 2 files changed, 122 insertions(+), 286 deletions(-) diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py index ef5bc0ca45cfa..23044c127625b 100644 --- a/tools/sampcd_processor.py +++ b/tools/sampcd_processor.py @@ -102,7 +102,7 @@ def _patch_float_precision(digits): pattern_number = re.compile( r""" (?: - (?<=[\s*\[\(\'\"\:]) # number starts + (?:(?<=[\s*\[\(\'\"\:])|^) # number starts (?: # int/float or complex-real (?: [+-]? 
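The one-character regex change above matters because a lookbehind can never succeed at position 0 of the string, so a float at the very beginning of the checked doctest output was previously skipped. A self-contained sketch with a simplified pattern (plain signed floats only; the real pattern above also covers exponents and complex numbers):

    import re

    DELIMS = r"[\s*\[\(\'\"\:]"
    old = re.compile(rf"(?<={DELIMS})[+-]?\d+\.\d*")
    new = re.compile(rf"(?:(?<={DELIMS})|^)[+-]?\d+\.\d*")

    print(old.findall("7.0 5.0"))  # ['5.0']        -- leading float missed
    print(new.findall("7.0 5.0"))  # ['7.0', '5.0'] -- '^' also matches at the start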
diff --git a/tools/test_sampcd_processor.py b/tools/test_sampcd_processor.py index 714685062359f..c6951bc4ec5d5 100644 --- a/tools/test_sampcd_processor.py +++ b/tools/test_sampcd_processor.py @@ -881,53 +881,7 @@ def test_patch_xdoctest(self): [1.94591032, 2.07944156, 2.1972246]]) """, - } - - test_results = get_test_results(doctester, docstrings_to_test) - self.assertEqual(len(test_results), 9) - - tr_0, tr_1, tr_2, tr_3, tr_4, tr_5, tr_6, tr_7, tr_8 = test_results - - self.assertIn('gpu_to_gpu', tr_0.name) - self.assertTrue(tr_0.passed) - - self.assertIn('cpu_to_cpu', tr_1.name) - self.assertTrue(tr_1.passed) - - self.assertIn('gpu_to_cpu', tr_2.name) - self.assertTrue(tr_2.passed) - - self.assertIn('cpu_to_gpu', tr_3.name) - self.assertTrue(tr_3.passed) - - self.assertIn('gpu_to_cpu_array', tr_4.name) - self.assertTrue(tr_4.passed) - - self.assertIn('cpu_to_gpu_array', tr_5.name) - self.assertTrue(tr_5.passed) - - self.assertIn('mass_array', tr_6.name) - self.assertTrue(tr_6.passed) - - self.assertIn('float_array', tr_7.name) - self.assertTrue(tr_7.passed) - - self.assertIn('float_array_diff', tr_8.name) - self.assertTrue(tr_8.passed) - - # reload xdoctest.checker - importlib.reload(xdoctest.checker) - - _clear_environ() - - test_capacity = {'cpu'} - doctester = Xdoctester( - style='freeform', target='codeblock', patch_float_precision=None - ) - doctester.prepare(test_capacity) - - docstrings_to_test = { - 'gpu_to_gpu': """ + 'float_begin': """ placeholder Examples: @@ -937,15 +891,11 @@ def test_patch_xdoctest(self): this is some blabla... - >>> import paddle - >>> paddle.device.set_device('gpu') - >>> a = paddle.to_tensor(.123456789) - >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, - [0.123456780]) + >>> print(7.0) + 7. """, - 'cpu_to_cpu': """ + 'float_begin_long': """ placeholder Examples: @@ -955,15 +905,11 @@ def test_patch_xdoctest(self): this is some blabla... - >>> import paddle - >>> paddle.device.set_device('cpu') - >>> a = paddle.to_tensor(.123456789) - >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, - [0.123456780]) + >>> print(7.0000023) + 7.0000024 """, - 'gpu_to_cpu': """ + 'float_begin_more': """ placeholder Examples: @@ -973,15 +919,11 @@ def test_patch_xdoctest(self): this is some blabla... - >>> import paddle - >>> paddle.device.set_device('gpu') - >>> a = paddle.to_tensor(.123456789) - >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, - [0.123456780]) + >>> print(7.0, 5., 6.123456) + 7.0 5.0 6.123457 """, - 'cpu_to_gpu': """ + 'float_begin_more_diff': """ placeholder Examples: @@ -991,14 +933,11 @@ def test_patch_xdoctest(self): this is some blabla... - >>> import paddle - >>> paddle.device.set_device('cpu') - >>> a = paddle.to_tensor(.123456789) - >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, - [0.123456780]) + >>> print(7.0, 5., 6.123456) + 7.0 5.0 6.123457 + """, - 'gpu_to_cpu_array': """ + 'float_begin_more_brief': """ placeholder Examples: @@ -1008,16 +947,11 @@ def test_patch_xdoctest(self): this is some blabla... - >>> import paddle - >>> paddle.device.set_device('gpu') - >>> a = paddle.to_tensor([[1.123456789 ,2,3], [2,3,4], [3,4,5]]) - >>> print(a) - Tensor(shape=[3, 3], dtype=float32, place=Place(cpu), stop_gradient=True, - [[1.123456780, 2., 3.], - [2., 3., 4.], - [3., 4., 5.]]) + >>> print(7.0, 5., 6.123456) + 7. 5. 
6.123457 + """, - 'cpu_to_gpu_array': """ + 'float_begin_fail': """ placeholder Examples: @@ -1027,106 +961,109 @@ def test_patch_xdoctest(self): this is some blabla... - >>> import paddle - >>> paddle.device.set_device('cpu') - >>> a = paddle.to_tensor([[1.123456789,2,3], [2,3,4], [3,4,5]]) - >>> print(a) - Tensor(shape=[3, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - [[1.123456780, 2., 3.], - [2., 3., 4.], - [3., 4., 5.]]) + >>> print(7.0100023) + 7.0000024 + """, - 'mass_array': """ - placeholder + } - Examples: + test_results = get_test_results(doctester, docstrings_to_test) + self.assertEqual(len(test_results), 15) - .. code-block:: python - :name: code-example-1 + ( + tr_0, + tr_1, + tr_2, + tr_3, + tr_4, + tr_5, + tr_6, + tr_7, + tr_8, + tr_9, + tr_10, + tr_11, + tr_12, + tr_13, + tr_14, + ) = test_results - this is some blabla... + self.assertIn('gpu_to_gpu', tr_0.name) + self.assertTrue(tr_0.passed) - >>> import paddle - >>> paddle.device.set_device('gpu') - >>> a = paddle.to_tensor( - ... [[1.123456780, 2., -3, .3], - ... [2, 3, +4., 1.2+10.34e-5j], - ... [3, 5.e-3, 1e2, 3e-8]] - ... ) - >>> # Tensor(shape=[3, 4], dtype=complex64, place=Place(gpu:0), stop_gradient=True, - >>> # [[ (1.1234568357467651+0j) , - >>> # (2+0j) , - >>> # (-3+0j) , - >>> # (0.30000001192092896+0j) ], - >>> # [ (2+0j) , - >>> # (3+0j) , - >>> # (4+0j) , - >>> # (1.2000000476837158+0.00010340000153519213j)], - >>> # [ (3+0j) , - >>> # (0.004999999888241291+0j) , - >>> # (100+0j) , - >>> # (2.999999892949745e-08+0j) ]]) - >>> print(a) - Tensor(shape=[3, 4], dtype=complex64, place=Place(AAA), stop_gradient=True, - [[ (1.123456+0j), - (2+0j), - (-3+0j), - (0.3+0j)], - [ (2+0j), - (3+0j), - (4+0j), - (1.2+0.00010340j)], - [ (3+0j), - (0.00499999+0j), - (100+0j), - (2.999999e-08+0j)]]) - """, - 'float_array': """ - placeholder + self.assertIn('cpu_to_cpu', tr_1.name) + self.assertTrue(tr_1.passed) - Examples: + self.assertIn('gpu_to_cpu', tr_2.name) + self.assertTrue(tr_2.passed) - .. code-block:: python - :name: code-example-1 + self.assertIn('cpu_to_gpu', tr_3.name) + self.assertTrue(tr_3.passed) - this is some blabla... + self.assertIn('gpu_to_cpu_array', tr_4.name) + self.assertTrue(tr_4.passed) - >>> import paddle - >>> paddle.device.set_device('cpu') - >>> x = [[2, 3, 4], [7, 8, 9]] - >>> x = paddle.to_tensor(x, dtype='float32') - >>> print(paddle.log(x)) - Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, - [[0.69314718, 1.09861231, 1.38629436], - [1.94591010, 2.07944155, 2.19722462]]) + self.assertIn('cpu_to_gpu_array', tr_5.name) + self.assertTrue(tr_5.passed) - """, - 'float_array_diff': """ - placeholder + self.assertIn('mass_array', tr_6.name) + self.assertTrue(tr_6.passed) - Examples: + self.assertIn('float_array', tr_7.name) + self.assertTrue(tr_7.passed) - .. code-block:: python - :name: code-example-1 + self.assertIn('float_array_diff', tr_8.name) + self.assertTrue(tr_8.passed) - this is some blabla... 
+ self.assertIn('float_begin', tr_9.name) + self.assertTrue(tr_9.passed) - >>> import paddle - >>> paddle.device.set_device('cpu') - >>> x = [[2, 3, 4], [7, 8, 9]] - >>> x = paddle.to_tensor(x, dtype='float32') - >>> print(paddle.log(x)) - Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, - [[0.69314712, 1.09861221, 1.386294], - [1.94591032, 2.07944156, 2.1972246]]) + self.assertIn('float_begin_long', tr_10.name) + self.assertTrue(tr_10.passed) - """, - } + self.assertIn('float_begin_more', tr_11.name) + self.assertTrue(tr_11.passed) + + self.assertIn('float_begin_more_diff', tr_12.name) + self.assertTrue(tr_12.passed) + + self.assertIn('float_begin_more_brief', tr_13.name) + self.assertTrue(tr_13.passed) + + self.assertIn('float_begin_fail', tr_14.name) + self.assertFalse(tr_14.passed) + + # reload xdoctest.checker + importlib.reload(xdoctest.checker) + + _clear_environ() + + test_capacity = {'cpu'} + doctester = Xdoctester( + style='freeform', target='codeblock', patch_float_precision=None + ) + doctester.prepare(test_capacity) test_results = get_test_results(doctester, docstrings_to_test) - self.assertEqual(len(test_results), 9) + self.assertEqual(len(test_results), 15) - tr_0, tr_1, tr_2, tr_3, tr_4, tr_5, tr_6, tr_7, tr_8 = test_results + ( + tr_0, + tr_1, + tr_2, + tr_3, + tr_4, + tr_5, + tr_6, + tr_7, + tr_8, + tr_9, + tr_10, + tr_11, + tr_12, + tr_13, + tr_14, + ) = test_results self.assertIn('gpu_to_gpu', tr_0.name) self.assertFalse(tr_0.passed) @@ -1155,6 +1092,24 @@ def test_patch_xdoctest(self): self.assertIn('float_array_diff', tr_8.name) self.assertFalse(tr_8.passed) + self.assertIn('float_begin', tr_9.name) + self.assertFalse(tr_9.passed) + + self.assertIn('float_begin_long', tr_10.name) + self.assertFalse(tr_10.passed) + + self.assertIn('float_begin_more', tr_11.name) + self.assertFalse(tr_11.passed) + + self.assertIn('float_begin_more_diff', tr_12.name) + self.assertFalse(tr_12.passed) + + self.assertIn('float_begin_more_brief', tr_13.name) + self.assertFalse(tr_13.passed) + + self.assertIn('float_begin_fail', tr_14.name) + self.assertFalse(tr_14.passed) + def test_run_cpu(self): _clear_environ() @@ -1521,62 +1476,6 @@ def test_style_google(self): doctester = Xdoctester(style='google', target='codeblock') doctester.prepare(test_capacity) - docstrings_to_test = { - 'one_plus_one': """ - placeholder - - .. code-block:: python - :name: code-example-0 - - this is some blabla... - - >>> # doctest: +SKIP('skip') - >>> print(1+1) - 2 - - Examples: - - .. code-block:: python - :name: code-example-1 - - this is some blabla... - - >>> # doctest: +REQUIRES(env:CPU) - >>> print(1-1) - 0 - - Examples: - - .. code-block:: python - :name: code-example-2 - - >>> print(1+2) - 3 - """, - 'one_minus_one': """ - placeholder - - Examples: - - .. code-block:: python - :name: code-example-1 - - this is some blabla... - - >>> # doctest: +REQUIRES(env:GPU) - >>> print(1-1) - 0 - - Examples: - - .. code-block:: python - :name: code-example-2 - - >>> print(1+1) - 3 - """, - } - test_results = get_test_results(doctester, docstrings_to_test) self.assertEqual(len(test_results), 4) @@ -1849,27 +1748,6 @@ def test_no_code(self): doctester = Xdoctester(style='google', target='codeblock') doctester.prepare(test_capacity) - docstrings_to_test = { - 'one_plus_one': """ - placeholder - - .. code-block:: python - :name: code-example-0 - - this is some blabla... 
- - >>> # doctest: +SKIP('skip') - >>> print(1+1) - 2 - """, - 'one_minus_one': """ - placeholder - - Examples: - - """, - } - test_results = get_test_results(doctester, docstrings_to_test) self.assertEqual(len(test_results), 0) @@ -1879,27 +1757,6 @@ def test_no_code(self): doctester = Xdoctester(style='freeform', target='docstring') doctester.prepare(test_capacity) - docstrings_to_test = { - 'one_plus_one': """ - placeholder - - .. code-block:: python - :name: code-example-0 - - this is some blabla... - - >>> # doctest: +SKIP('skip') - >>> print(1+1) - 2 - """, - 'one_minus_one': """ - placeholder - - Examples: - - """, - } - test_results = get_test_results(doctester, docstrings_to_test) self.assertEqual(len(test_results), 2) @@ -1925,27 +1782,6 @@ def test_no_code(self): doctester = Xdoctester(style='freeform', target='codeblock') doctester.prepare(test_capacity) - docstrings_to_test = { - 'one_plus_one': """ - placeholder - - .. code-block:: python - :name: code-example-0 - - this is some blabla... - - >>> # doctest: +SKIP('skip') - >>> print(1+1) - 2 - """, - 'one_minus_one': """ - placeholder - - Examples: - - """, - } - test_results = get_test_results(doctester, docstrings_to_test) self.assertEqual(len(test_results), 1) From f4b6c1f2f688bdc0d233d21487e34f8a959516d6 Mon Sep 17 00:00:00 2001 From: gouzil <66515297+gouzil@users.noreply.github.com> Date: Tue, 10 Oct 2023 11:25:20 +0800 Subject: [PATCH 57/62] [clang-tidy] clang-tidy script to repeat (#57868) --- tools/codestyle/clang-tidy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/codestyle/clang-tidy.py b/tools/codestyle/clang-tidy.py index ef1a5c76e1e43..d8f87d1a630d7 100644 --- a/tools/codestyle/clang-tidy.py +++ b/tools/codestyle/clang-tidy.py @@ -396,9 +396,9 @@ def main(): # Load the database and extract all files. database = json.load(open(os.path.join(build_path, db_path))) database = skip_check_file(database, build_path) - files = [ + files = { make_absolute(entry['file'], entry['directory']) for entry in database - ] + } max_task = args.j if max_task == 0: From a4528eb191bef9e556249e448bd70435eb9369d5 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Tue, 10 Oct 2023 11:27:15 +0800 Subject: [PATCH 58/62] [CleanOps]del diag op (#57895) * del diag op --- paddle/fluid/operators/diag_op.cc | 66 ----------------------------- paddle/fluid/operators/diag_op.cu | 23 ---------- paddle/fluid/operators/diag_op.h | 59 -------------------------- test/legacy_test/test_diag.py | 70 ------------------------------- tools/parallel_UT_rule.py | 2 - 5 files changed, 220 deletions(-) delete mode 100644 paddle/fluid/operators/diag_op.cc delete mode 100644 paddle/fluid/operators/diag_op.cu delete mode 100644 paddle/fluid/operators/diag_op.h delete mode 100644 test/legacy_test/test_diag.py diff --git a/paddle/fluid/operators/diag_op.cc b/paddle/fluid/operators/diag_op.cc deleted file mode 100644 index f7b2c4915662c..0000000000000 --- a/paddle/fluid/operators/diag_op.cc +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/diag_op.h" - -namespace paddle { -namespace operators { - -class DiagOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Diagonal"), "Input", "Diagonal", "diag"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "diag"); - - auto s_dims = ctx->GetInputDim("Diagonal"); - - PADDLE_ENFORCE_EQ( - s_dims.size(), - 1UL, - platform::errors::InvalidArgument( - "The dimension of 'diagonal' must be 1, but now it is %d.", - s_dims.size())); - - ctx->SetOutputDim("Out", {s_dims[0], s_dims[0]}); - } -}; - -class DiagOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Diagonal", - "Diagonal values of square matrix. It is a tensor with rank 1."); - AddOutput("Out", "A square matrix."); - AddComment(R"DOC( - Return a square matrix with specified diagonal values. -)DOC"); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - diag, - ops::DiagOp, - ops::DiagOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(diag, - ops::DiagKernel, - ops::DiagKernel, - ops::DiagKernel, - ops::DiagKernel); diff --git a/paddle/fluid/operators/diag_op.cu b/paddle/fluid/operators/diag_op.cu deleted file mode 100644 index c9afc983b03bb..0000000000000 --- a/paddle/fluid/operators/diag_op.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/diag_op.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(diag, - ops::DiagKernel, - ops::DiagKernel, - ops::DiagKernel, - ops::DiagKernel); diff --git a/paddle/fluid/operators/diag_op.h b/paddle/fluid/operators/diag_op.h deleted file mode 100644 index e3514e59e806d..0000000000000 --- a/paddle/fluid/operators/diag_op.h +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -struct DiagFunctor { - DiagFunctor(const T* diagonal, int64_t numel, T* output) - : diagonal_(diagonal), numel_(numel), output_(output) {} - - HOSTDEVICE void operator()(size_t idx) const { - output_[idx * numel_ + idx] = diagonal_[idx]; - } - - const T* diagonal_; - int64_t numel_; - T* output_; -}; - -template -class DiagKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* diagonal = context.Input("Diagonal"); - auto* diag_data = diagonal->data(); - auto numel = diagonal->numel(); - auto* out = context.Output("Out"); - T* out_data = out->mutable_data(context.GetPlace()); - - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, out, static_cast(0)); - - platform::ForRange for_range(dev_ctx, numel); - DiagFunctor functor(diag_data, numel, out_data); - for_range(functor); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/test/legacy_test/test_diag.py b/test/legacy_test/test_diag.py deleted file mode 100644 index 4f713488b8206..0000000000000 --- a/test/legacy_test/test_diag.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
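# Note on this removal (a sketch; the `paddle.diag` call below is the public
# API, not part of the deleted legacy code): only the old fluid `diag`
# operator and its test are dropped here; the Python-level API keeps working,
# e.g.
#     import paddle
#     paddle.diag(paddle.to_tensor([3.0, 4.0, 5.0]))  # 3x3 diagonal matrix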
- -import unittest - -import numpy as np -from op_test import OpTest - -import paddle -from paddle.base import Program, core, program_guard - - -class TestDiagOp(OpTest): - def setUp(self): - self.op_type = "diag" - self.init_config() - self.inputs = {'Diagonal': self.case} - - self.outputs = {'Out': np.diag(self.inputs['Diagonal'])} - - def test_check_output(self): - paddle.enable_static() - self.check_output() - - def init_config(self): - self.case = np.arange(3, 6) - - -class TestDiagOpCase1(TestDiagOp): - def init_config(self): - self.case = np.array([3], dtype='int32') - - -class TestDiagOpFp16(unittest.TestCase): - def test_fp16(self): - x_np = np.array([3], dtype='float16') - with paddle.static.program_guard(paddle.static.Program()): - x = paddle.static.data(shape=[1, 0], name='x', dtype='float16') - out = paddle.diag(x) - if core.is_compiled_with_cuda(): - place = paddle.CUDAPlace(0) - exe = paddle.static.Executor(place) - exe.run(paddle.static.default_startup_program()) - out = exe.run(feed={'x': x_np}, fetch_list=[out]) - - -class TestDiagError(unittest.TestCase): - def test_errors(self): - paddle.enable_static() - with program_guard(Program(), Program()): - - def test_diag_type(): - return paddle.diag(x=[1, 2, 3]) - - self.assertRaises(TypeError, test_diag_type) - - -if __name__ == "__main__": - unittest.main() diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index dbfdb276cf01a..8db28827b2830 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -871,7 +871,6 @@ 'test_retain_graph', 'test_network_with_dtype', 'test_basic_api_transformation', - 'test_diag', 'test_lod_array_length_op', 'test_reinforcement_learning', 'test_softmax_op', @@ -2835,7 +2834,6 @@ 'test_regularizer', 'test_sequence_reverse', 'test_shape_op', - 'test_diag', 'test_strided_slice_op', 'test_switch_case', 'test_isfinite_op', From 36bc339650225e7d21d53cc757745a4f7d57afe0 Mon Sep 17 00:00:00 2001 From: iLeGend <824040212@qq.com> Date: Tue, 10 Oct 2023 11:28:00 +0800 Subject: [PATCH 59/62] [xdoctest][task 232-235] reformat example code with google style in `python/paddle/distributed/*` (#57591) * [Doctest]fix No.232-235, test=docs_preview * fix format * add requires for rpc * fix typo * fix some * fix upcase --- python/paddle/distributed/rpc/rpc.py | 107 +++++---- .../distributed/sharding/group_sharded.py | 86 +++---- python/paddle/distributed/spawn.py | 141 ++++++------ .../transpiler/distribute_transpiler.py | 217 ++++++++++-------- 4 files changed, 290 insertions(+), 261 deletions(-) diff --git a/python/paddle/distributed/rpc/rpc.py b/python/paddle/distributed/rpc/rpc.py index ebe6bc54623d6..0d88c8fef1ce5 100644 --- a/python/paddle/distributed/rpc/rpc.py +++ b/python/paddle/distributed/rpc/rpc.py @@ -87,11 +87,13 @@ def init_rpc(name, rank=None, world_size=None, master_endpoint=None): Examples: .. code-block:: python - import paddle.distributed.rpc as rpc + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle.distributed.rpc as rpc - rpc.init_rpc("worker0", rank=0, world_size=1, - master_endpoint="127.0.0.1:8001") - rpc.shutdown() + >>> rpc.init_rpc("worker0", rank=0, world_size=1, + ... master_endpoint="127.0.0.1:8001") + + >>> rpc.shutdown() """ rank = int(os.environ["PADDLE_TRAINER_ID"]) if rank is None else rank @@ -161,15 +163,17 @@ def rpc_sync(to, fn, args=None, kwargs=None, timeout=_DEFAULT_RPC_TIMEOUT): Examples: .. 
code-block:: python - import paddle.distributed.rpc as rpc + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle.distributed.rpc as rpc + + >>> def add(a, b): + ... return a + b - def add(a, b): - return a + b + >>> rpc.init_rpc("worker0", rank=0, world_size=1, + ... master_endpoint="127.0.0.1:8002") - rpc.init_rpc("worker0", rank=0, world_size=1, - master_endpoint="127.0.0.1:8002") - ret = rpc.rpc_sync("worker0", add, args=(2, 3)) - rpc.shutdown() + >>> ret = rpc.rpc_sync("worker0", add, args=(2, 3)) + >>> rpc.shutdown() """ fut = _invoke_rpc(to, fn, args, kwargs, timeout) @@ -201,16 +205,20 @@ def rpc_async(to, fn, args=None, kwargs=None, timeout=_DEFAULT_RPC_TIMEOUT): Examples: .. code-block:: python - import paddle.distributed.rpc as rpc + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle.distributed.rpc as rpc + + >>> def add(a, b): + ... return a + b - def add(a, b): - return a + b + >>> rpc.init_rpc("worker0", rank=0, world_size=1, + ... master_endpoint="127.0.0.1:8003") - rpc.init_rpc("worker0", rank=0, world_size=1, - master_endpoint="127.0.0.1:8003") - fut = rpc.rpc_async("worker0", add, args=(2, 3)) - print(fut.wait()) - rpc.shutdown() + >>> fut = rpc.rpc_async("worker0", add, args=(2, 3)) + >>> print(fut.wait()) + 5 + + >>> rpc.shutdown() """ return _invoke_rpc(to, fn, args, kwargs, timeout) @@ -279,11 +287,13 @@ def shutdown(): Examples: .. code-block:: python - import paddle.distributed.rpc as rpc + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle.distributed.rpc as rpc + + >>> rpc.init_rpc("worker0", rank=0, world_size=1, + ... master_endpoint="127.0.0.1:8004") - rpc.init_rpc("worker0", rank=0, world_size=1, - master_endpoint="127.0.0.1:8004") - rpc.shutdown() + >>> rpc.shutdown() """ info = get_current_worker_info() @@ -309,17 +319,18 @@ class `WorkerInfo` with attribute `name`, `rank`, `ip` and `port`. Examples: .. code-block:: python - import paddle.distributed.rpc as rpc - import os + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle.distributed.rpc as rpc + >>> import os - os.environ["PADDLE_WORKER_ENDPOINT"] = "127.0.0.1:9002" - rpc.init_rpc("worker0", rank=0, world_size=1, - master_endpoint="127.0.0.1:8005") + >>> os.environ["PADDLE_WORKER_ENDPOINT"] = "127.0.0.1:9002" + >>> rpc.init_rpc("worker0", rank=0, world_size=1, + ... master_endpoint="127.0.0.1:8005") - print(rpc.get_worker_info("worker0")) - # {name: worker0, rank: 0, ip: 127.0.0.1, port: 9002} + >>> print(rpc.get_worker_info("worker0")) + {name: worker0, rank: 0, ip: 127.0.0.1, port: 9002} - rpc.shutdown() + >>> rpc.shutdown() """ return core.rpc_get_worker_info(name) @@ -335,17 +346,18 @@ def get_all_worker_infos(): Examples: .. code-block:: python - import paddle.distributed.rpc as rpc - import os + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle.distributed.rpc as rpc + >>> import os - os.environ["PADDLE_WORKER_ENDPOINT"] = "127.0.0.1:9003" - rpc.init_rpc("worker0", rank=0, world_size=1, - master_endpoint="127.0.0.1:8006") + >>> os.environ["PADDLE_WORKER_ENDPOINT"] = "127.0.0.1:9003" + >>> rpc.init_rpc("worker0", rank=0, world_size=1, + ... master_endpoint="127.0.0.1:8006") - print(rpc.get_all_worker_infos()) - # [{name: worker0, rank: 0, ip: 127.0.0.1, port: 9003}] + >>> print(rpc.get_all_worker_infos()) + [{name: worker0, rank: 0, ip: 127.0.0.1, port: 9003}] - rpc.shutdown() + >>> rpc.shutdown() """ return core.rpc_get_all_worker_infos() @@ -361,17 +373,18 @@ class `WorkerInfo` with attribute `name`, `rank`, `ip` and `port`. Examples: .. 
code-block:: python - import paddle.distributed.rpc as rpc - import os + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle.distributed.rpc as rpc + >>> import os - os.environ["PADDLE_WORKER_ENDPOINT"] = "127.0.0.1:9004" - rpc.init_rpc("worker0", rank=0, world_size=1, - master_endpoint="127.0.0.1:8007") + >>> os.environ["PADDLE_WORKER_ENDPOINT"] = "127.0.0.1:9004" + >>> rpc.init_rpc("worker0", rank=0, world_size=1, + ... master_endpoint="127.0.0.1:8007") - print(rpc.get_current_worker_info()) - # {name: worker0, rank: 0, ip: 127.0.0.1, port: 9004} + >>> print(rpc.get_current_worker_info()) + {name: worker0, rank: 0, ip: 127.0.0.1, port: 9004} - rpc.shutdown() + >>> rpc.shutdown() """ return core.rpc_get_current_worker_info() diff --git a/python/paddle/distributed/sharding/group_sharded.py b/python/paddle/distributed/sharding/group_sharded.py index 350f6eff4d001..b0f5ab0b629ca 100644 --- a/python/paddle/distributed/sharding/group_sharded.py +++ b/python/paddle/distributed/sharding/group_sharded.py @@ -77,32 +77,33 @@ def group_sharded_parallel( Examples: .. code-block:: python - # required: distributed - import paddle - from paddle.nn import Linear - from paddle.distributed import fleet - from paddle.distributed.sharding import group_sharded_parallel + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle + >>> from paddle.nn import Linear + >>> from paddle.distributed import fleet + >>> from paddle.distributed.sharding import group_sharded_parallel - fleet.init(is_collective=True) - group = paddle.distributed.new_group([0, 1]) - model = Linear(1000, 1000) + >>> fleet.init(is_collective=True) + >>> group = paddle.distributed.new_group([0, 1]) + >>> model = Linear(1000, 1000) - clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) - optimizer = paddle.optimizer.AdamW(learning_rate=0.001, parameters=model.parameters(), weight_decay=0.00001, grad_clip=clip) + >>> clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) + >>> optimizer = paddle.optimizer.AdamW(learning_rate=0.001, parameters=model.parameters(), weight_decay=0.00001, grad_clip=clip) - # wrap sharding model, optimizer and scaler - model, optimizer, scaler = group_sharded_parallel(model, optimizer, "p_g", scaler=scaler) + >>> # wrap sharding model, optimizer and scaler + >>> model, optimizer, scaler = group_sharded_parallel(model, optimizer, "p_g", scaler=scaler) - img, label = data - label.stop_gradient = True - img.stop_gradient = True + >>> img, label = data + >>> label.stop_gradient = True + >>> img.stop_gradient = True - out = model(img) - loss = paddle.nn.functional.cross_entropy(input=out, label=label) + >>> out = model(img) + >>> loss = paddle.nn.functional.cross_entropy(input=out, label=label) + + >>> loss.backward() + >>> optimizer.step() + >>> optimizer.clear_grad() - loss.backward() - optimizer.step() - optimizer.clear_grad() """ device = paddle.get_device().split(":")[0] @@ -195,35 +196,36 @@ def save_group_sharded_model(model, output, optimizer=None): Examples: .. 
code-block:: python - # required: distributed - import paddle - from paddle.nn import Linear - from paddle.distributed import fleet - from paddle.distributed.sharding import group_sharded_parallel, save_group_sharded_model + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle + >>> from paddle.nn import Linear + >>> from paddle.distributed import fleet + >>> from paddle.distributed.sharding import group_sharded_parallel, save_group_sharded_model + + >>> fleet.init(is_collective=True) + >>> group = paddle.distributed.new_group([0, 1]) + >>> model = Linear(1000, 1000) - fleet.init(is_collective=True) - group = paddle.distributed.new_group([0, 1]) - model = Linear(1000, 1000) + >>> clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) + >>> optimizer = paddle.optimizer.AdamW(learning_rate=0.001, parameters=model.parameters(), weight_decay=0.00001, grad_clip=clip) - clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) - optimizer = paddle.optimizer.AdamW(learning_rate=0.001, parameters=model.parameters(), weight_decay=0.00001, grad_clip=clip) + >>> # wrap sharding model, optimizer and scaler + >>> model, optimizer, scaler = group_sharded_parallel(model, optimizer, "p_g", scaler=scaler) - # wrap sharding model, optimizer and scaler - model, optimizer, scaler = group_sharded_parallel(model, optimizer, "p_g", scaler=scaler) + >>> img, label = data + >>> label.stop_gradient = True + >>> img.stop_gradient = True - img, label = data - label.stop_gradient = True - img.stop_gradient = True + >>> out = model(img) + >>> loss = paddle.nn.functional.cross_entropy(input=out, label=label) - out = model(img) - loss = paddle.nn.functional.cross_entropy(input=out, label=label) + >>> loss.backward() + >>> optimizer.step() + >>> optimizer.clear_grad() - loss.backward() - optimizer.step() - optimizer.clear_grad() + >>> # save model and optimizer state_dict + >>> save_group_sharded_model(model, optimizer, output=output_dir) - # save model and optimizer state_dict - save_group_sharded_model(model, optimizer, output=output_dir) """ logger_.info( "==========Begin to save group sharded model and optimizer==========" diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index 91039b3b3bac3..970afae464030 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -494,79 +494,74 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options): Examples: .. code-block:: python - import paddle - import paddle.nn as nn - import paddle.optimizer as opt - import paddle.distributed as dist - - class LinearNet(nn.Layer): - def __init__(self): - super().__init__() - self._linear1 = nn.Linear(10, 10) - self._linear2 = nn.Linear(10, 1) - - def forward(self, x): - return self._linear2(self._linear1(x)) - - def train(print_result=False): - # 1. initialize parallel environment - group = dist.init_parallel_env() - process_group = group.process_group if group else None - - # 2. create data parallel layer & optimizer - layer = LinearNet() - dp_layer = paddle.DataParallel(layer, group = process_group) - - loss_fn = nn.MSELoss() - adam = opt.Adam( - learning_rate=0.001, parameters=dp_layer.parameters()) - - # 3. run layer - inputs = paddle.randn([10, 10], 'float32') - outputs = dp_layer(inputs) - labels = paddle.randn([10, 1], 'float32') - loss = loss_fn(outputs, labels) - - if print_result is True: - print("loss:", loss.numpy()) - - loss.backward() - - adam.step() - adam.clear_grad() - - # Usage 1: only pass function. 
-            # If your training method no need any argument, and
-            # use all visible devices for parallel training.
-            if __name__ == '__main__':
-                dist.spawn(train)
-
-            # Usage 2: pass function and arguments.
-            # If your training method need some arguments, and
-            # use all visible devices for parallel training.
-            if __name__ == '__main__':
-                dist.spawn(train, args=(True,))
-
-            # Usage 3: pass function, arguments and nprocs.
-            # If your training method need some arguments, and
-            # only use part of visible devices for parallel training.
-            # If your machine hold 8 cards {0,1,2,3,4,5,6,7},
-            # this case will use cards {0,1}; If you set
-            # CUDA_VISIBLE_DEVICES=4,5,6,7, this case will use
-            # cards {4,5}
-            if __name__ == '__main__':
-                dist.spawn(train, args=(True,), nprocs=2)
-
-            # Usage 4: pass function, arguments, nprocs and gpus.
-            # If your training method need some arguments, and
-            # only use part of visible devices for parallel training,
-            # but you can't set your machine's environment variable
-            # CUDA_VISIBLE_DEVICES, such as it is None or all cards
-            # {0,1,2,3,4,5,6,7}, you can pass `gpus` to
-            # select the GPU cards you want to use. For example,
-            # this case will use cards {4,5} if your machine hold 8 cards.
-            if __name__ == '__main__':
-                dist.spawn(train, args=(True,), nprocs=2, gpus='4,5')
+            >>> # doctest: +REQUIRES(env:DISTRIBUTED)
+            >>> import paddle
+            >>> import paddle.nn as nn
+            >>> import paddle.optimizer as opt
+            >>> import paddle.distributed as dist
+
+            >>> class LinearNet(nn.Layer):
+            ...     def __init__(self):
+            ...         super().__init__()
+            ...         self._linear1 = nn.Linear(10, 10)
+            ...         self._linear2 = nn.Linear(10, 1)
+            ...     def forward(self, x):
+            ...         return self._linear2(self._linear1(x))
+
+            >>> def train(print_result=False):
+            ...     # 1. initialize parallel environment
+            ...     group = dist.init_parallel_env()
+            ...     process_group = group.process_group if group else None
+            ...     # 2. create data parallel layer & optimizer
+            ...     layer = LinearNet()
+            ...     dp_layer = paddle.DataParallel(layer, group = process_group)
+            ...     loss_fn = nn.MSELoss()
+            ...     adam = opt.Adam(
+            ...         learning_rate=0.001, parameters=dp_layer.parameters())
+            ...     # 3. run layer
+            ...     inputs = paddle.randn([10, 10], 'float32')
+            ...     outputs = dp_layer(inputs)
+            ...     labels = paddle.randn([10, 1], 'float32')
+            ...     loss = loss_fn(outputs, labels)
+            ...     if print_result is True:
+            ...         print("loss:", loss.numpy())
+            ...     loss.backward()
+            ...     adam.step()
+            ...     adam.clear_grad()
+
+            >>> # Usage 1: only pass function.
+            >>> # If your training method needs no arguments, and
+            >>> # you want to use all visible devices for parallel training.
+            >>> if __name__ == '__main__':
+            ...     dist.spawn(train)
+
+            >>> # Usage 2: pass function and arguments.
+            >>> # If your training method needs some arguments, and
+            >>> # you want to use all visible devices for parallel training.
+            >>> if __name__ == '__main__':
+            ...     dist.spawn(train, args=(True,))
+
+            >>> # Usage 3: pass function, arguments and nprocs.
+            >>> # If your training method needs some arguments, and
+            >>> # you only want to use part of the visible devices.
+            >>> # If your machine holds 8 cards {0,1,2,3,4,5,6,7},
+            >>> # this case will use cards {0,1}; if you set
+            >>> # CUDA_VISIBLE_DEVICES=4,5,6,7, this case will use
+            >>> # cards {4,5}
+            >>> if __name__ == '__main__':
+            ...     dist.spawn(train, args=(True,), nprocs=2)
+
+            >>> # Usage 4: pass function, arguments, nprocs and gpus.
+ >>> # If your training method need some arguments, and + >>> # only use part of visible devices for parallel training, + >>> # but you can't set your machine's environment variable + >>> # CUDA_VISIBLE_DEVICES, such as it is None or all cards + >>> # {0,1,2,3,4,5,6,7}, you can pass `gpus` to + >>> # select the GPU cards you want to use. For example, + >>> # this case will use cards {4,5} if your machine hold 8 cards. + >>> if __name__ == '__main__': + ... dist.spawn(train, args=(True,), nprocs=2, gpus='4,5') + """ # Give an error hint when the users enter a configuration option # that does not exist diff --git a/python/paddle/distributed/transpiler/distribute_transpiler.py b/python/paddle/distributed/transpiler/distribute_transpiler.py index 47929406ecde9..3d86d6dd9afce 100644 --- a/python/paddle/distributed/transpiler/distribute_transpiler.py +++ b/python/paddle/distributed/transpiler/distribute_transpiler.py @@ -175,13 +175,14 @@ class DistributeTranspilerConfig: Examples: .. code-block:: python - from paddle.distributed.transpiler.ps_dispatcher import RoundRobin - import paddle.distributed.transpiler as transpiler + >>> from paddle.distributed.transpiler.distribute_transpiler import RoundRobin + >>> import paddle.distributed.transpiler as transpiler + + >>> config = transpiler.DistributeTranspilerConfig() + >>> config.slice_var_up = True + >>> config.split_method = RoundRobin + >>> config.min_block_size = 81920 - config = transpiler.DistributeTranspilerConfig() - config.slice_var_up = True - config.split_method = RoundRobin - config.min_block_size = 81920 """ slice_var_up = True @@ -282,53 +283,57 @@ class DistributeTranspiler: Examples: .. code-block:: python - import paddle - import paddle.base as base - import paddle.distributed.transpiler as transpiler - - paddle.enable_static() - - x = paddle.static.data(name='x', shape=[1,13], dtype='float32') - y = paddle.static.data(name='y', shape=[1], dtype='float32') - y_predict = paddle.static.nn.fc(x, size=1, activation=None) - - cost =paddle.nn.functional.square_error_cost(input=y_predict, label=y) - avg_loss = paddle.mean(cost) - - sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001) - sgd_optimizer.minimize(avg_loss) - - # for pserver mode - pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" - trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174" - current_endpoint = "192.168.0.1:6174" - trainer_id = 0 - trainers = 4 - role = "PSERVER" - t = transpiler.DistributeTranspiler() - t.transpile( - trainer_id, pservers=pserver_endpoints, trainers=trainers) - if role == "PSERVER": - pserver_program = t.get_pserver_program(current_endpoint) - pserver_startup_program = t.get_startup_program(current_endpoint, - pserver_program) - elif role == "TRAINER": - trainer_program = t.get_trainer_program() - - # for nccl2 mode - trainer_num = 2 - trainer_id = 0 - config = transpiler.DistributeTranspilerConfig() - config.mode = "nccl2" - trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174" - t = transpiler.DistributeTranspiler(config=config) - t.transpile(trainer_id=trainer_id, trainers=trainer_endpoints, current_endpoint="192.168.0.1:6174") - exe = paddle.static.ParallelExecutor( - use_cuda=True, - loss_name=avg_loss.name, - num_trainers=trainer_num, - trainer_id=trainer_id - ) + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle + >>> import paddle.base as base + >>> import paddle.distributed.transpiler as transpiler + + >>> paddle.enable_static() + + >>> x = paddle.static.data(name='x', shape=[1,13], dtype='float32') + >>> y = 
paddle.static.data(name='y', shape=[1], dtype='float32') + >>> y_predict = paddle.static.nn.fc(x, size=1, activation=None) + + >>> cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y) + >>> avg_loss = paddle.mean(cost) + + >>> sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001) + >>> sgd_optimizer.minimize(avg_loss) + + >>> # for pserver mode + >>> pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" + >>> trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174" + >>> current_endpoint = "192.168.0.1:6174" + >>> trainer_id = 0 + >>> trainers = 4 + >>> role = "PSERVER" + + >>> t = transpiler.DistributeTranspiler() + >>> t.transpile( + ... trainer_id, pservers=pserver_endpoints, trainers=trainers) + + >>> if role == "PSERVER": + ... pserver_program = t.get_pserver_program(current_endpoint) + ... pserver_startup_program = t.get_startup_program(current_endpoint, + ... pserver_program) + ... elif role == "TRAINER": + ... trainer_program = t.get_trainer_program() + + >>> # for nccl2 mode + >>> trainer_num = 2 + >>> trainer_id = 0 + >>> config = transpiler.DistributeTranspilerConfig() + >>> config.mode = "nccl2" + >>> trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174" + >>> t = transpiler.DistributeTranspiler(config=config) + >>> t.transpile(trainer_id=trainer_id, trainers=trainer_endpoints, current_endpoint="192.168.0.1:6174") + >>> exe = paddle.static.ParallelExecutor( + ... use_cuda=True, + ... loss_name=avg_loss.name, + ... num_trainers=trainer_num, + ... trainer_id=trainer_id + ... ) + """ def __init__(self, config=None): @@ -609,13 +614,15 @@ def transpile( Examples: .. code-block:: python - transpiler = paddle.distributed.transpiler.DistributeTranspiler() - t.transpile( - trainer_id=0, - pservers="127.0.0.1:7000,127.0.0.1:7001", - trainers=2, - sync_mode=False, - current_endpoint="127.0.0.1:7000") + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> t = paddle.distributed.transpiler.DistributeTranspiler() + >>> t.transpile( + ... trainer_id=0, + ... pservers="127.0.0.1:7000,127.0.0.1:7001", + ... trainers=2, + ... sync_mode=False, + ... current_endpoint="127.0.0.1:7000") + """ from paddle.distributed.distribute_lookup_table import ( find_distributed_lookup_table, @@ -1127,14 +1134,17 @@ def get_trainer_program(self, wait_port=True): Examples: .. code-block:: python - import paddle.distributed.transpiler as transpiler - #this is an example, find available endpoints in your case - pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" - trainer_id = 0 - trainers = 4 - t = transpiler.DistributeTranspiler() - t.transpile(trainer_id, trainers=trainers, pservers=pserver_endpoints) - trainer_program = t.get_trainer_program() + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle.distributed.transpiler as transpiler + >>> # this is an example, find available endpoints in your case + >>> pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" + >>> trainer_id = 0 + >>> trainers = 4 + + >>> t = transpiler.DistributeTranspiler() + >>> t.transpile(trainer_id, trainers=trainers, pservers=pserver_endpoints) + >>> trainer_program = t.get_trainer_program() + """ # remove optimize ops and add a send op to main_program # FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay? @@ -1273,16 +1283,20 @@ def get_pserver_program(self, endpoint): Examples: .. 
code-block:: python - import paddle.distributed.transpiler as transpiler - #this is an example, find available endpoints in your case - pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" - current_endpoint = "192.168.0.1:6174" - trainer_id = 0 - trainers = 4 - t = transpiler.DistributeTranspiler() - t.transpile( - trainer_id, pservers=pserver_endpoints, trainers=trainers) - pserver_program = t.get_pserver_program(current_endpoint) + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle.distributed.transpiler as transpiler + >>> # this is an example, find available endpoints in your case + >>> pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" + >>> current_endpoint = "192.168.0.1:6174" + >>> trainer_id = 0 + >>> trainers = 4 + + >>> t = transpiler.DistributeTranspiler() + >>> t.transpile( + ... trainer_id, pservers=pserver_endpoints, trainers=trainers) + + >>> pserver_program = t.get_pserver_program(current_endpoint) + """ # TODO(panyx0718): Revisit this assumption. what if #blocks > #pservers. # NOTE: assume blocks of the same variable is not distributed @@ -1582,16 +1596,19 @@ def get_pserver_programs(self, endpoint): Examples: .. code-block:: python - import paddle.distributed.transpiler as transpiler - #this is an example, find available endpoints in your case - pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" - current_endpoint = "192.168.0.1:6174" - trainer_id = 0 - trainers = 4 - t = transpiler.DistributeTranspiler() - t.transpile( - trainer_id, pservers=pserver_endpoints, trainers=trainers) - pserver_program, pserver_startup_program = t.get_pserver_programs(current_endpoint) + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> import paddle.distributed.transpiler as transpiler + >>> # this is an example, find available endpoints in your case + >>> pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" + >>> current_endpoint = "192.168.0.1:6174" + >>> trainer_id = 0 + >>> trainers = 4 + + >>> t = transpiler.DistributeTranspiler() + >>> t.transpile( + ... trainer_id, pservers=pserver_endpoints, trainers=trainers) + >>> pserver_program, pserver_startup_program = t.get_pserver_programs(current_endpoint) + """ pserver_prog = self.get_pserver_program(endpoint) pserver_startup = self.get_startup_program( @@ -1621,17 +1638,19 @@ def get_startup_program( Examples: .. code-block:: python - pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" - trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174" - current_endpoint = "192.168.0.1:6174" - trainer_id = 0 - trainers = 4 - - t = paddle.distributed.transpiler.DistributeTranspiler() - t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) - pserver_program = t.get_pserver_program(current_endpoint) - pserver_startup_program = t.get_startup_program(current_endpoint, - pserver_program) + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" + >>> trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174" + >>> current_endpoint = "192.168.0.1:6174" + >>> trainer_id = 0 + >>> trainers = 4 + + >>> t = paddle.distributed.transpiler.DistributeTranspiler() + >>> t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) + >>> pserver_program = t.get_pserver_program(current_endpoint) + >>> pserver_startup_program = t.get_startup_program(current_endpoint, + ... 
pserver_program) + """ s_prog = Program() orig_s_prog = self.startup_program From d1a99fc10353158ca8b24315b8446cc3683fd268 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Tue, 10 Oct 2023 11:28:41 +0800 Subject: [PATCH 60/62] [PIR]Polish GroupOp and Interface code (#57829) * [PIR]Polish GroupOp and Interface code * fix comment --- .../hlir/dialect/operator/ir/manual_op.cc | 11 +++++----- .../cinn/hlir/dialect/operator/ir/manual_op.h | 4 ++-- .../fluid/pir/transforms/build_cinn_pass.cc | 2 +- paddle/pir/core/operation.cc | 1 + paddle/pir/core/value.cc | 4 +--- test/cpp/pir/cinn/CMakeLists.txt | 22 +++---------------- test/cpp/pir/cinn/build_cinn_pass_test.cc | 2 +- test/cpp/pir/cinn/group_op_test.cc | 6 ++--- 8 files changed, 18 insertions(+), 34 deletions(-) diff --git a/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc b/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc index abe2ca94b9690..db81b53a16f96 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc +++ b/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc @@ -32,15 +32,16 @@ void GroupOp::Build(pir::Builder &builder, argument.output_types = output_types; } -pir::Block *GroupOp::Block() { +pir::Block *GroupOp::block() { pir::Region ®ion = (*this)->region(0); if (region.empty()) region.emplace_back(); return region.front(); } -std::vector GroupOp::Ops() { - auto *block = this->Block(); - return std::vector(block->begin(), block->end()); +std::vector GroupOp::ops() { + auto *inner_block = this->block(); + return std::vector(inner_block->begin(), + inner_block->end()); } void GroupOp::Verify() {} @@ -54,7 +55,7 @@ void GroupOp::Print(pir::IrPrinter &printer) { os << " -> "; printer.PrintOpReturnType(op); os << " {"; - for (auto &sub_op : Ops()) { + for (auto &sub_op : ops()) { os << "\n"; printer.PrintOperation(sub_op); } diff --git a/paddle/cinn/hlir/dialect/operator/ir/manual_op.h b/paddle/cinn/hlir/dialect/operator/ir/manual_op.h index 0c8aa88e4fd2b..9d469d9f776c4 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/manual_op.h +++ b/paddle/cinn/hlir/dialect/operator/ir/manual_op.h @@ -33,8 +33,8 @@ class GroupOp : public pir::Op { pir::OperationArgument &argument, // NOLINT const std::vector &output_types); - pir::Block *Block(); - std::vector Ops(); + pir::Block *block(); + std::vector ops(); void Verify(); void Print(pir::IrPrinter &printer); // NOLINT diff --git a/paddle/fluid/pir/transforms/build_cinn_pass.cc b/paddle/fluid/pir/transforms/build_cinn_pass.cc index 4ad820fe03b6a..245e26cabad7e 100644 --- a/paddle/fluid/pir/transforms/build_cinn_pass.cc +++ b/paddle/fluid/pir/transforms/build_cinn_pass.cc @@ -559,7 +559,7 @@ void ReplaceWithGroupOp(pir::Block* block, } // step 2: Replace the old op with GroupOp. 
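   // Roughly, per GroupOp::Print above, the rewrite below yields a
   // single-region op wrapping the matched ops (names illustrative):
   //   (%out) = "cinn_op.group" () -> <result types> {
   //     ... the matched ops, moved into the group's block ...
   //   }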
auto new_group_op = builder.Build(output_types); - pir::Block* group_block = new_group_op.Block(); + pir::Block* group_block = new_group_op.block(); for (auto* op : group_ops) { op->MoveTo(group_block, group_block->begin()); } diff --git a/paddle/pir/core/operation.cc b/paddle/pir/core/operation.cc index 48f5ff85cd5ce..6a13963c93587 100644 --- a/paddle/pir/core/operation.cc +++ b/paddle/pir/core/operation.cc @@ -283,6 +283,7 @@ void Operation::SetParent(Block *parent, const Block::Iterator &position) { } void Operation::MoveTo(Block *block, Block::Iterator position) { + IR_ENFORCE(parent_, "Operation does not have parent"); Operation *op = parent_->Take(this); block->insert(position, op); } diff --git a/paddle/pir/core/value.cc b/paddle/pir/core/value.cc index 4e364e916c6c3..13b0b4a5cfee8 100644 --- a/paddle/pir/core/value.cc +++ b/paddle/pir/core/value.cc @@ -40,9 +40,7 @@ bool Value::operator!=(const Value &other) const { bool Value::operator!() const { return impl_ == nullptr; } -bool Value::operator<(const Value &other) const { - return std::hash{}(*this) < std::hash{}(other); -} +bool Value::operator<(const Value &other) const { return impl_ < other.impl_; } Value::operator bool() const { return impl_; } diff --git a/test/cpp/pir/cinn/CMakeLists.txt b/test/cpp/pir/cinn/CMakeLists.txt index 7952e53da32c0..6b984f3a03ae9 100644 --- a/test/cpp/pir/cinn/CMakeLists.txt +++ b/test/cpp/pir/cinn/CMakeLists.txt @@ -23,27 +23,11 @@ if(WITH_TESTING AND WITH_CINN) convert_to_dialect) set_tests_properties(test_jit_instruction PROPERTIES LABELS "RUN_TYPE=CINN") - cc_test_old( - test_group_op - SRCS - group_op_test.cc - DEPS - cinn_op_dialect - pir - phi - gtest - glog) + paddle_test(test_group_op SRCS group_op_test.cc DEPS cinn_op_dialect) set_tests_properties(test_group_op PROPERTIES LABELS "RUN_TYPE=CINN") - cc_test_old( - test_pir_build_cinn_pass - SRCS - build_cinn_pass_test.cc - DEPS - pd_build_cinn_pass - pir_pass - gtest - glog) + paddle_test(test_pir_build_cinn_pass SRCS build_cinn_pass_test.cc DEPS + pd_build_cinn_pass) set_tests_properties(test_pir_build_cinn_pass PROPERTIES LABELS "RUN_TYPE=CINN") endif() diff --git a/test/cpp/pir/cinn/build_cinn_pass_test.cc b/test/cpp/pir/cinn/build_cinn_pass_test.cc index 2d6d7b09db386..40fefeb3d2173 100644 --- a/test/cpp/pir/cinn/build_cinn_pass_test.cc +++ b/test/cpp/pir/cinn/build_cinn_pass_test.cc @@ -62,7 +62,7 @@ TEST(BuildCinnPassTest, AllOpSupportCinn) { CHECK_EQ(origin_program->block()->size(), 1u); pir::Operation* group_op = origin_program->block()->front(); pir::Block* group_block = - group_op->dyn_cast().Block(); + group_op->dyn_cast().block(); CHECK_EQ(group_block->size(), 6u); std::vector op_names = { diff --git a/test/cpp/pir/cinn/group_op_test.cc b/test/cpp/pir/cinn/group_op_test.cc index a5bd90a54f0f0..6e0f05a8cb244 100644 --- a/test/cpp/pir/cinn/group_op_test.cc +++ b/test/cpp/pir/cinn/group_op_test.cc @@ -51,7 +51,7 @@ std::shared_ptr<::pir::Program> BuildGroupProgram() { const std::vector shape = {64, 128}; auto group_op1 = builder.Build( CreateDenseTensorTypes(phi::make_ddim(shape))); - pir::Block* block1 = group_op1.Block(); + pir::Block* block1 = group_op1.block(); builder.SetInsertionPointToEnd(block1); auto full_op_x = builder.Build( shape, value_one, phi::DataType::FLOAT32, phi::GPUPlace()); @@ -60,7 +60,7 @@ std::shared_ptr<::pir::Program> BuildGroupProgram() { builder.SetInsertionPointToEnd(program->block()); auto group_op2 = builder.Build( CreateDenseTensorTypes(phi::make_ddim(shape))); - pir::Block* block2 = 
group_op2.Block(); + pir::Block* block2 = group_op2.block(); builder.SetInsertionPointToEnd(block2); auto tan_op_x = builder.Build(group_op1->result(0)); @@ -84,7 +84,7 @@ TEST(GroupOp, TestBuild) { int i = 0; for (auto* sub_op : *(program->block())) { EXPECT_TRUE(sub_op->isa()); - EXPECT_EQ(sub_op->dyn_cast().Ops().size(), + EXPECT_EQ(sub_op->dyn_cast().ops().size(), op_num[i]); ++i; } From af17aace669a00b0788c280f79d4d6594e631ee0 Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Tue, 10 Oct 2023 13:22:34 +0800 Subject: [PATCH 61/62] [Develop] Support detach of EagerParamBase in recompute (#57930) --- .../distributed/fleet/recompute/recompute.py | 26 +++++++- .../fleet/test_dygraph_recompute_for_eager.py | 65 ++++++++++++++++++- 2 files changed, 85 insertions(+), 6 deletions(-) mode change 100755 => 100644 python/paddle/distributed/fleet/recompute/recompute.py mode change 100755 => 100644 test/collective/fleet/test_dygraph_recompute_for_eager.py diff --git a/python/paddle/distributed/fleet/recompute/recompute.py b/python/paddle/distributed/fleet/recompute/recompute.py old mode 100755 new mode 100644 index 43e4dde69440a..080b56ac6478d --- a/python/paddle/distributed/fleet/recompute/recompute.py +++ b/python/paddle/distributed/fleet/recompute/recompute.py @@ -13,11 +13,13 @@ # limitations under the License. import contextlib +import copy import weakref import paddle from paddle import framework from paddle.autograd import PyLayer +from paddle.base.framework import EagerParamBase from paddle.distributed.fleet.meta_parallel.parallel_layers.random import ( get_rng_state_tracker, ) @@ -28,6 +30,15 @@ __all__ = [] +def _varbase_help(param): + state = copy.deepcopy(param.__dict__) + new_param = EagerParamBase( + shape=param.shape, dtype=param.dtype, name=param.name, **state + ) + param._share_buffer_to(new_param) + return new_param + + def detach_variable(inputs): out = [] for inp in inputs: @@ -38,14 +49,23 @@ def detach_variable(inputs): out.append(inp) continue + if isinstance(inp, EagerParamBase): + out.append(_varbase_help(inp)) + continue + if type(inp) is tuple: detach_inp = [] for i in inp: # detach all tensors in the tuple assert isinstance(i, core.eager.Tensor) - tmp_i = i.detach() - tmp_i.stop_gradient = i.stop_gradient - detach_inp.append(tmp_i) + + if isinstance(i, EagerParamBase): + detach_inp.append(_varbase_help(i)) + else: + tmp_i = i.detach() + tmp_i.stop_gradient = i.stop_gradient + detach_inp.append(tmp_i) + out.append(tuple(detach_inp)) continue diff --git a/test/collective/fleet/test_dygraph_recompute_for_eager.py b/test/collective/fleet/test_dygraph_recompute_for_eager.py old mode 100755 new mode 100644 index f54208639072d..288f69c03d933 --- a/test/collective/fleet/test_dygraph_recompute_for_eager.py +++ b/test/collective/fleet/test_dygraph_recompute_for_eager.py @@ -12,12 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import copy import random import unittest import numpy as np import paddle +from paddle.base.framework import EagerParamBase from paddle.distributed.fleet.utils import recompute @@ -54,6 +56,8 @@ def forward(self, x, pos=None): if pos is None: return self.block(x) else: + if isinstance(pos, tuple): + pos = pos[0] return self.block(x) + pos @@ -70,12 +74,14 @@ def __init__( segments=1, use_raw_recompute=False, recompute_kwargs={}, + raise_value_error=False, ): super().__init__() self.recompute_blocks = recompute_blocks self.recompute_kwargs = recompute_kwargs self.use_fleet_sq = use_fleet_sq self.use_raw_recompute = use_raw_recompute + self.raise_value_error = raise_value_error self.segments = segments self.runfunc0 = get_fc_block(0, input_size, is_last=False) @@ -120,13 +126,20 @@ def forward(self, inputs): inputs = recompute(self.layers[0], inputs) return self.layers[1](inputs) + recompute_kwargs = copy.deepcopy(self.recompute_kwargs) + + pos = ( + recompute_kwargs.pop("pos", None) + if not self.raise_value_error + else None + ) for i in range(len(self.layers)): if i in self.recompute_blocks: inputs = recompute( - self.layers[i], inputs, **self.recompute_kwargs + self.layers[i], inputs, pos, **recompute_kwargs ) else: - inputs = self.layers[i](inputs) + inputs = self.layers[i](inputs, pos) return inputs @@ -134,6 +147,7 @@ def forward(self, inputs): def run_model( recompute_block=[], recompute_kwargs={}, + raise_value_error=False, use_fleet_sq=False, use_raw_recompute=False, segments=1, @@ -153,6 +167,7 @@ def run_model( use_raw_recompute=use_raw_recompute, segments=segments, recompute_kwargs=recompute_kwargs, + raise_value_error=raise_value_error, ) if pure_fp16: @@ -302,7 +317,9 @@ def test_recompute_kwargs(self): kwargs = {"pos": pos, "use_reentrant": True} with self.assertRaises(ValueError): loss_ref, param_ref, grad_ref = run_model( - recompute_block=[2], recompute_kwargs=kwargs + recompute_block=[2], + recompute_kwargs=kwargs, + raise_value_error=True, ) kwargs = {"pos": pos, "use_reentrant": False} @@ -310,6 +327,48 @@ def test_recompute_kwargs(self): recompute_block=[2], recompute_kwargs=kwargs ) + def test_recompute_inputs_with_param(self): + pos = paddle.randn(shape=[10, 10], dtype="float32") + new_pos = EagerParamBase( + shape=pos.shape, dtype=pos.dtype, name=pos.name + ) + pos._share_buffer_to(new_pos) + new_pos.stop_gradient = False + + loss, param, grad = run_model( + recompute_block=[], recompute_kwargs={"pos": new_pos} + ) + + loss_ref, param_ref, grad_ref = run_model( + recompute_block=[1, 2, 3], recompute_kwargs={"pos": new_pos} + ) + + self.assertEqual(loss_ref, loss) + self.assertEqual(param_ref, param) + self.assertEqual(grad_ref, grad) + + def test_recompute_inputs_with_tuple(self): + pos = paddle.randn(shape=[10, 10], dtype="float32") + new_pos = EagerParamBase( + shape=pos.shape, dtype=pos.dtype, name=pos.name + ) + pos._share_buffer_to(new_pos) + pos.stop_gradient = False + new_pos.stop_gradient = False + + loss, param, grad = run_model( + recompute_block=[2, 4], recompute_kwargs={"pos": (pos,)} + ) + + loss_ref, param_ref, grad_ref = run_model( + recompute_block=[1, 2, 3], + recompute_kwargs={"pos": (new_pos,)}, + ) + + self.assertEqual(loss_ref, loss) + self.assertEqual(param_ref, param) + self.assertEqual(grad_ref, grad) + if __name__ == '__main__': unittest.main() From 7e4b34afb23c8f77f75e6a2d96767b2aa84ee818 Mon Sep 17 00:00:00 2001 From: liuzhenhai93 Date: Tue, 10 Oct 2023 13:26:13 +0800 Subject: [PATCH 62/62] Group sharded stage3 amp 02 (#57934) --- 
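For intuition, a minimal sketch of the `main_grad` convention this patch
teaches stage3 to handle (the helper below is illustrative, not patch code):
with pure-bf16 training, parameter data stays bf16 while gradients are
accumulated into a separate fp32 buffer, so sharding must gather and clear
`main_grad` instead of `grad` whenever the attribute is present.

    # illustrative only: fp32 side-buffer accumulation for a bf16 param
    import paddle

    def accumulate_main_grad(param, grad_bf16):
        # keep an fp32 accumulator so repeated bf16 adds do not lose precision
        if getattr(param, "main_grad", None) is None:
            param.main_grad = paddle.zeros(param.shape, dtype="float32")
        param.main_grad += grad_bf16.astype("float32")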
.../sharding/group_sharded_stage3.py | 75 ++++-- .../fleet/utils/hybrid_parallel_util.py | 2 +- test/collective/fleet/CMakeLists.txt | 15 ++ .../dygraph_group_sharded_stage3_bf16.py | 227 ++++++++++++++++++ .../test_dygraph_sharding_stage3_bf16.py | 26 ++ test/collective/fleet/testslist.csv | 1 + tools/gpups_test.sh | 1 + 7 files changed, 331 insertions(+), 16 deletions(-) create mode 100644 test/collective/fleet/dygraph_group_sharded_stage3_bf16.py create mode 100644 test/collective/fleet/test_dygraph_sharding_stage3_bf16.py diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py index 12c5ac37c8b10..8a61ab904cb30 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py @@ -176,6 +176,9 @@ def __init__( if "grad_clip" in item.keys(): item["grad_clip"] = self._optim._grad_clip + # check main_grad + self._check_main_grad() + # Synchronous all ranks models if pertrain_sync_models: self._sync_params_and_buffers() @@ -203,6 +206,16 @@ def __init__( self._redefine_opt_step() self._redefine_opt_clear() + def _check_main_grad(self): + self.use_main_grad = None + for param in self._layer.parameters(): + if self.use_main_grad is None and hasattr(param, "main_grad"): + self.use_main_grad = True + if self.use_main_grad: + assert hasattr( + param, "main_grad" + ), "Params have different main grad attributes." + @paddle.autograd.no_grad() def _sync_params_and_buffers(self): """ @@ -235,8 +248,11 @@ def _clear_gradients(self): assert hasattr( param, "fw_storage" ), f"Find {param.name} don't have fw_storage attribute." - - param.fw_storage.clear_gradient(False) + if self.use_main_grad: + param.fw_storage.main_grad._clear() + param.fw_storage.main_grad = None + else: + param.fw_storage.clear_gradient(False) param.bw_storage._clear() param.bw_storage = None # 2.Handle unslice param @@ -245,7 +261,12 @@ def _clear_gradients(self): grad_storage.buffer.zero_() else: for param in list(self._unslice_params): - param.clear_gradient(False) + if self.use_main_grad: + param.main_grad._clear() + param.main_grad = None + else: + param.clear_gradient(False) + if ( self._default_device in paddle.device.get_all_custom_device_type() @@ -350,7 +371,9 @@ def _handle_unslice_params(self): if param.dtype not in self._grad_storages.keys(): self._grad_storages[param.dtype] = GradStorage( buffer_size[param.dtype], - dtype=param.dtype, + dtype=param.dtype + if not self.use_main_grad + else paddle.float32, device=self._default_device, destination=self._rank, parm2align=self._unslice_params2align, @@ -596,8 +619,11 @@ def _update_params(self): ), f"Find {param.name} don't have fw_storage attribute" param.fw_storage = _TensorWrapper(param) - assert param.fw_storage.grad is None - param.fw_storage._copy_gradient_from(param.bw_storage) + if self.use_main_grad: + param.fw_storage.main_grad = param.bw_storage + else: + assert param.fw_storage.grad is None + param.fw_storage._copy_gradient_from(param.bw_storage) update_list.append(param) # 2.Handle unslice param @@ -617,9 +643,13 @@ def _update_params(self): for grad_storage in self._grad_storages.values(): for p in grad_storage._params: - tmp_g = _device2cpu(p.grad, convert_dtype=True) - p.clear_gradient(False) - p._copy_gradient_from(tmp_g) + if self.use_main_grad: + tmp_g = _device2cpu(p.main_grad, convert_dtype=True) + p.main_grad = tmp_g + else: + 
tmp_g = _device2cpu(p.grad, convert_dtype=True) + p.clear_gradient(False) + p._copy_gradient_from(tmp_g) del tmp_g grad_storage.buffer._clear() @@ -650,6 +680,7 @@ def get_all_parameters(self, convert2cpu=False): if convert2cpu: for param in trainable_params: t_flow.full_param[param.name][0]._share_buffer_to(param) + del t_flow.full_param[param.name] # a _allgather_buffer call should be matched with a _release_param call later, # but the _allgather_buffer call here has no match. @@ -708,7 +739,11 @@ def allreduce_(*_): param.bw_storage, full_grad._slice(start, end).detach().clone(), ) - param.clear_gradient(False) + + if self.use_main_grad: + param.main_grad = None + else: + param.clear_gradient(False) del self._task_flow.full_grad[param.name] if param.name in self._task_flow.full_param.keys(): @@ -726,6 +761,7 @@ def allreduce_(*_): del self._task_flow.full_param[param.name] if self._offload: + # revert back to cpu for offload update param.fw_storage._clear_data() param.master_weight._share_buffer_to(param.fw_storage) @@ -929,11 +965,14 @@ class TaskFlow: def __init__( self, + full_param={}, + full_grad={}, + use_calc={}, callback=None, ): - self.full_param = {} - self.full_grad = {} - self.use_calc = {} + self.full_param = full_param + self.full_grad = full_grad + self.use_calc = use_calc self.callback = callback @@ -1014,6 +1053,7 @@ def _allgather_buffer( continue if offload: + # convert to device for collective comm param.fw_storage = _cpu2device(param) buffer_size = param2buffer_size[param.name] @@ -1046,17 +1086,22 @@ def _allgather_buffer( @paddle.autograd.no_grad() def _create_params_grad(trainable_params, param2buffer_size, task_flow): for param in trainable_params: + use_main_grad = hasattr(param, "main_grad") if not param.trainable: continue if param.name in task_flow.full_grad.keys(): continue assert isinstance(param2buffer_size[param.name], int) temp_grad = paddle.zeros( - [param2buffer_size[param.name]], dtype=param.dtype + [param2buffer_size[param.name]], + dtype=param.dtype if not use_main_grad else paddle.float32, ) temp_tensor = temp_grad._slice(0, param._numel()) temp_tensor.get_tensor()._set_dims(param.shape) - param._copy_gradient_from(temp_tensor) + if use_main_grad: + param.main_grad = temp_tensor + else: + param._copy_gradient_from(temp_tensor) del temp_tensor task_flow.full_grad[param.name] = temp_grad return task_flow diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py index c68dfeefd2c60..86194c66016b2 100644 --- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py +++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py @@ -252,7 +252,7 @@ def fused_allreduce_gradients(parameter_list, hcg): scale = 1.0 if dp_enabled: group = hcg.get_data_parallel_group() - scale = group.nranks + scale = scale / group.nranks if sep_enabled: sep_group = hcg.get_sep_parallel_group() dp_sep_group = hcg.get_dp_sep_parallel_group() diff --git a/test/collective/fleet/CMakeLists.txt b/test/collective/fleet/CMakeLists.txt index 4e1a2a970d3e9..309acb6164007 100644 --- a/test/collective/fleet/CMakeLists.txt +++ b/test/collective/fleet/CMakeLists.txt @@ -134,6 +134,21 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) set_tests_properties(test_dygraph_sharding_stage3_for_eager PROPERTIES TIMEOUT "350") endif() +if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) + bash_test_modules( + test_dygraph_sharding_stage3_bf16 + START_BASH + ../../legacy_test/dist_test.sh + TIMEOUT + "200" + LABELS + 
"RUN_TYPE=DIST" + ENVS + "PADDLE_DIST_UT_PORT=22038;NVIDIA_TF32_OVERRIDE=0;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python" + ) + set_tests_properties(test_dygraph_sharding_stage3_bf16 PROPERTIES TIMEOUT + "200") +endif() if(WITH_NCCL) if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) py_test_modules( diff --git a/test/collective/fleet/dygraph_group_sharded_stage3_bf16.py b/test/collective/fleet/dygraph_group_sharded_stage3_bf16.py new file mode 100644 index 0000000000000..002426e94b0d2 --- /dev/null +++ b/test/collective/fleet/dygraph_group_sharded_stage3_bf16.py @@ -0,0 +1,227 @@ +# -*- coding: UTF-8 -*- + +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +import paddle +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import ( + GroupShardedStage3, +) +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import ( + GroupShardedScaler, +) +from paddle.distributed.fleet.utils import mix_precision_utils +from paddle.nn import Linear, ReLU + +seed = 2022 +epoch = 2 +linear_size = 1000 + +np.random.seed(seed) +paddle.seed(seed) + + +class MLP(paddle.nn.Layer): + def __init__(self, linear_size=1000): + super().__init__() + + self._linear1 = Linear(linear_size, 4 * linear_size) + self._linear2 = Linear(4 * linear_size, linear_size) + self._linear3 = Linear(linear_size, 10) + self._relu = ReLU() + + def forward(self, inputs): + y = self._linear1(inputs) + y = self._linear2(y) + y = self._linear3(y) + y = self._relu(y) + return y + + +class RandomDataset(paddle.io.Dataset): + def __init__(self, num_samples=200, linear_size=1000): + self.num_samples = num_samples + self.linear_size = linear_size + + def __getitem__(self, idx): + img = np.random.rand(self.linear_size).astype('float32') + return img + + def __len__(self): + return self.num_samples + + +def optimizer_setting(model, use_pure_bf16, use_main_grad): + if use_main_grad: + assert use_pure_bf16 + model = mix_precision_utils.MixPrecisionLayer(model, dtype="bfloat16") + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=0.00001, + weight_decay=0.00001, + grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0), + multi_precision=use_pure_bf16, + ) + if use_main_grad: + optimizer = mix_precision_utils.MixPrecisionOptimizer(optimizer) + + return optimizer + + +def train_mlp( + model, + sharding_stage, + use_pure_bf16=False, + accumulate_grad=False, + use_main_grad=False, + test_scaler=False, +): + if sharding_stage != "dp": + group = paddle.distributed.new_group([0, 1], backend="nccl") + scaler = None + if test_scaler: + assert sharding_stage == 2 + assert not accumulate_grad + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + scaler = GroupShardedScaler(scaler) + optimizer = optimizer_setting( + model=model, use_pure_bf16=use_pure_bf16, use_main_grad=use_main_grad + ) + if use_pure_bf16: + level = 'O2' + custom_white_list = None + model = 
paddle.amp.decorate(models=model, dtype="bfloat16", level=level) + else: + level = 'O1' + custom_white_list = [ + "matmul_v2", + "elementwise_add", + "relu", + "reduce_mean", + ] + + paddle.seed(2023) + np.random.seed(2023) + train_loader = paddle.io.DataLoader( + RandomDataset(), + batch_size=100, + shuffle=False, + drop_last=True, + num_workers=0, + ) + + if sharding_stage == 3: + model.to(device="gpu") + + if not use_pure_bf16: + for param in model.parameters(): + t = paddle.cast( + paddle.cast(param, dtype='bfloat16'), dtype='float32' + ) + param.set_value(t) + + if sharding_stage == 3: + model = GroupShardedStage3(model, optimizer, group=group) + else: + model = paddle.DataParallel(model) + + losses = [] + for eop in range(epoch): + model.train() + + for batch_id, data in enumerate(train_loader()): + data.stop_gradient = True + + with paddle.amp.auto_cast( + True, + level=level, + dtype="bfloat16", + custom_white_list=custom_white_list, + ): + out = model(data) + loss = paddle.mean(out) + + losses.append(loss) + + if test_scaler: + assert scaler is not None + scaler.scale(loss).backward() + scaler.step(optimizer) + scaler.update() + optimizer.clear_grad() + else: + loss.backward() + if not accumulate_grad: + optimizer.step() + optimizer.clear_grad() + + if accumulate_grad: + optimizer.step() + optimizer.clear_grad() + + return losses + + +def test_stage3_bf16(): + if not paddle.amp.is_bfloat16_supported(): + return + paddle.distributed.init_parallel_env() + mlp = MLP() + state_dict = mlp.state_dict() + + # stage3 bf16 O1 vs stage3 bf16 O2 main_grad + mlp1 = MLP() + mlp2 = MLP() + mlp1.set_state_dict(state_dict) + mlp2.set_state_dict(state_dict) + o1_losses = train_mlp(mlp1, sharding_stage=3, use_pure_bf16=False) + o2_losses = train_mlp( + mlp2, sharding_stage=3, use_pure_bf16=True, use_main_grad=True + ) + for i in range(len(o1_losses)): + o1_32_loss = paddle.cast(o1_losses[i], dtype='float32').detach() + o2_32_loss = paddle.cast(o2_losses[i], dtype='float32').detach() + np.testing.assert_array_equal(o1_32_loss, o2_32_loss) + + # grad accumulation test + mlp3 = MLP() + mlp4 = MLP() + mlp3.set_state_dict(state_dict) + mlp4.set_state_dict(state_dict) + o1_losses_grad_acc = train_mlp( + mlp3, sharding_stage=3, use_pure_bf16=False, accumulate_grad=True + ) + o2_losses_grad_acc = train_mlp( + mlp4, + sharding_stage=3, + use_pure_bf16=True, + use_main_grad=True, + accumulate_grad=True, + ) + for i in range(len(o2_losses_grad_acc)): + o2_loss_grad_acc = paddle.cast( + o2_losses_grad_acc[i], dtype='float32' + ).detach() + o1_loss_grad_acc = paddle.cast( + o1_losses_grad_acc[i], dtype='float32' + ).detach() + np.testing.assert_array_equal(o2_loss_grad_acc, o1_loss_grad_acc) + + return + + +if __name__ == '__main__': + test_stage3_bf16() diff --git a/test/collective/fleet/test_dygraph_sharding_stage3_bf16.py b/test/collective/fleet/test_dygraph_sharding_stage3_bf16.py new file mode 100644 index 0000000000000..f34191d848605 --- /dev/null +++ b/test/collective/fleet/test_dygraph_sharding_stage3_bf16.py @@ -0,0 +1,26 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from legacy_test.test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestDygraphShardingStage3(TestMultipleGpus): + def test_dygraph_sharding_stage3_bf16(self): + self.run_mnist_2gpu('dygraph_group_sharded_stage3_bf16.py') + + +if __name__ == "__main__": + unittest.main() diff --git a/test/collective/fleet/testslist.csv b/test/collective/fleet/testslist.csv index 43dd55c3754b3..664bb0bc8a502 100644 --- a/test/collective/fleet/testslist.csv +++ b/test/collective/fleet/testslist.csv @@ -11,6 +11,7 @@ test_rnn_dp,,GPU;XPU,,DIST,../../legacy_test/dist_test.sh,2,,http_proxy=;https_p test_parallel_dygraph_mp_layers,,GPU,120,DIST,../../legacy_test/dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL test_tcp_store,LINUX;APPLE,,,DIST,../../legacy_test/dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_dygraph_sharding_stage3_for_eager,,,350,DIST,../../legacy_test/dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_dygraph_sharding_stage3_bf16,,,200,DIST,../../legacy_test/dist_test.sh,2,,NVIDIA_TF32_OVERRIDE=0;http_proxy=;https_proxy=;PYTHONPATH=../.., test_communicator_half_async,,,120,DIST,test_runner.py,2,,FLAGS_communicator_send_queue_size=1;FLAGS_communicator_max_merge_var_num=1;http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL test_parallel_dygraph_pipeline_parallel,,GPU,500,DIST,../../legacy_test/dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_parallel_dygraph_pipeline_parallel_sync_send,,GPU;XPU,300,DIST,../../legacy_test/dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../..;PADDLE_P2P_SYNC_SEND=1, diff --git a/tools/gpups_test.sh b/tools/gpups_test.sh index fff44b872461e..a1e515355c9c0 100644 --- a/tools/gpups_test.sh +++ b/tools/gpups_test.sh @@ -59,6 +59,7 @@ parallel_list="^init_phi_test$|\ ^test_dist_fleet_ps11$|\ ^test_dist_fleet_ps12$|\ ^test_dygraph_sharding_stage2_bf16$|\ +^test_dygraph_sharding_stage3_bf16$|\ ^test_executor_feed_non_tensor$|\ ^test_flash_attention$|\ ^test_fused_adam_op$|\