From 1b295a6fc58c309713cd8341d5d3776456fd7092 Mon Sep 17 00:00:00 2001 From: micheldemessieres Date: Sun, 3 May 2020 20:12:50 -0400 Subject: [PATCH] Amesos2: Fix SuperLU TRSV for Cuda Also fix build for TRSV off. --- packages/amesos2/src/Amesos2_Superlu_def.hpp | 119 ++++++++++++------- 1 file changed, 76 insertions(+), 43 deletions(-) diff --git a/packages/amesos2/src/Amesos2_Superlu_def.hpp b/packages/amesos2/src/Amesos2_Superlu_def.hpp index 247f476f96c6..26a3bde53376 100644 --- a/packages/amesos2/src/Amesos2_Superlu_def.hpp +++ b/packages/amesos2/src/Amesos2_Superlu_def.hpp @@ -391,19 +391,34 @@ Superlu::solve_impl(const Teuchos::Ptr > Teuchos::TimeMonitor redistTimer( this->timers_.vecRedistTime_ ); #endif - Util::get_1d_copy_helper_kokkos_view, - host_solve_array_t>::do_get(B, host_bValues_, - as(ld_rhs), - (is_contiguous_ == true) ? ROOTED : CONTIGUOUS_AND_ROOTED, - this->rowIndexBase_); - // In general we may want to write directly to the x space without a copy. // So we 'get' x which may be a direct view assignment to the MV. - Util::get_1d_copy_helper_kokkos_view, - host_solve_array_t>::do_get(X, host_xValues_, - as(ld_rhs), - (is_contiguous_ == true) ? ROOTED : CONTIGUOUS_AND_ROOTED, - this->rowIndexBase_); + if(use_triangular_solves_) { // to device +#ifdef HAVE_AMESOS2_TRIANGULAR_SOLVES + Util::get_1d_copy_helper_kokkos_view, + device_solve_array_t>::do_get(X, device_xValues_, + as(ld_rhs), + (is_contiguous_ == true) ? ROOTED : CONTIGUOUS_AND_ROOTED, + this->rowIndexBase_); + Util::get_1d_copy_helper_kokkos_view, + device_solve_array_t>::do_get(B, device_bValues_, + as(ld_rhs), + (is_contiguous_ == true) ? ROOTED : CONTIGUOUS_AND_ROOTED, + this->rowIndexBase_); +#endif + } + else { // to host + Util::get_1d_copy_helper_kokkos_view, + host_solve_array_t>::do_get(X, host_xValues_, + as(ld_rhs), + (is_contiguous_ == true) ? ROOTED : CONTIGUOUS_AND_ROOTED, + this->rowIndexBase_); + Util::get_1d_copy_helper_kokkos_view, + host_solve_array_t>::do_get(B, host_bValues_, + as(ld_rhs), + (is_contiguous_ == true) ? ROOTED : CONTIGUOUS_AND_ROOTED, + this->rowIndexBase_); + } } // If equilibration was applied at numeric, then gssvx and gsisx are going to @@ -413,36 +428,28 @@ Superlu::solve_impl(const Teuchos::Ptr > // skip this. Generally need an API which tells us what happened internally // in above get_1d_copy_helper_kokkos_view - whether is was copy or assign. if(data_.equed != 'N') { - host_solve_array_t copyB(Kokkos::ViewAllocateWithoutInitializing("copyB"), - host_bValues_.extent(0), host_bValues_.extent(1)); - Kokkos::deep_copy(copyB, host_bValues_); - host_bValues_ = copyB; + if(use_triangular_solves_) { // to device +#ifdef HAVE_AMESOS2_TRIANGULAR_SOLVES + device_solve_array_t copyB(Kokkos::ViewAllocateWithoutInitializing("copyB"), + device_bValues_.extent(0), device_bValues_.extent(1)); + Kokkos::deep_copy(copyB, device_bValues_); + device_bValues_ = copyB; +#endif + } + else { + host_solve_array_t copyB(Kokkos::ViewAllocateWithoutInitializing("copyB"), + host_bValues_.extent(0), host_bValues_.extent(1)); + Kokkos::deep_copy(copyB, host_bValues_); + host_bValues_ = copyB; + } } int ierr = 0; // returned error code magnitude_type rpg, rcond; if ( this->root_ ) { - Kokkos::resize(data_.ferr, nrhs); - Kokkos::resize(data_.berr, nrhs); - - { -#ifdef HAVE_AMESOS2_TIMERS - Teuchos::TimeMonitor mvConvTimer(this->timers_.vecConvTime_); -#endif - SLU::Dtype_t dtype = type_map::dtype; - int i_ld_rhs = as(ld_rhs); - function_map::create_Dense_Matrix(&(data_.B), i_ld_rhs, as(nrhs), - host_bValues_.data(), i_ld_rhs, - SLU::SLU_DN, dtype, SLU::SLU_GE); - function_map::create_Dense_Matrix(&(data_.X), i_ld_rhs, as(nrhs), - host_xValues_.data(), i_ld_rhs, - SLU::SLU_DN, dtype, SLU::SLU_GE); - } - // Note: the values of B and X (after solution) are adjusted // appropriately within gssvx for row and column scalings. - { // Do solve! #ifdef HAVE_AMESOS2_TIMERS Teuchos::TimeMonitor solveTimer(this->timers_.solveTime_); @@ -452,6 +459,23 @@ Superlu::solve_impl(const Teuchos::Ptr > triangular_solve(); } else { + Kokkos::resize(data_.ferr, nrhs); + Kokkos::resize(data_.berr, nrhs); + + { + #ifdef HAVE_AMESOS2_TIMERS + Teuchos::TimeMonitor mvConvTimer(this->timers_.vecConvTime_); + #endif + SLU::Dtype_t dtype = type_map::dtype; + int i_ld_rhs = as(ld_rhs); + function_map::create_Dense_Matrix(&(data_.B), i_ld_rhs, as(nrhs), + host_bValues_.data(), i_ld_rhs, + SLU::SLU_DN, dtype, SLU::SLU_GE); + function_map::create_Dense_Matrix(&(data_.X), i_ld_rhs, as(nrhs), + host_xValues_.data(), i_ld_rhs, + SLU::SLU_DN, dtype, SLU::SLU_GE); + } + if(ILU_Flag_==false) { function_map::gssvx(&(data_.options), &(data_.A), data_.perm_c.data(), data_.perm_r.data(), @@ -475,16 +499,16 @@ Superlu::solve_impl(const Teuchos::Ptr > #endif &(data_.mem_usage), &(data_.stat), &ierr); } - } + // Cleanup X and B stores + SLU::Destroy_SuperMatrix_Store( &(data_.X) ); + SLU::Destroy_SuperMatrix_Store( &(data_.B) ); + data_.X.Store = NULL; + data_.B.Store = NULL; } - // Cleanup X and B stores - SLU::Destroy_SuperMatrix_Store( &(data_.X) ); - SLU::Destroy_SuperMatrix_Store( &(data_.B) ); - data_.X.Store = NULL; - data_.B.Store = NULL; - } + } // do solve + } // if ( this->root_ ) /* All processes should have the same error code */ Teuchos::broadcast(*(this->getComm()), 0, &ierr); @@ -507,6 +531,16 @@ Superlu::solve_impl(const Teuchos::Ptr > Teuchos::TimeMonitor redistTimer(this->timers_.vecRedistTime_); #endif + if(use_triangular_solves_) { // to device +#ifdef HAVE_AMESOS2_TRIANGULAR_SOLVES + Util::put_1d_data_helper_kokkos_view< + MultiVecAdapter,device_solve_array_t>::do_put(X, device_xValues_, + as(ld_rhs), + (is_contiguous_ == true) ? ROOTED : CONTIGUOUS_AND_ROOTED, + this->rowIndexBase_); +#endif + } + else { Util::put_1d_data_helper_kokkos_view< MultiVecAdapter,host_solve_array_t>::do_put(X, host_xValues_, as(ld_rhs), @@ -514,6 +548,8 @@ Superlu::solve_impl(const Teuchos::Ptr > this->rowIndexBase_); } + } + return(ierr); } @@ -901,9 +937,6 @@ void Superlu::triangular_solve() const { #ifdef HAVE_AMESOS2_TRIANGULAR_SOLVES - deep_copy_or_assign_view(device_xValues_, host_xValues_); - deep_copy_or_assign_view(device_bValues_, host_bValues_); - size_t ld_rhs = device_xValues_.extent(0); size_t nrhs = device_xValues_.extent(1);