Skip to content

Commit

Permalink
Merge 'trilinos/Trilinos:develop' (996ebb2) into 'tcad-charon/Trilino…
Browse files Browse the repository at this point in the history
…s:develop' (7cb81d9).

* trilinos-develop:
  Tempus: fix SSPERK54 bug and add error estimator
  Put pack the LD_LIBRARY_PATH to fix intel-18 test failures (ATDV-272)
  Intrepid2: Remove deprecation mascros (issue trilinos#7070)
  tpetra:  removing computation of unused fields lowerTriangular_, upperTriangular_, nodeNumDiags_, globalNumDiags_ for trilinos#7446, trilinos#2630, trilinos#2658
  Add if guard KOKKOS_ENABLE_CUDA_UVM
  Change COMPLEX to ADELUS_COMPLEX
  Add specialization for UVM
  Amesos2: Clean up Basker type handling
  Amesos2: Refactor Basker to use Kokkos views
  Use cuda host pinned memory for MPI_Irecv/MPI_Send
  Remove copy in solve and do some cleanups
  • Loading branch information
Jenkins Pipeline committed Jul 10, 2020
2 parents 7cb81d9 + 996ebb2 commit 6a142f4
Show file tree
Hide file tree
Showing 28 changed files with 803 additions and 1,480 deletions.
2 changes: 0 additions & 2 deletions cmake/std/atdm/cee-rhel6/environment.sh
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,6 @@ fi
# to be safe. Also, we need to set OMP_* env vars here because the SPARC
# modules change them!

atdm_remove_substrings_from_env_var LD_LIBRARY_PATH ":" "/usr/local/epd/canopy2/opt/Canopy/edm/envs/User/lib"

# Use updated Ninja and CMake
module load sems-env
module load sems-cmake/3.12.2
Expand Down
8 changes: 4 additions & 4 deletions packages/adelus/src/Adelus_defines.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@
#ifdef SREAL

#define ADELUS_DATA_TYPE float
#undef COMPLEX
#undef ADELUS_COMPLEX
#define ADELUS_MPI_DATA_TYPE MPI_FLOAT
#define ADELUS_MPI_DATA_TYPE2 MPI_FLOAT_INT

Expand All @@ -74,7 +74,7 @@
#elif defined(DREAL)

#define ADELUS_DATA_TYPE double
#undef COMPLEX
#undef ADELUS_COMPLEX
#define ADELUS_MPI_DATA_TYPE MPI_DOUBLE
#define ADELUS_MPI_DATA_TYPE2 MPI_DOUBLE_INT

Expand All @@ -94,7 +94,7 @@
float i;
} scomplex ;
#define ADELUS_DATA_TYPE scomplex
#define COMPLEX
#define ADELUS_COMPLEX
#define ADELUS_MPI_DATA_TYPE MPI_COMPLEX
#define ADELUS_MPI_DATA_TYPE2 MPI_FLOAT_INT

Expand All @@ -114,7 +114,7 @@
double i;
} dcomplex ;
#define ADELUS_DATA_TYPE dcomplex
#define COMPLEX
#define ADELUS_COMPLEX
#define ADELUS_MPI_DATA_TYPE MPI_DOUBLE_COMPLEX
#define ADELUS_MPI_DATA_TYPE2 MPI_DOUBLE_INT

Expand Down
13 changes: 10 additions & 3 deletions packages/adelus/src/Adelus_factor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ void factor(ZDView& ZV, // matrix and rhs
View1DHostPinnType h_row3 ( "h_row3", my_cols + blksz + nrhs );
#endif

Kokkos::fence();
//Kokkos::fence();
for (j=0; j<ncols_matrix; j++) {
c_owner = col_owner(j); r_owner = row_owner(j);
ringdist = mesh_col(me) - mesh_col(c_owner);
Expand Down Expand Up @@ -914,17 +914,19 @@ void factor(ZDView& ZV, // matrix and rhs

#ifdef GET_TIMING
totalfactortime = MPI_Wtime() - t2;
#endif
#ifdef GET_TIMING

localpivtime = iamaxtime+getlocalpivtime;
msgtime = xpivmsgtime+bcastpivstime+bcastpivrtime+bcastcolstime+bcastcolrtime+bcastrowtime+sendrowtime+recvrowtime;
copytime = pivotswaptime+copycoltime+copyrowtime+copyrow1time+copypivrowtime+copypivrow1time;
dgemmtime = updatetime+colupdtime+rowupdtime+scaltime;
#ifdef ADELUS_SHOW_TIMING_DETAILS
showtime("Time to do iamax",&iamaxtime);
showtime("Time to get local pivot",&getlocalpivtime);
#endif
showtime("Total finding local pivot time",&localpivtime);
double tmp = 100*localpivtime/totalfactortime;
showtime("Percent finding local pivot time",&tmp);
#ifdef ADELUS_SHOW_TIMING_DETAILS
showtime("Time to xchgpivot",&xpivmsgtime);
showtime("Time to do send in bcast pivot",&bcastpivstime);
showtime("Time to do recv in bcast pivot",&bcastpivrtime);
Expand All @@ -939,6 +941,7 @@ void factor(ZDView& ZV, // matrix and rhs
showtime("Time to bcast piv row",&bcastrowtime);
showtime("Time to send cur row",&sendrowtime);
showtime("Time to recv cur row",&recvrowtime);
#endif
showtime("Total msg passing time",&msgtime);
tmp = 100*msgtime/totalfactortime;
showtime("Percent msg passing time",&tmp);
Expand All @@ -947,18 +950,22 @@ void factor(ZDView& ZV, // matrix and rhs
tmp = 100*copyhostpinnedtime/totalfactortime;
showtime("Percent copy between host pinned mem and dev mem time",&tmp);
#endif
#ifdef ADELUS_SHOW_TIMING_DETAILS
showtime("Time to swap pivot",&pivotswaptime);
showtime("Time to copy cur col",&copycoltime);
showtime("Time to copy cur row to sav row",&copyrowtime);
showtime("Time to copy piv row to sav piv",&copypivrowtime);
showtime("Time to copy sav row to cur row",&copyrow1time);
showtime("Time to copy sav piv to piv row",&copypivrow1time);
#endif
showtime("Total copying time",&copytime);
tmp = 100*copytime/totalfactortime;
showtime("Percent copying time",&tmp);
#ifdef ADELUS_SHOW_TIMING_DETAILS
showtime("Time to scale cur col",&scaltime);
showtime("Time to update cur col",&colupdtime);
showtime("Time to update piv row",&rowupdtime);
#endif
showtime("Time to update matrix",&updatetime);
showtime("Total update time",&dgemmtime);
tmp = 100*dgemmtime/totalfactortime;
Expand Down
2 changes: 1 addition & 1 deletion packages/adelus/src/Adelus_perm1.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ namespace Adelus {
template<class XView, class YView>
void zcopy_ld_local_index(int N, XView& X, YView& Y) {
Kokkos::parallel_for(Kokkos::RangePolicy<typename XView::device_type::execution_space>(0,N), KOKKOS_LAMBDA (const int i) {
#ifdef COMPLEX
#ifdef ADELUS_COMPLEX
int lidx = (int)(X(N).real());
#else
int lidx = (int)(X(N));
Expand Down
99 changes: 38 additions & 61 deletions packages/adelus/src/Adelus_solve.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@
#include "Kokkos_Core.hpp"
#include "KokkosBlas3_gemm.hpp"

#define IBM_MPI_WRKAROUND2

extern int me;

extern int ncols_matrix; // number of cols in the matrix
Expand Down Expand Up @@ -113,7 +115,7 @@ void back_solve6(ZDView& ZV)
typedef typename ZDView::device_type::memory_space memory_space;
typedef Kokkos::View<value_type**, Kokkos::LayoutLeft, memory_space> ViewMatrixType;

#if defined(CUDA_HOST_PINNED_MPI) && defined(KOKKOS_ENABLE_CUDA)
#if (defined(CUDA_HOST_PINNED_MPI) || defined(IBM_MPI_WRKAROUND2)) && defined(KOKKOS_ENABLE_CUDA)
typedef Kokkos::View<value_type**, Kokkos::LayoutLeft, Kokkos::CudaHostPinnedSpace> View2DHostPinnType;//CudaHostPinnedSpace
#endif

Expand Down Expand Up @@ -143,15 +145,13 @@ void back_solve6(ZDView& ZV)
int my_col_id, my_row_id, id_temp;
int dest_right, dest_left;

int blas_length;

#ifdef GET_TIMING
double t1,t2;
double allocviewtime,eliminaterhstime,bcastrowtime,updrhstime,sendrhstime,recvrhstime,copyrhstime;
double allocviewtime,eliminaterhstime,bcastrowtime,updrhstime,xchgrhstime;
double totalsolvetime;
#if defined(CUDA_HOST_PINNED_MPI) && defined(KOKKOS_ENABLE_CUDA)
double copyhostpinnedtime;
#endif
//#if defined(CUDA_HOST_PINNED_MPI) && defined(KOKKOS_ENABLE_CUDA)
// double copyhostpinnedtime;
//#endif
#endif

MPI_Request msgrequest;
Expand Down Expand Up @@ -180,21 +180,22 @@ void back_solve6(ZDView& ZV)
max_bytes = max_bytes*sizeof(ADELUS_DATA_TYPE)*my_rows;

#ifdef GET_TIMING
allocviewtime=eliminaterhstime=bcastrowtime=updrhstime=sendrhstime=recvrhstime=copyrhstime=0.0;
#if defined(CUDA_HOST_PINNED_MPI) && defined(KOKKOS_ENABLE_CUDA)
copyhostpinnedtime=0.0;
#endif
#endif
allocviewtime=eliminaterhstime=bcastrowtime=updrhstime=xchgrhstime=0.0;
//#if defined(CUDA_HOST_PINNED_MPI) && defined(KOKKOS_ENABLE_CUDA)
// copyhostpinnedtime=0.0;
//#endif

#ifdef GET_TIMING
t1 = MPI_Wtime();
#endif

ViewMatrixType row1( "row1", one, nrhs ); // row1: diagonal row (temp variables)
#if (defined(CUDA_HOST_PINNED_MPI) || defined(IBM_MPI_WRKAROUND2)) && defined(KOKKOS_ENABLE_CUDA)
View2DHostPinnType h_row2( "h_row2", my_rows, max_bytes/sizeof(ADELUS_DATA_TYPE)/my_rows );
#else
ViewMatrixType row2( "row2", my_rows, max_bytes/sizeof(ADELUS_DATA_TYPE)/my_rows );

#if defined(CUDA_HOST_PINNED_MPI) && defined(KOKKOS_ENABLE_CUDA)
#endif
#if (defined(CUDA_HOST_PINNED_MPI) || defined(IBM_MPI_WRKAROUND2)) && defined(KOKKOS_ENABLE_CUDA)
View2DHostPinnType h_row1( "h_row1", one, nrhs );
View2DHostPinnType h_row2( "h_row2", my_rows, max_bytes/sizeof(ADELUS_DATA_TYPE)/my_rows );
View2DHostPinnType h_rhs ( "h_rhs", my_rows, nrhs );
#endif

Expand Down Expand Up @@ -309,76 +310,50 @@ void back_solve6(ZDView& ZV)
}
}

#ifdef GET_TIMING
t1 = MPI_Wtime();
#endif
if (j != 1-nprocs_row-extra) {
dest[0] = dest_right;
if (me != dest[0]) {
bytes[0] = max_bytes;
type[0] = SOROWTYPE+j;

#if defined(CUDA_HOST_PINNED_MPI) && defined(KOKKOS_ENABLE_CUDA)
#if (defined(CUDA_HOST_PINNED_MPI) || defined(IBM_MPI_WRKAROUND2)) && defined(KOKKOS_ENABLE_CUDA)
MPI_Irecv(reinterpret_cast<char *>(h_row2.data()), bytes[0], MPI_CHAR, MPI_ANY_SOURCE, type[0], MPI_COMM_WORLD, &msgrequest);
#else //CUDA-aware MPI
#else
MPI_Irecv(reinterpret_cast<char *>( row2.data()), bytes[0], MPI_CHAR, MPI_ANY_SOURCE, type[0], MPI_COMM_WORLD, &msgrequest);
#endif

n_rhs_this = bytes[0]/sizeof(ADELUS_DATA_TYPE)/my_rows;

#if defined(CUDA_HOST_PINNED_MPI) && defined(KOKKOS_ENABLE_CUDA)
#ifdef GET_TIMING
t1 = MPI_Wtime();
#endif
#if (defined(CUDA_HOST_PINNED_MPI) || defined(IBM_MPI_WRKAROUND2)) && defined(KOKKOS_ENABLE_CUDA)
Kokkos::deep_copy(subview(h_rhs, Kokkos::ALL(), Kokkos::make_pair(0, n_rhs_this)), subview(ZV, Kokkos::ALL(), Kokkos::make_pair(my_cols, my_cols+n_rhs_this)));
#ifdef GET_TIMING
copyhostpinnedtime += (MPI_Wtime()-t1);
#endif
#endif

#ifdef GET_TIMING
t1 = MPI_Wtime();
#endif
dest[1] = dest_left;
bytes[1] = n_rhs_this * sizeof(ADELUS_DATA_TYPE) * my_rows;
type[1] = SOROWTYPE+j;

#if defined(CUDA_HOST_PINNED_MPI) && defined(KOKKOS_ENABLE_CUDA)
#if (defined(CUDA_HOST_PINNED_MPI) || defined(IBM_MPI_WRKAROUND2)) && defined(KOKKOS_ENABLE_CUDA)
MPI_Send(reinterpret_cast<char *>(h_rhs.data()), bytes[1], MPI_CHAR, dest[1], type[1], MPI_COMM_WORLD);
#else //CUDA-aware MPI
MPI_Send(reinterpret_cast<char *>(ZV.data()+my_rows*my_cols), bytes[1], MPI_CHAR, dest[1], type[1], MPI_COMM_WORLD);
#endif
#ifdef GET_TIMING
sendrhstime += (MPI_Wtime()-t1);
#endif

#ifdef GET_TIMING
t1 = MPI_Wtime();
#endif
MPI_Wait(&msgrequest,&msgstatus);
#ifdef GET_TIMING
recvrhstime += (MPI_Wtime()-t1);
#endif

#if defined(CUDA_HOST_PINNED_MPI) && defined(KOKKOS_ENABLE_CUDA)
#ifdef GET_TIMING
t1 = MPI_Wtime();
#endif
Kokkos::deep_copy(row2,h_row2);
#ifdef GET_TIMING
copyhostpinnedtime += (MPI_Wtime()-t1);
#endif
#endif

#ifdef GET_TIMING
t1 = MPI_Wtime();
#endif
// Copy row2 -> rhs
blas_length = n_rhs_this*my_rows;
#ifdef KOKKOS_ENABLE_CUDA//Use memcpy for now, can use deep_copy in the future //deep_copy is slower than BLAS XCOPY
int blas_length = n_rhs_this*my_rows;
#if (defined(CUDA_HOST_PINNED_MPI) || defined(IBM_MPI_WRKAROUND2)) && defined(KOKKOS_ENABLE_CUDA)//Use memcpy for now, can use deep_copy in the future //deep_copy is slower than BLAS XCOPY
//Kokkos::deep_copy(subview(ZV, Kokkos::ALL(), Kokkos::make_pair(my_cols, my_cols+n_rhs_this)), subview(h_row2, Kokkos::ALL(), Kokkos::make_pair(0, n_rhs_this)));
cudaMemcpy(reinterpret_cast<ADELUS_DATA_TYPE *>(ZV.data()+my_rows*my_cols), reinterpret_cast<ADELUS_DATA_TYPE *>(h_row2.data()), blas_length*sizeof(ADELUS_DATA_TYPE), cudaMemcpyHostToDevice);
#else
#ifdef KOKKOS_ENABLE_CUDA
cudaMemcpy(reinterpret_cast<ADELUS_DATA_TYPE *>(ZV.data()+my_rows*my_cols), reinterpret_cast<ADELUS_DATA_TYPE *>(row2.data()), blas_length*sizeof(ADELUS_DATA_TYPE), cudaMemcpyDeviceToDevice);
#else
memcpy(reinterpret_cast<ADELUS_DATA_TYPE *>(ZV.data()+my_rows*my_cols), reinterpret_cast<ADELUS_DATA_TYPE *>(row2.data()), blas_length*sizeof(ADELUS_DATA_TYPE));
#endif
#ifdef GET_TIMING
copyrhstime += (MPI_Wtime()-t1);
#endif
}
on_col++;
Expand All @@ -387,6 +362,10 @@ void back_solve6(ZDView& ZV)
act_col--;
}
}
#ifdef GET_TIMING
xchgrhstime += (MPI_Wtime()-t1);
#endif

}

#ifdef GET_TIMING
Expand All @@ -397,12 +376,10 @@ void back_solve6(ZDView& ZV)
showtime("Time to eliminate rhs",&eliminaterhstime);
showtime("Time to bcast temp row",&bcastrowtime);
showtime("Time to update rhs",&updrhstime);
showtime("Time to send in rhs",&sendrhstime);
showtime("Time to recv rhs",&recvrhstime);
#if defined(CUDA_HOST_PINNED_MPI) && defined(KOKKOS_ENABLE_CUDA)
showtime("Time to copy host pinned mem <--> dev mem",&copyhostpinnedtime);
#endif
showtime("Time to copy rhs",&copyrhstime);
//#if defined(CUDA_HOST_PINNED_MPI) && defined(KOKKOS_ENABLE_CUDA)
// showtime("Time to copy host pinned mem <--> dev mem",&copyhostpinnedtime);
//#endif
showtime("Time to xchg rhs",&xchgrhstime);
showtime("Total time in solve",&totalsolvetime);
#endif
}
Expand Down
42 changes: 42 additions & 0 deletions packages/adelus/src/BlasWrapper_copy_spec.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,48 @@ BLASWRAPPER_CCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutStride, Kokkos::LayoutLeft
BLASWRAPPER_CCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutStride, Kokkos::LayoutRight, Kokkos::CudaSpace, false)
BLASWRAPPER_CCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutStride, Kokkos::LayoutStride, Kokkos::CudaSpace, false)

#if defined (KOKKOS_ENABLE_CUDA_UVM)
BLASWRAPPER_DCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_DCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_DCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::LayoutStride, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_DCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutRight, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_DCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_DCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutRight, Kokkos::LayoutStride, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_DCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutStride, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_DCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutStride, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_DCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutStride, Kokkos::LayoutStride, Kokkos::CudaUVMSpace, false)

BLASWRAPPER_SCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_SCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_SCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::LayoutStride, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_SCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutRight, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_SCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_SCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutRight, Kokkos::LayoutStride, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_SCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutStride, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_SCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutStride, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_SCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutStride, Kokkos::LayoutStride, Kokkos::CudaUVMSpace, false)

BLASWRAPPER_ZCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_ZCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_ZCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::LayoutStride, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_ZCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutRight, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_ZCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_ZCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutRight, Kokkos::LayoutStride, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_ZCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutStride, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_ZCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutStride, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_ZCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutStride, Kokkos::LayoutStride, Kokkos::CudaUVMSpace, false)

BLASWRAPPER_CCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_CCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_CCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::LayoutStride, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_CCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutRight, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_CCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_CCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutRight, Kokkos::LayoutStride, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_CCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutStride, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_CCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutStride, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false)
BLASWRAPPER_CCOPY_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutStride, Kokkos::LayoutStride, Kokkos::CudaUVMSpace, false)
#endif

}
}

Expand Down
Loading

0 comments on commit 6a142f4

Please sign in to comment.