diff --git a/cmake/ProjectCompilerPostConfig.cmake b/cmake/ProjectCompilerPostConfig.cmake index 2f47f7104de1..1cdedd56a667 100644 --- a/cmake/ProjectCompilerPostConfig.cmake +++ b/cmake/ProjectCompilerPostConfig.cmake @@ -43,7 +43,7 @@ IF (KokkosEnable) ENDIF() set(upcoming_warnings shadow ${Trilinos_ADDITIONAL_WARNINGS}) -set(promoted_warnings parentheses sign-compare unused-variable reorder) +set(promoted_warnings parentheses sign-compare unused-variable reorder uninitialized) if("${Trilinos_WARNINGS_MODE}" STREQUAL "WARN") enable_warnings("${upcoming_warnings}") diff --git a/cmake/TPLs/FindTPLBLAS.cmake b/cmake/TPLs/FindTPLBLAS.cmake index e2b802502c85..d726dac1ab88 100644 --- a/cmake/TPLs/FindTPLBLAS.cmake +++ b/cmake/TPLs/FindTPLBLAS.cmake @@ -22,4 +22,4 @@ if (MSVC AND NOT endif() tribits_tpl_find_include_dirs_and_libraries( BLAS - REQUIRED_LIBS_NAMES "blas blas_win32") + REQUIRED_LIBS_NAMES "blas blas_win32 openblas") diff --git a/cmake/TPLs/FindTPLLAPACK.cmake b/cmake/TPLs/FindTPLLAPACK.cmake index 8852cbf6b062..170962a309f8 100644 --- a/cmake/TPLs/FindTPLLAPACK.cmake +++ b/cmake/TPLs/FindTPLLAPACK.cmake @@ -16,4 +16,4 @@ if (MSVC AND NOT endif() tribits_tpl_find_include_dirs_and_libraries( LAPACK - REQUIRED_LIBS_NAMES "lapack lapack_win32") + REQUIRED_LIBS_NAMES "lapack lapack_win32 openblas") diff --git a/cmake/tribits/common_tpls/FindTPLBLAS.cmake b/cmake/tribits/common_tpls/FindTPLBLAS.cmake index 1ebe176a80c5..95360d3ebd0f 100644 --- a/cmake/tribits/common_tpls/FindTPLBLAS.cmake +++ b/cmake/tribits/common_tpls/FindTPLBLAS.cmake @@ -8,7 +8,7 @@ # @HEADER -set(REQUIRED_LIBS_NAMES "blas blas_win32") +set(REQUIRED_LIBS_NAMES "blas blas_win32 openblas") # # Second, search for BLAS components (if allowed) using the standard diff --git a/cmake/tribits/common_tpls/FindTPLLAPACK.cmake b/cmake/tribits/common_tpls/FindTPLLAPACK.cmake index 9874532fea7c..436cd0ac801c 100644 --- a/cmake/tribits/common_tpls/FindTPLLAPACK.cmake +++ 
b/cmake/tribits/common_tpls/FindTPLLAPACK.cmake @@ -14,7 +14,7 @@ # to trigger the right behavior in the function # tribits_tpl_find_include_dirs_and_libraries(). # -set(REQUIRED_LIBS_NAMES "lapack lapack_win32") +set(REQUIRED_LIBS_NAMES "lapack lapack_win32 openblas") # # Second, search for LAPACK components (if allowed) using the standard diff --git a/packages/adelus/src/Adelus_forward.hpp b/packages/adelus/src/Adelus_forward.hpp index d6a6db280cc3..19b04ee6f2b8 100644 --- a/packages/adelus/src/Adelus_forward.hpp +++ b/packages/adelus/src/Adelus_forward.hpp @@ -105,9 +105,11 @@ void forward(HandleType& ahandle, ZViewType& Z, RHSViewType& RHS) // count_row++; //} int curr_lrid = k/nprocs_col;//note: nprocs_col (global var) cannot be read in a device function - Kokkos::parallel_for(Kokkos::RangePolicy(0,RHS.extent(1)), KOKKOS_LAMBDA (const int i) { - ck(0,i) = RHS(curr_lrid,i); - }); + if (curr_lrid < static_cast(RHS.extent(0))) { //note: to avoid out-of-bounds access on the RHS + Kokkos::parallel_for(Kokkos::RangePolicy(0,RHS.extent(1)), KOKKOS_LAMBDA (const int i) { + ck(0,i) = RHS(curr_lrid,i); + }); + } #if defined(ADELUS_HOST_PINNED_MEM_MPI) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) Kokkos::deep_copy(h_ck,ck); diff --git a/packages/framework/ini-files/config-specs.ini b/packages/framework/ini-files/config-specs.ini index cf190a554ffb..a59a18d50591 100644 --- a/packages/framework/ini-files/config-specs.ini +++ b/packages/framework/ini-files/config-specs.ini @@ -392,18 +392,16 @@ opt-set-cmake-var Scotch_LIBRARY_DIRS PATH : ${SEMS_SCOTCH_LIBRARY_PATH|ENV} # Explicit libraries opt-set-cmake-var TPL_DLlib_LIBRARIES PATH : ${DL_LIBRARIES|ENV} opt-set-cmake-var TPL_Netcdf_LIBRARIES STRING : ${SEMS_NETCDF_LIBRARY_PATH|ENV}/libnetcdf.so;${SEMS_NETCDF_LIBRARY_PATH|ENV}/libpnetcdf.a -opt-set-cmake-var TPL_BLAS_LIBRARIES STRING : ${BLAS_LIBRARIES|ENV} -opt-set-cmake-var TPL_LAPACK_LIBRARIES STRING : ${LAPACK_LIBRARIES|ENV} -[COMMON_SPACK_TPLS] -use 
COMMON - -# BLAS & LAPACK +[SPACK_NETLIB_BLAS_LAPACK] opt-set-cmake-var TPL_BLAS_LIBRARIES STRING FORCE : -L${BLAS_ROOT|ENV}/lib;-lblas;-lgfortran;-lgomp opt-set-cmake-var TPL_BLAS_LIBRARY_DIRS STRING FORCE : ${BLAS_ROOT|ENV}/lib opt-set-cmake-var TPL_LAPACK_LIBRARIES STRING FORCE : -L${BLAS_ROOT|ENV}/lib;-llapack;-lgfortran;-lgomp opt-set-cmake-var TPL_LAPACK_LIBRARY_DIRS STRING FORCE : ${BLAS_ROOT|ENV}/lib +[COMMON_SPACK_TPLS] +use COMMON + # Boost opt-set-cmake-var BoostLib_INCLUDE_DIRS PATH FORCE : ${BOOST_INC|ENV} opt-set-cmake-var BoostLib_LIBRARY_DIRS PATH FORCE : ${BOOST_LIB|ENV} @@ -460,8 +458,6 @@ use COMMON_SPACK_TPLS # Overrides from [COMMON_SPACK_TPLS] to let container handle the values opt-set-cmake-var TPL_Netcdf_LIBRARIES STRING FORCE : "" -opt-set-cmake-var TPL_BLAS_LIBRARIES STRING FORCE : "" -opt-set-cmake-var TPL_LAPACK_LIBRARIES STRING FORCE : "" opt-set-cmake-var TPL_HDF5_LIBRARIES STRING FORCE : "" opt-set-cmake-var SuperLU_LIBRARY_NAMES STRING FORCE : superlu;m @@ -1176,11 +1172,7 @@ opt-set-cmake-var Rythmos_StepperBuilder_UnitTest_MPI_1_DISABLE BOOL : ON [RHEL_COMPILER|CUDA] use NODE-TYPE|CUDA -opt-set-cmake-var MPI_EXEC FILEPATH : mpiexec -opt-set-cmake-var TPL_BLAS_LIBRARY_DIRS STRING FORCE : ${CBLAS_ROOT|ENV}/lib -opt-set-cmake-var TPL_BLAS_LIBRARIES STRING FORCE : ${CBLAS_ROOT|ENV}/lib/libblas.a;-L${CBLAS_ROOT|ENV}/lib;-lgfortran;-lgomp;-lm -opt-set-cmake-var TPL_LAPACK_LIBRARY_DIRS STRING FORCE : ${LAPACK_ROOT|ENV}/lib -opt-set-cmake-var TPL_LAPACK_LIBRARIES STRING FORCE : -L${LAPACK_ROOT|ENV}/lib;-lgfortran;-lgomp;${LAPACK_ROOT|ENV}/lib/liblapack.a +opt-set-cmake-var MPI_EXEC FILEPATH : mpiexec [COMPILER|GNU] opt-set-cmake-var MPI_EXEC FILEPATH : mpirun @@ -1189,8 +1181,7 @@ opt-set-cmake-var Trilinos_WARNINGS_MODE STRING : WARN [COMPILER|INTEL] opt-set-cmake-var MPI_EXEC FILEPATH : mpirun -[SEMS_COMMON_CUDA] -# TPL ENABLE/DISABLE settings +[CUDA_TPL_ENABLES] opt-set-cmake-var TPL_ENABLE_BLAS BOOL FORCE : ON opt-set-cmake-var 
TPL_ENABLE_BinUtils BOOL FORCE : OFF opt-set-cmake-var TPL_ENABLE_Boost BOOL FORCE : ON @@ -1211,20 +1202,21 @@ opt-set-cmake-var TPL_ENABLE_Scotch BOOL FORCE : OFF opt-set-cmake-var TPL_ENABLE_SuperLU BOOL FORCE : OFF opt-set-cmake-var TPL_ENABLE_SuperLUDist BOOL FORCE : OFF opt-set-cmake-var TPL_ENABLE_Zlib BOOL FORCE : ON +opt-set-cmake-var EpetraExt_ENABLE_HDF5 BOOL FORCE : OFF +opt-set-cmake-var Kokkos_ENABLE_CUDA BOOL FORCE : ON +opt-set-cmake-var Zoltan_ENABLE_Scotch BOOL FORCE : OFF -#TPL_*_LIBRARIES -opt-set-cmake-var TPL_BLAS_LIBRARIES STRING FORCE : -L${BLAS_ROOT|ENV}/lib;-lopenblas;-lgfortran;-lgomp -opt-set-cmake-var TPL_BLAS_LIBRARY_DIRS STRING FORCE : ${BLAS_ROOT|ENV}/lib +[SEMS_CUDA_TPL_LOCATIONS] opt-set-cmake-var TPL_BoostLib_LIBRARIES STRING FORCE : ${BOOST_LIB|ENV}/libboost_program_options.a;${BOOST_LIB|ENV}/libboost_system.a opt-set-cmake-var TPL_Boost_LIBRARIES STRING FORCE : ${BOOST_LIB|ENV}/libboost_program_options.a;${BOOST_LIB|ENV}/libboost_system.a opt-set-cmake-var TPL_DLlib_LIBRARIES FILEPATH FORCE : "-ldl" opt-set-cmake-var TPL_HDF5_LIBRARIES STRING FORCE : ${HDF5_LIB|ENV}/libhdf5_hl.so;${HDF5_LIB|ENV}/libhdf5.a;${ZLIB_LIB|ENV}/libz.a;-ldl -opt-set-cmake-var TPL_LAPACK_LIBRARIES STRING FORCE : -L${BLAS_ROOT|ENV}/lib;-lopenblas;-lgfortran;-lgomp -opt-set-cmake-var TPL_LAPACK_LIBRARY_DIRS STRING FORCE : ${BLAS_ROOT|ENV}/lib opt-set-cmake-var TPL_METIS_LIBRARIES STRING FORCE : ${METIS_LIB|ENV}/libmetis.so opt-set-cmake-var TPL_Netcdf_LIBRARIES STRING FORCE : -L${NETCDF_C_ROOT|ENV}/lib64;${NETCDF_C_ROOT|ENV}/lib/libnetcdf.a;${PARALLEL_NETCDF_ROOT|ENV}/lib/libpnetcdf.a;${TPL_HDF5_LIBRARIES|CMAKE} +# see https://github.com/trilinos/Trilinos/issues/11109#issuecomment-1272146298 +opt-set-cmake-var TPL_BLAS_LIBRARIES STRING FORCE : /lib64/libblas.so.3 +opt-set-cmake-var TPL_LAPACK_LIBRARIES STRING FORCE : /lib64/liblapack.so.3 -#TPL_[INCLUDE|LIBRARY]_DIRS opt-set-cmake-var Netcdf_INCLUDE_DIRS STRING FORCE : ${NETCDF_C_INC|ENV} opt-set-cmake-var 
ParMETIS_INCLUDE_DIRS STRING FORCE : ${PARMETIS_INC|ENV} opt-set-cmake-var ParMETIS_LIBRARY_DIRS STRING FORCE : ${PARMETIS_LIB|ENV} @@ -1233,12 +1225,13 @@ opt-set-cmake-var Scotch_LIBRARY_DIRS STRING FORCE : ${SCOTCH_LIB|ENV} opt-set-cmake-var SuperLU_INCLUDE_DIRS STRING FORCE : ${SUPERLU_INC|ENV} opt-set-cmake-var SuperLU_LIBRARY_DIRS STRING FORCE : ${SUPERLU_LIB|ENV} +[CUDA] +use CUDA_TPL_ENABLES + #CXX Settings opt-set-cmake-var CMAKE_CXX_FLAGS STRING : -fPIC -Wall -Warray-bounds -Wchar-subscripts -Wcomment -Wenum-compare -Wformat -Wuninitialized -Wmaybe-uninitialized -Wmain -Wnarrowing -Wnonnull -Wreorder -Wreturn-type -Wsequence-point -Wtrigraphs -Wunused-function -Wunused-but-set-variable -Wwrite-strings #Package Options -opt-set-cmake-var EpetraExt_ENABLE_HDF5 BOOL FORCE : OFF -opt-set-cmake-var Kokkos_ENABLE_CUDA BOOL FORCE : ON opt-set-cmake-var Kokkos_ENABLE_CUDA_LAMBDA BOOL FORCE : ON opt-set-cmake-var Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA BOOL FORCE : ON #opt-set-cmake-var Kokkos_ENABLE_Debug_Bounds_Check BOOL FORCE : ON @@ -1247,10 +1240,15 @@ opt-set-cmake-var Panzer_FADTYPE STRING FORCE : "Sacado::Fad::DFad" opt-set-cmake-var Phalanx_KOKKOS_DEVICE_TYPE STRING FORCE : CUDA opt-set-cmake-var Sacado_ENABLE_HIERARCHICAL_DFAD BOOL FORCE : ON opt-set-cmake-var Tpetra_INST_SERIAL BOOL FORCE : ON -opt-set-cmake-var Zoltan_ENABLE_Scotch BOOL FORCE : OFF -[CUDA11-RUN-SERIAL-TESTS] +[SEMS_COMMON_CUDA] +use CUDA +use SEMS_CUDA_TPL_LOCATIONS + +[CUDA-RUN-SERIAL-TESTS] opt-set-cmake-var Kokkos_CoreUnitTest_Cuda1_SET_RUN_SERIAL BOOL FORCE : ON +opt-set-cmake-var Kokkos_CoreUnitTest_CudaTimingBased_SET_RUN_SERIAL BOOL FORCE : ON +opt-set-cmake-var Kokkos_CoreUnitTest_Default_SET_RUN_SERIAL BOOL FORCE : ON opt-set-cmake-var KokkosKernels_sparse_cuda_MPI_1_SET_RUN_SERIAL BOOL FORCE : ON opt-set-cmake-var KokkosKernels_batched_dla_cuda_MPI_1_SET_RUN_SERIAL BOOL FORCE : ON opt-set-cmake-var Intrepid2_unit-test_MonolithicExecutable_Intrepid2_Tests_MPI_1_SET_RUN_SERIAL 
BOOL FORCE : ON @@ -1335,6 +1333,7 @@ use USE-DEPRECATED|YES use PACKAGE-ENABLES|NO-PACKAGE-ENABLES use COMMON_SPACK_TPLS +use SPACK_NETLIB_BLAS_LAPACK opt-set-cmake-var MPI_EXEC_PRE_NUMPROCS_FLAGS STRING : --bind-to;none --mca btl vader,self opt-set-cmake-var Teko_DISABLE_LSCSTABALIZED_TPETRA_ALPAH_INV_D BOOL : ON @@ -1375,6 +1374,7 @@ use USE-UVM|NO use USE-DEPRECATED|YES use PACKAGE-ENABLES|NO-PACKAGE-ENABLES use COMMON_SPACK_TPLS +use SPACK_NETLIB_BLAS_LAPACK opt-set-cmake-var MPI_EXEC_PRE_NUMPROCS_FLAGS STRING : --bind-to;none --mca btl vader,self @@ -1468,14 +1468,10 @@ opt-set-cmake-var TPL_ENABLE_SuperLUDist BOOL FORCE : OFF opt-set-cmake-var TPL_ENABLE_Zlib BOOL FORCE : ON #TPL_*_LIBRARIES -# see https://github.com/trilinos/Trilinos/issues/11109#issuecomment-1272146298 -opt-set-cmake-var TPL_BLAS_LIBRARIES STRING FORCE : /lib64/libblas.so.3 opt-set-cmake-var TPL_BoostLib_LIBRARIES STRING FORCE : ${BOOST_LIB|ENV}/libboost_program_options.a;${BOOST_LIB|ENV}/libboost_system.a opt-set-cmake-var TPL_Boost_LIBRARIES STRING FORCE : ${BOOST_LIB|ENV}/libboost_program_options.a;${BOOST_LIB|ENV}/libboost_system.a opt-set-cmake-var TPL_DLlib_LIBRARIES FILEPATH FORCE : "-ldl" opt-set-cmake-var TPL_HDF5_LIBRARIES STRING FORCE : ${HDF5_LIB|ENV}/libhdf5_hl.so;${HDF5_LIB|ENV}/libhdf5.a;${ZLIB_LIB|ENV}/libz.a;-ldl -# see https://github.com/trilinos/Trilinos/issues/11109#issuecomment-1272146298 -opt-set-cmake-var TPL_LAPACK_LIBRARIES STRING FORCE : /lib64/liblapack.so.3 opt-set-cmake-var TPL_METIS_LIBRARIES STRING FORCE : ${METIS_LIB|ENV}/libmetis.so opt-set-cmake-var TPL_Netcdf_LIBRARIES STRING FORCE : -L${NETCDF_C_ROOT|ENV}/lib64;${NETCDF_C_ROOT|ENV}/lib/libnetcdf.a;${PARALLEL_NETCDF_ROOT|ENV}/lib/libpnetcdf.a;${TPL_HDF5_LIBRARIES|CMAKE} @@ -1532,7 +1528,7 @@ opt-set-cmake-var Adelus_vector_random_npr4_rhs1_MPI_4_DISABLE BOOL : ON use PACKAGE-ENABLES|NO-EPETRA -use CUDA11-RUN-SERIAL-TESTS +use CUDA-RUN-SERIAL-TESTS 
[rhel8_sems-cuda-11.4.2-sems-gnu-10.1.0-sems-openmpi-4.1.4_release_static_Volta70_no-asan_complex_no-fpic_mpi_pt_no-rdc_no-uvm_deprecated-on_all] # uses sems-v2 modules @@ -1561,8 +1557,7 @@ use PACKAGE-ENABLES|NO-PACKAGE-ENABLES use PACKAGE-ENABLES|NO-EPETRA use COMMON_SPACK_TPLS use SEMS_COMMON_CUDA - -use CUDA11-RUN-SERIAL-TESTS +use CUDA-RUN-SERIAL-TESTS opt-set-cmake-var Trilinos_ENABLE_TESTS BOOL FORCE : OFF @@ -1570,6 +1565,12 @@ opt-set-cmake-var Trilinos_ENABLE_TESTS BOOL FORCE : OFF use rhel8_sems-cuda-11.4.2-gnu-10.1.0-openmpi-4.1.6_release_static_Volta70_no-asan_complex_no-fpic_mpi_pt_no-rdc_uvm_deprecated-on_no-package-enables use PACKAGE-ENABLES|ALL-NO-EPETRA +[rhel8_sems-cuda-11.4.2-gnu-10.1.0-openmpi-4.1.6_release_static_Volta70_no-asan_complex_no-fpic_mpi_pt_no-rdc_uvm_deprecated-on_all-no-epetra] +use rhel8_sems-cuda-11.4.2-gnu-10.1.0-openmpi-4.1.6_release_static_Volta70_no-asan_complex_no-fpic_mpi_pt_no-rdc_uvm_deprecated-on_all + +use CUDA-RUN-SERIAL-TESTS +opt-set-cmake-var Trilinos_ENABLE_TESTS BOOL FORCE : ON + [rhel8_sems-cuda-11.4.2-gnu-10.1.0-openmpi-4.1.6_release_static_Volta70_no-asan_complex_no-fpic_mpi_pt_no-rdc_no-uvm_deprecated-on_no-package-enables] # uses sems-v2 modules use RHEL8 @@ -1615,14 +1616,10 @@ opt-set-cmake-var TPL_ENABLE_SuperLUDist BOOL FORCE : OFF opt-set-cmake-var TPL_ENABLE_Zlib BOOL FORCE : ON #TPL_*_LIBRARIES -# see https://github.com/trilinos/Trilinos/issues/11109#issuecomment-1272146298 -opt-set-cmake-var TPL_BLAS_LIBRARIES STRING FORCE : /lib64/libblas.so.3 opt-set-cmake-var TPL_BoostLib_LIBRARIES STRING FORCE : ${BOOST_LIB|ENV}/libboost_program_options.a;${BOOST_LIB|ENV}/libboost_system.a opt-set-cmake-var TPL_Boost_LIBRARIES STRING FORCE : ${BOOST_LIB|ENV}/libboost_program_options.a;${BOOST_LIB|ENV}/libboost_system.a opt-set-cmake-var TPL_DLlib_LIBRARIES FILEPATH FORCE : "-ldl" opt-set-cmake-var TPL_HDF5_LIBRARIES STRING FORCE : 
${HDF5_LIB|ENV}/libhdf5_hl.so;${HDF5_LIB|ENV}/libhdf5.a;${ZLIB_LIB|ENV}/libz.a;-ldl -# see https://github.com/trilinos/Trilinos/issues/11109#issuecomment-1272146298 -opt-set-cmake-var TPL_LAPACK_LIBRARIES STRING FORCE : /lib64/liblapack.so.3 opt-set-cmake-var TPL_METIS_LIBRARIES STRING FORCE : ${METIS_LIB|ENV}/libmetis.so opt-set-cmake-var TPL_Netcdf_LIBRARIES STRING FORCE : -L${NETCDF_C_ROOT|ENV}/lib64;${NETCDF_C_ROOT|ENV}/lib/libnetcdf.a;${PARALLEL_NETCDF_ROOT|ENV}/lib/libpnetcdf.a;${TPL_HDF5_LIBRARIES|CMAKE} @@ -1678,7 +1675,7 @@ opt-set-cmake-var Adelus_vector_random_npr4_rhs1_MPI_4_DISABLE BOOL : ON use PACKAGE-ENABLES|NO-EPETRA -use CUDA11-RUN-SERIAL-TESTS +use CUDA-RUN-SERIAL-TESTS [rhel8_sems-cuda-11.4.2-gnu-10.1.0-openmpi-4.1.6_release_static_Volta70_no-asan_complex_no-fpic_mpi_pt_no-rdc_no-uvm_deprecated-on_all] # uses sems-v2 modules @@ -1703,6 +1700,7 @@ use USE-DEPRECATED|YES use PACKAGE-ENABLES|NO-PACKAGE-ENABLES use COMMON_SPACK_TPLS +use SPACK_NETLIB_BLAS_LAPACK opt-set-cmake-var SuperLU_LIBRARY_NAMES STRING : superlu;m opt-set-cmake-var ML_ENABLE_SuperLU BOOL FORCE : OFF @@ -1739,6 +1737,7 @@ use USE-DEPRECATED|YES use PACKAGE-ENABLES|NO-PACKAGE-ENABLES use COMMON_SPACK_TPLS +use SPACK_NETLIB_BLAS_LAPACK opt-set-cmake-var MPI_EXEC_PRE_NUMPROCS_FLAGS STRING : --bind-to;none --mca btl vader,self opt-set-cmake-var CMAKE_CXX_EXTENSIONS BOOL : OFF @@ -1773,6 +1772,7 @@ use USE-DEPRECATED|YES use PACKAGE-ENABLES|NO-PACKAGE-ENABLES use COMMON_SPACK_TPLS +use SPACK_NETLIB_BLAS_LAPACK opt-set-cmake-var SuperLU_LIBRARY_NAMES STRING FORCE : superlu;m opt-set-cmake-var ML_ENABLE_SuperLU BOOL FORCE : OFF @@ -1877,11 +1877,6 @@ use PACKAGE-ENABLES|NO-PACKAGE-ENABLES use COMMON_SPACK_TPLS -opt-set-cmake-var TPL_BLAS_LIBRARY_DIRS STRING FORCE : ${OPENBLAS_ROOT|ENV}/lib -opt-set-cmake-var TPL_BLAS_LIBRARIES STRING FORCE : ${OPENBLAS_ROOT|ENV}/lib/libopenblas.a;-L${OPENBLAS_ROOT|ENV}/lib;-lgfortran;-lgomp;-lm -opt-set-cmake-var TPL_LAPACK_LIBRARY_DIRS STRING FORCE : 
${OPENBLAS_ROOT|ENV}/lib -opt-set-cmake-var TPL_LAPACK_LIBRARIES STRING FORCE : ${OPENBLAS_ROOT|ENV}/lib/libopenblas.a;-L${OPENBLAS_ROOT|ENV}/lib;-lgfortran;-lgomp;-lm - opt-set-cmake-var MPI_EXEC_PRE_NUMPROCS_FLAGS STRING : --bind-to;none --mca btl vader,self opt-set-cmake-var CMAKE_CXX_FLAGS STRING FORCE : -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-nonnull-compare -Wno-address -Wno-inline @@ -1934,11 +1929,6 @@ use COMMON_SPACK_TPLS opt-set-cmake-var CMAKE_CXX_FLAGS STRING : -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-nonnull-compare -Wno-address -Wno-inline -Wno-unused-but-set-variable -Wno-unused-label -opt-set-cmake-var TPL_BLAS_LIBRARY_DIRS STRING FORCE : ${OPENBLAS_ROOT|ENV}/lib -opt-set-cmake-var TPL_BLAS_LIBRARIES STRING FORCE : ${OPENBLAS_ROOT|ENV}/lib/libopenblas.a;-L${OPENBLAS_ROOT|ENV}/lib;-lgfortran;-lgomp;-lm -opt-set-cmake-var TPL_LAPACK_LIBRARY_DIRS STRING FORCE : ${OPENBLAS_ROOT|ENV}/lib -opt-set-cmake-var TPL_LAPACK_LIBRARIES STRING FORCE : ${OPENBLAS_ROOT|ENV}/lib/libopenblas.a;-L${OPENBLAS_ROOT|ENV}/lib;-lgfortran;-lgomp;-lm - opt-set-cmake-var TPL_Netcdf_LIBRARIES STRING FORCE : "" opt-set-cmake-var TPL_ENABLE_ParMETIS BOOL FORCE : OFF @@ -2112,7 +2102,9 @@ use USE-RDC|NO use USE-UVM|NO use USE-DEPRECATED|YES use PACKAGE-ENABLES|NO-EPETRA -use SEMS_COMMON_CUDA +use CUDA +use CUDA-RUN-SERIAL-TESTS + opt-set-cmake-var Trilinos_ENABLE_TESTS BOOL : ON opt-set-cmake-var TPL_ENABLE_X11 BOOL : OFF opt-set-cmake-var MPI_EXEC_PRE_NUMPROCS_FLAGS STRING FORCE : --bind-to;none --mca btl ^smcuda @@ -2120,7 +2112,6 @@ opt-set-cmake-var Kokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC BOOL : OFF 
[rhel8_cuda-11-gcc-openmpi_release_static_Ampere80_no-asan_complex_no-fpic_mpi_pt_no-rdc_no-uvm_deprecated-on_no-package-enables] use rhel8_cuda-gcc-openmpi_release_static_Ampere80_no-asan_complex_no-fpic_mpi_pt_no-rdc_no-uvm_deprecated-on_no-package-enables -use CUDA11-RUN-SERIAL-TESTS opt-set-cmake-var ROL_test_elementwise_TpetraMultiVector_MPI_4_DISABLE BOOL : ON [rhel8_cuda-gcc-openmpi_release_static_Ampere80_no-asan_complex_no-fpic_mpi_pt_no-rdc_uvm_deprecated-on_no-package-enables] @@ -2138,13 +2129,12 @@ use USE-RDC|NO use USE-UVM|YES use USE-DEPRECATED|YES use PACKAGE-ENABLES|NO-EPETRA -use SEMS_COMMON_CUDA -use CUDA11-RUN-SERIAL-TESTS +use CUDA +use CUDA-RUN-SERIAL-TESTS opt-set-cmake-var Trilinos_ENABLE_TESTS BOOL FORCE : OFF opt-set-cmake-var Kokkos_ENABLE_TESTS BOOL FORCE : ON - [rhel8_python_debug_shared_no-kokkos-arch_no-asan_no-complex_no-fpic_no-mpi_no-pt_no-rdc_no-uvm_deprecated-on_pr-framework] use PACKAGE-ENABLES|PR-FRAMEWORK diff --git a/packages/framework/pr_tools/LaunchDriver.py b/packages/framework/pr_tools/LaunchDriver.py index fe4f20f40372..b98603fa26d7 100755 --- a/packages/framework/pr_tools/LaunchDriver.py +++ b/packages/framework/pr_tools/LaunchDriver.py @@ -116,8 +116,11 @@ def main(argv): if args.kokkos_develop: cmd += " --kokkos-develop" + # extra-configure-args flag currently takes precedence over the env. var. 
if args.extra_configure_args: cmd += f" --extra-configure-args=\"{args.extra_configure_args}\"" + elif os.getenv("EXTRA_CONFIGURE_ARGS"): + cmd += f" --extra-configure-args=\"{os.getenv('EXTRA_CONFIGURE_ARGS')}\"" print("LaunchDriver> EXEC: " + cmd, flush=True) diff --git a/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationBase.py b/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationBase.py index 9c1f8567fe5e..3b585ad6eca6 100644 --- a/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationBase.py +++ b/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationBase.py @@ -801,7 +801,8 @@ def prepare_test(self): "F77", "F90", "FC", - "MODULESHOME" + "MODULESHOME", + "EXTRA_CONFIGURE_ARGS" ] self.message("") tr_env.set_environment.pretty_print_envvars(envvar_filter=envvars_to_print) diff --git a/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationStandard.py b/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationStandard.py index 401824ea8b6d..1f8038c6eaf5 100644 --- a/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationStandard.py +++ b/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationStandard.py @@ -17,7 +17,7 @@ class TrilinosPRConfigurationStandard(TrilinosPRConfigurationBase): Implements Standard mode Trilinos Pull Request Driver """ def __init__(self, args): - super(TrilinosPRConfigurationStandard, self).__init__(args) + super().__init__(args) def execute_test(self): diff --git a/packages/framework/pr_tools/unittests/test_PullRequestLinuxDriverTest.py b/packages/framework/pr_tools/unittests/test_PullRequestLinuxDriverTest.py index 8d596681d910..c02193f18a28 100755 --- a/packages/framework/pr_tools/unittests/test_PullRequestLinuxDriverTest.py +++ b/packages/framework/pr_tools/unittests/test_PullRequestLinuxDriverTest.py @@ -84,6 +84,7 @@ def setUp(self): req_mem_per_core=3.0, max_cores_allowed=12, num_concurrent_tests=-1, + 
slots_per_gpu=2, ccache_enable=False, dry_run=False, use_explicit_cachefile=False, diff --git a/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_def.hpp b/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_def.hpp index c5ecbcbde653..0c39e53f65f4 100644 --- a/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_def.hpp +++ b/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_def.hpp @@ -10,6 +10,9 @@ #ifndef IFPACK2_LOCALSPARSETRIANGULARSOLVER_DEF_HPP #define IFPACK2_LOCALSPARSETRIANGULARSOLVER_DEF_HPP +#include // ostringstream +#include // runtime_error + #include "Ifpack2_LocalSparseTriangularSolver_decl.hpp" #include "Tpetra_CrsMatrix.hpp" #include "Tpetra_Core.hpp" @@ -24,6 +27,53 @@ namespace Ifpack2 { namespace Details { + +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined(KOKKOS_ENABLE_CUDA) + +inline void cusparse_error_throw(cusparseStatus_t cusparseStatus, const char* name, + const char* file, const int line) { + std::ostringstream out; +#if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) + out << name << " error( " << cusparseGetErrorName(cusparseStatus) << "): " << cusparseGetErrorString(cusparseStatus); +#else + out << name << " error( "; + switch (cusparseStatus) { + case CUSPARSE_STATUS_NOT_INITIALIZED: + out << "CUSPARSE_STATUS_NOT_INITIALIZED): cusparse handle was not " + "created correctly."; + break; + case CUSPARSE_STATUS_ALLOC_FAILED: + out << "CUSPARSE_STATUS_ALLOC_FAILED): you might tried to allocate too " + "much memory"; + break; + case CUSPARSE_STATUS_INVALID_VALUE: out << "CUSPARSE_STATUS_INVALID_VALUE)"; break; + case CUSPARSE_STATUS_ARCH_MISMATCH: out << "CUSPARSE_STATUS_ARCH_MISMATCH)"; break; + case CUSPARSE_STATUS_MAPPING_ERROR: out << "CUSPARSE_STATUS_MAPPING_ERROR)"; break; + case CUSPARSE_STATUS_EXECUTION_FAILED: out << "CUSPARSE_STATUS_EXECUTION_FAILED)"; break; + case CUSPARSE_STATUS_INTERNAL_ERROR: out << "CUSPARSE_STATUS_INTERNAL_ERROR)"; break; + case 
CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: out << "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED)"; break; + case CUSPARSE_STATUS_ZERO_PIVOT: out << "CUSPARSE_STATUS_ZERO_PIVOT)"; break; + default: out << "unrecognized error code): this is bad!"; break; + } +#endif // CUSPARSE_VERSION + if (file) { + out << " " << file << ":" << line; + } + throw std::runtime_error(out.str()); +} + +inline void cusparse_safe_call(cusparseStatus_t cusparseStatus, const char* name, const char* file = nullptr, + const int line = 0) { + if (CUSPARSE_STATUS_SUCCESS != cusparseStatus) { + cusparse_error_throw(cusparseStatus, name, file, line); + } +} + +#define IFPACK2_DETAILS_CUSPARSE_SAFE_CALL(call) \ + Ifpack2::Details::cusparse_safe_call(call, #call, __FILE__, __LINE__) + +#endif // defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined(KOKKOS_ENABLE_CUDA) + struct TrisolverType { enum Enum { Internal, //!< Tpetra::CrsMatrix::localSolve @@ -675,7 +725,7 @@ compute () #if (CUSPARSE_VERSION >= 12100) auto *sptrsv_handle = kh_v_[i]->get_sptrsv_handle(); auto cusparse_handle = sptrsv_handle->get_cuSparseHandle(); - KOKKOS_CUSPARSE_SAFE_CALL( + IFPACK2_DETAILS_CUSPARSE_SAFE_CALL( cusparseSetStream(cusparse_handle->handle, exec_space_instances_[i].cuda_stream())); cusparseSpSV_updateMatrix(cusparse_handle->handle, cusparse_handle->spsvDescr, diff --git a/packages/kokkos/cmake/build_env_info.cmake b/packages/kokkos/cmake/build_env_info.cmake index ac28b2d8503a..76afbb74b63c 100644 --- a/packages/kokkos/cmake/build_env_info.cmake +++ b/packages/kokkos/cmake/build_env_info.cmake @@ -4,7 +4,7 @@ find_package(Git QUIET) set(CURRENT_LIST_DIR ${CMAKE_CURRENT_LIST_DIR}) set(pre_configure_dir ${CMAKE_CURRENT_LIST_DIR}) -set(post_configure_dir ${CMAKE_BINARY_DIR}/generated) +set(post_configure_dir ${CMAKE_CURRENT_BINARY_DIR}/generated) set(pre_configure_file ${pre_configure_dir}/Kokkos_Version_Info.cpp.in) set(post_configure_file ${post_configure_dir}/Kokkos_Version_Info.cpp) @@ -105,7 +105,7 @@ 
function(check_git_setup) ${CURRENT_LIST_DIR}/build_env_info.cmake BYPRODUCTS ${post_configure_file} ) - add_library(impl_git_version ${CMAKE_BINARY_DIR}/generated/Kokkos_Version_Info.cpp) + add_library(impl_git_version ${CMAKE_CURRENT_BINARY_DIR}/generated/Kokkos_Version_Info.cpp) target_include_directories(impl_git_version PUBLIC ${CMAKE_BINARY_DIR}/generated) target_compile_features(impl_git_version PRIVATE cxx_raw_string_literals) add_dependencies(impl_git_version AlwaysCheckGit) diff --git a/packages/muelu/example/basic/Simple.cpp b/packages/muelu/example/basic/Simple.cpp index 4afe89e0b5e1..6b1cd48bd8d0 100644 --- a/packages/muelu/example/basic/Simple.cpp +++ b/packages/muelu/example/basic/Simple.cpp @@ -85,6 +85,8 @@ int main_(Teuchos::CommandLineProcessor &clp, Xpetra::UnderlyingLib &lib, int ar clp.setOption("belosType", &belosType, "belos solver type: (Pseudoblock CG | Block CG | Pseudoblock GMRES | Block GMRES | ...) see BelosSolverFactory.hpp for exhaustive list of solvers"); bool computeCondEst = false; clp.setOption("condEst", "noCondEst", &computeCondEst, "compute condition number estimate (currently only available for Pseudoblock CG)"); + bool enforceBoundaryConditionsOnInitialGuess = true; + clp.setOption("enforceBCs", "noEnforceBCs", &enforceBoundaryConditionsOnInitialGuess, "enforce Dirichlet boundary condition on initial guess"); double tol = 1e-12; clp.setOption("tol", &tol, "solver convergence tolerance"); bool binaryFormat = false; @@ -201,7 +203,7 @@ int main_(Teuchos::CommandLineProcessor &clp, Xpetra::UnderlyingLib &lib, int ar // ========================================================================= { comm->barrier(); - SystemSolve(A, X, B, H, Prec, out, solveType, belosType, false, false, useML, cacheSize, 0, scaleResidualHist, solvePreconditioned, maxIts, tol, computeCondEst); + SystemSolve(A, X, B, H, Prec, out, solveType, belosType, false, false, useML, cacheSize, 0, scaleResidualHist, solvePreconditioned, maxIts, tol, 
computeCondEst, enforceBoundaryConditionsOnInitialGuess); comm->barrier(); } diff --git a/packages/muelu/src/Utils/MueLu_UtilitiesBase_decl.hpp b/packages/muelu/src/Utils/MueLu_UtilitiesBase_decl.hpp index 8aeac791865d..57018cd0b047 100644 --- a/packages/muelu/src/Utils/MueLu_UtilitiesBase_decl.hpp +++ b/packages/muelu/src/Utils/MueLu_UtilitiesBase_decl.hpp @@ -254,6 +254,13 @@ class UtilitiesBase { */ static Teuchos::ArrayRCP DetectDirichletRowsExt(const Xpetra::Matrix& A, bool& bHasZeroDiagonal, const Magnitude& tol = Teuchos::ScalarTraits::zero()); + /*! @brief Detect Dirichlet rows and copy values from RHS multivector to InitialGuess for Dirichlet rows. + + This can be used to ensure that the InitialGuess satisfies the boundary conditions enforced on A. + Useful in particular for using CG when boundary conditions have only been enforced by one-and-zeroing rows of A, but not columns. + */ + static void EnforceInitialCondition(const Xpetra::Matrix& A, const Xpetra::MultiVector& RHS, Xpetra::MultiVector& InitialGuess, const Magnitude& tol = Teuchos::ScalarTraits::zero(), const bool count_twos_as_dirichlet = false); + /*! 
@brief Find non-zero values in an ArrayRCP Compares the value to 2 * machine epsilon diff --git a/packages/muelu/src/Utils/MueLu_UtilitiesBase_def.hpp b/packages/muelu/src/Utils/MueLu_UtilitiesBase_def.hpp index 6e181415b09b..47db9ba0635f 100644 --- a/packages/muelu/src/Utils/MueLu_UtilitiesBase_def.hpp +++ b/packages/muelu/src/Utils/MueLu_UtilitiesBase_def.hpp @@ -1180,6 +1180,37 @@ UtilitiesBase:: return boundaryNodes; } +template +void UtilitiesBase:: + EnforceInitialCondition(const Xpetra::Matrix& A, + const Xpetra::MultiVector& RHS, + Xpetra::MultiVector& InitialGuess, + const typename Teuchos::ScalarTraits::magnitudeType& tol, + const bool count_twos_as_dirichlet) { + using range_type = Kokkos::RangePolicy; + + auto dirichletRows = DetectDirichletRows_kokkos(A, tol, count_twos_as_dirichlet); + + LocalOrdinal numRows = A.getLocalNumRows(); + LocalOrdinal numVectors = RHS.getNumVectors(); + TEUCHOS_ASSERT_EQUALITY(numVectors, Teuchos::as(InitialGuess.getNumVectors())); +#ifdef MUELU_DEBUG + TEUCHOS_ASSERT(RHS.getMap()->isCompatible(InitialGuess.getMap())); +#endif + + auto lclRHS = RHS.getDeviceLocalView(Xpetra::Access::ReadOnly); + auto lclInitialGuess = InitialGuess.getDeviceLocalView(Xpetra::Access::ReadWrite); + + Kokkos::parallel_for( + "MueLu:Utils::EnforceInitialCondition", range_type(0, numRows), + KOKKOS_LAMBDA(const LO row) { + if (dirichletRows(row)) { + for (LocalOrdinal j = 0; j < numVectors; ++j) + lclInitialGuess(row, j) = lclRHS(row, j); + } + }); +} + template void UtilitiesBase:: FindNonZeros(const Teuchos::ArrayRCP vals, diff --git a/packages/muelu/test/scaling/Driver.cpp b/packages/muelu/test/scaling/Driver.cpp index 39f2fd5d2053..6b2ef85b8845 100644 --- a/packages/muelu/test/scaling/Driver.cpp +++ b/packages/muelu/test/scaling/Driver.cpp @@ -204,6 +204,8 @@ int main_(Teuchos::CommandLineProcessor& clp, Xpetra::UnderlyingLib& lib, int ar clp.setOption("belosType", &belosType, "belos solver type: (Pseudoblock CG | Block CG | Pseudoblock GMRES 
| Block GMRES | ...) see BelosSolverFactory.hpp for exhaustive list of solvers"); bool computeCondEst = false; clp.setOption("condEst", "noCondEst", &computeCondEst, "compute condition number estimate (currently only available for Pseudoblock CG)"); + bool enforceBoundaryConditionsOnInitialGuess = true; + clp.setOption("enforceBCs", "noEnforceBCs", &enforceBoundaryConditionsOnInitialGuess, "enforce Dirichlet boundary condition on initial guess"); double dtol = 1e-12, tol; clp.setOption("tol", &dtol, "solver convergence tolerance"); bool binaryFormat = false; @@ -523,7 +525,7 @@ int main_(Teuchos::CommandLineProcessor& clp, Xpetra::UnderlyingLib& lib, int ar } // Solve the system numResolves+1 times - SystemSolve(A, X, B, H, Prec, out2, solveType, belosType, profileSolve, useAMGX, useML, cacheSize, numResolves, scaleResidualHist, solvePreconditioned, maxIts, tol, computeCondEst); + SystemSolve(A, X, B, H, Prec, out2, solveType, belosType, profileSolve, useAMGX, useML, cacheSize, numResolves, scaleResidualHist, solvePreconditioned, maxIts, tol, computeCondEst, enforceBoundaryConditionsOnInitialGuess); comm->barrier(); } catch (const std::exception& e) { diff --git a/packages/muelu/test/scaling/DriverCore.hpp b/packages/muelu/test/scaling/DriverCore.hpp index e04b9e51748b..89e3dd5c04c6 100644 --- a/packages/muelu/test/scaling/DriverCore.hpp +++ b/packages/muelu/test/scaling/DriverCore.hpp @@ -233,7 +233,8 @@ void SystemSolve(Teuchos::RCP using Teuchos::RCP; using Teuchos::rcp; @@ -272,6 +273,10 @@ void SystemSolve(Teuchos::RCP tm = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("Driver: 3 - LHS and RHS initialization"))); X->putScalar(zero); + if (enforceBoundaryConditionsOnInitialGuess) { + out << "Enforcing boundary conditions on initial guess\n"; + Utilities::EnforceInitialCondition(*A, *B, *X); + } tm = Teuchos::null; if (solveType == "none") { diff --git a/packages/muelu/test/unit_tests/Utilities.cpp b/packages/muelu/test/unit_tests/Utilities.cpp index 
97842d36f208..77a18d329854 100644 --- a/packages/muelu/test/unit_tests/Utilities.cpp +++ b/packages/muelu/test/unit_tests/Utilities.cpp @@ -144,6 +144,47 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Utilities, DetectDirichletRows, Scalar, LocalO } // DetectDirichletRows +TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Utilities, EnforceInitialCondition, Scalar, LocalOrdinal, GlobalOrdinal, Node) { +#include + MUELU_TESTING_SET_OSTREAM; + MUELU_TESTING_LIMIT_SCOPE(Scalar, GlobalOrdinal, Node); + + typedef typename Teuchos::ScalarTraits TST; + + RCP A = TestHelpers::TestFactory::Build1DPoisson(100); + Teuchos::ArrayView indices; + Teuchos::ArrayView values; + + LocalOrdinal localRowToZero = 5; + A->resumeFill(); + A->getLocalRowView(localRowToZero, indices, values); + Array newvalues(values.size(), TST::zero()); + for (int j = 0; j < indices.size(); j++) + // keep diagonal + if (indices[j] == localRowToZero) newvalues[j] = values[j]; + A->replaceLocalValues(localRowToZero, indices, newvalues); + + A->fillComplete(); + + auto RHS = MultiVectorFactory::Build(A->getRangeMap(), 1); + RHS->randomize(); + auto X = MultiVectorFactory::Build(A->getDomainMap(), 1); + X->putScalar(666. * TST::one()); + Utilities::EnforceInitialCondition(*A, *RHS, *X, TST::magnitude(0.26)); + + auto lclRHS = RHS->getHostLocalView(Xpetra::Access::ReadOnly); + auto lclX = X->getHostLocalView(Xpetra::Access::ReadOnly); + + // row 5 is Dirichlet + for (size_t row = 0; row < A->getLocalNumRows(); ++row) { + if (row == 5) { + TEST_EQUALITY(lclRHS(row, 0), lclX(row, 0)); + } else { + TEST_EQUALITY(666. 
* TST::one(), lclX(row, 0)); + } + } +} // EnforceInitialCondition + TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Utilities, GetDiagonalInverse, Scalar, LocalOrdinal, GlobalOrdinal, Node) { #include MUELU_TESTING_SET_OSTREAM; @@ -683,6 +724,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Utilities, TransposeNonsymmetricConstMatrix, S #define MUELU_ETI_GROUP(Scalar, LO, GO, Node) \ TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Utilities, MatMatMult_EpetraVsTpetra, Scalar, LO, GO, Node) \ TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Utilities, DetectDirichletRows, Scalar, LO, GO, Node) \ + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Utilities, EnforceInitialCondition, Scalar, LO, GO, Node) \ TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Utilities, GetDiagonalInverse, Scalar, LO, GO, Node) \ TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Utilities, GetLumpedDiagonal, Scalar, LO, GO, Node) \ TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Utilities, GetInverse, Scalar, LO, GO, Node) \ diff --git a/packages/phalanx/example/FiniteElementAssembly/Mesh.cpp b/packages/phalanx/example/FiniteElementAssembly/Mesh.cpp index b18837921c17..0c8ad139218d 100644 --- a/packages/phalanx/example/FiniteElementAssembly/Mesh.cpp +++ b/packages/phalanx/example/FiniteElementAssembly/Mesh.cpp @@ -204,7 +204,7 @@ KOKKOS_INLINE_FUNCTION void Mesh::operator() (const ComputeJac_Tag& , const team_t& team) const { const int cell = team.league_rank(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,qp_.extent(0)), [=] (const int& qp) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,qp_.extent(0)), [&] (const int& qp) { for (int basis=0; basis < static_cast(basis_.extent(1)); ++basis) { for (int i=0; i < 3; ++i) { for (int j=0; j < 3; ++j) { @@ -220,7 +220,7 @@ KOKKOS_INLINE_FUNCTION void Mesh::operator() (const ComputeInvJac_Tag& , const team_t& team) const { const int cell = team.league_rank(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,qp_.extent(0)), [=] (const int& qp) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,qp_.extent(0)), [&] 
(const int& qp) { inv_jac_(cell,qp,0,0) = jac_(cell,qp,1,1) * jac_(cell,qp,2,2) - jac_(cell,qp,1,2) * jac_(cell,qp,2,1); inv_jac_(cell,qp,1,1) = jac_(cell,qp,2,2) * jac_(cell,qp,0,0) - jac_(cell,qp,2,0) * jac_(cell,qp,0,2); inv_jac_(cell,qp,2,2) = jac_(cell,qp,0,0) * jac_(cell,qp,1,1) - jac_(cell,qp,0,1) * jac_(cell,qp,1,0); @@ -246,7 +246,7 @@ KOKKOS_INLINE_FUNCTION void Mesh::operator() (const ComputeCoords_Tag& , const team_t& team) const { const int cell = team.league_rank(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,qp_.extent(0)), [=] (const int& qp) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,qp_.extent(0)), [&] (const int& qp) { for (int basis=0; basis < static_cast(basis_.extent(1)); ++basis) { qp_coords_(cell,qp,0) += basis_(qp,basis) * coords_(cell,basis,0); qp_coords_(cell,qp,1) += basis_(qp,basis) * coords_(cell,basis,1); @@ -260,7 +260,7 @@ KOKKOS_INLINE_FUNCTION void Mesh::operator() (const ComputeGradBasisReal_Tag& , const team_t& team) const { const int cell = team.league_rank(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,grad_basis_real_.extent(1)), [=] (const int& qp) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,grad_basis_real_.extent(1)), [&] (const int& qp) { const int num_basis = static_cast(grad_basis_real_.extent(2)); for (int basis=0; basis < num_basis; ++basis) for (int dim1=0; dim1 < 3; ++dim1) diff --git a/packages/phalanx/example/FiniteElementAssembly/WorksetBuilder.hpp b/packages/phalanx/example/FiniteElementAssembly/WorksetBuilder.hpp index aab8d2e09ce0..42e4bb4d28d5 100644 --- a/packages/phalanx/example/FiniteElementAssembly/WorksetBuilder.hpp +++ b/packages/phalanx/example/FiniteElementAssembly/WorksetBuilder.hpp @@ -31,7 +31,7 @@ struct WorksetBuilder { void operator() (const CopyWorksetDetJac_Tag& , const team_t& team) const { const int cell = team.league_rank(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,mesh_det_jac.extent(1)), [=] (const int& qp) { + 
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,mesh_det_jac.extent(1)), [&] (const int& qp) { workset_det_jac(cell,qp) = mesh_det_jac(cell+first_cell_global_index,qp); //printf("det_jac=%f\n",workset.det_jac_(cell,qp)); }); @@ -41,7 +41,7 @@ struct WorksetBuilder { void operator() (const CopyWorksetGradBasisReal_Tag& , const team_t& team) const { const int cell = team.league_rank(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,mesh_det_jac.extent(1)), [=] (const int& qp) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,mesh_det_jac.extent(1)), [&] (const int& qp) { for (int basis=0; basis < static_cast(mesh_grad_basis_real.extent(2)); ++basis) for (int dim=0; dim < static_cast(mesh_grad_basis_real.extent(3)); ++dim) workset_grad_basis_real(cell,qp,basis,dim) = diff --git a/packages/phalanx/example/FiniteElementAssembly/evaluators/GatherSolution_Def.hpp b/packages/phalanx/example/FiniteElementAssembly/evaluators/GatherSolution_Def.hpp index 511117b15828..bc4ca937f18a 100644 --- a/packages/phalanx/example/FiniteElementAssembly/evaluators/GatherSolution_Def.hpp +++ b/packages/phalanx/example/FiniteElementAssembly/evaluators/GatherSolution_Def.hpp @@ -50,7 +50,7 @@ operator()(const Kokkos::TeamPolicy::member_type& team) const const int cell = team.league_rank(); if (team.team_rank() == 0) { // Fix gcc 5/6 lambda bug by changing to capture by value (potentially less efficient) - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,field.extent(1)), [=] (const int& node) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,field.extent(1)), [&] (const int& node) { field(cell,node) = x( gids(cell_global_offset_index+cell,node) * num_equations + field_index); }); } @@ -94,8 +94,7 @@ operator()(const Kokkos::TeamPolicy::member_type& team) const { const int cell = team.league_rank(); if (team.team_rank() == 0) { - // Fix gcc 5/6 lambda bug by changing to capture by value (potentially less efficient) - 
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,field.extent(1)), [=] (const int& node) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,field.extent(1)), [&] (const int& node) { field(cell,node).val() = x(gids(cell_global_offset_index+cell,node) * num_equations + field_index); field(cell,node).fastAccessDx(num_equations * node + field_index) = 1.0; }); diff --git a/packages/phalanx/example/FiniteElementAssembly/evaluators/ScatterResidual_Def.hpp b/packages/phalanx/example/FiniteElementAssembly/evaluators/ScatterResidual_Def.hpp index 1ddb6ffd7d82..71524568daa1 100644 --- a/packages/phalanx/example/FiniteElementAssembly/evaluators/ScatterResidual_Def.hpp +++ b/packages/phalanx/example/FiniteElementAssembly/evaluators/ScatterResidual_Def.hpp @@ -54,7 +54,7 @@ operator()(const Kokkos::TeamPolicy::member_type& team) const const int local_cell = team.league_rank(); if (team.team_rank() == 0) { // Fix gcc 5/6 lambda bug by changing to capture by value (potentially less efficient) - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,residual_contribution.extent(1)), [=] (const int& node) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,residual_contribution.extent(1)), [&] (const int& node) { const int residual_index = gids(cell_global_offset_index+local_cell,node) * num_equations + equation_index; global_residual_atomic(residual_index) += residual_contribution(local_cell,node); }); @@ -106,15 +106,13 @@ operator()(const Kokkos::TeamPolicy::member_type& team) const const int num_nodes = residual_contribution.extent(1); if (team.team_rank() == 0) { - // Fix gcc 5/6 lambda bug by changing to capture by value (potentially less efficient) - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,num_nodes), [=] (const int& node) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,num_nodes), [&] (const int& node) { const int global_row_index = gids(cell_global_offset_index+cell,node) * num_equations + equation_index; 
global_residual_atomic(global_row_index) += residual_contribution(cell,node).val(); }); } - // Fix gcc 5/6 lambda bug by changing to capture by value (potentially less efficient) - Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,num_nodes), [=] (const int& node) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,num_nodes), [&] (const int& node) { const int global_row_index = gids(cell_global_offset_index+cell,node) * num_equations + equation_index; @@ -122,8 +120,7 @@ operator()(const Kokkos::TeamPolicy::member_type& team) const for (int col_node=0; col_node < num_nodes; ++col_node) { // loop over equations - // Fix gcc 5/6 lambda bug by changing to capture by value (potentially less efficient) - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,num_equations),[=] (const int& col_eq) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,num_equations),[&] (const int& col_eq) { const int global_col_index = gids(cell_global_offset_index+cell,col_node) * num_equations + col_eq; const int derivative_index = col_node * num_equations + col_eq; global_jacobian.sumIntoValues(global_row_index,&global_col_index,1, diff --git a/packages/phalanx/example/FiniteElementAssembly_MixedFieldTypes/evaluators/GatherSolution_Def.hpp b/packages/phalanx/example/FiniteElementAssembly_MixedFieldTypes/evaluators/GatherSolution_Def.hpp index defeee2c940a..787f3eeaeefe 100644 --- a/packages/phalanx/example/FiniteElementAssembly_MixedFieldTypes/evaluators/GatherSolution_Def.hpp +++ b/packages/phalanx/example/FiniteElementAssembly_MixedFieldTypes/evaluators/GatherSolution_Def.hpp @@ -50,7 +50,7 @@ operator()(const Kokkos::TeamPolicy::member_type& team) const const int cell = team.league_rank(); if (team.team_rank() == 0) { // Fix gcc 5/6 lambda bug by changing to capture by value (potentially less efficient) - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,field.extent(1)), [=] (const int& node) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,field.extent(1)), [&] 
(const int& node) { field(cell,node) = x( gids(cell_global_offset_index+cell,node) * num_equations + field_index); }); } @@ -95,7 +95,7 @@ operator()(const Kokkos::TeamPolicy::member_type& team) const const int cell = team.league_rank(); if (team.team_rank() == 0) { // Fix gcc 5/6 lambda bug by changing to capture by value (potentially less efficient) - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,field.extent(1)), [=] (const int& node) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,field.extent(1)), [&] (const int& node) { field(cell,node).val() = x(gids(cell_global_offset_index+cell,node) * num_equations + field_index); field(cell,node).fastAccessDx(num_equations * node + field_index) = 1.0; }); diff --git a/packages/phalanx/example/FiniteElementAssembly_MixedFieldTypes/evaluators/ScatterResidual_Def.hpp b/packages/phalanx/example/FiniteElementAssembly_MixedFieldTypes/evaluators/ScatterResidual_Def.hpp index 1ddb6ffd7d82..72d729ae454b 100644 --- a/packages/phalanx/example/FiniteElementAssembly_MixedFieldTypes/evaluators/ScatterResidual_Def.hpp +++ b/packages/phalanx/example/FiniteElementAssembly_MixedFieldTypes/evaluators/ScatterResidual_Def.hpp @@ -53,8 +53,7 @@ operator()(const Kokkos::TeamPolicy::member_type& team) const { const int local_cell = team.league_rank(); if (team.team_rank() == 0) { - // Fix gcc 5/6 lambda bug by changing to capture by value (potentially less efficient) - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,residual_contribution.extent(1)), [=] (const int& node) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,residual_contribution.extent(1)), [&] (const int& node) { const int residual_index = gids(cell_global_offset_index+local_cell,node) * num_equations + equation_index; global_residual_atomic(residual_index) += residual_contribution(local_cell,node); }); @@ -106,15 +105,13 @@ operator()(const Kokkos::TeamPolicy::member_type& team) const const int num_nodes = residual_contribution.extent(1); if 
(team.team_rank() == 0) { - // Fix gcc 5/6 lambda bug by changing to capture by value (potentially less efficient) - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,num_nodes), [=] (const int& node) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,num_nodes), [&] (const int& node) { const int global_row_index = gids(cell_global_offset_index+cell,node) * num_equations + equation_index; global_residual_atomic(global_row_index) += residual_contribution(cell,node).val(); }); } - // Fix gcc 5/6 lambda bug by changing to capture by value (potentially less efficient) - Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,num_nodes), [=] (const int& node) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,num_nodes), [&] (const int& node) { const int global_row_index = gids(cell_global_offset_index+cell,node) * num_equations + equation_index; @@ -122,8 +119,7 @@ operator()(const Kokkos::TeamPolicy::member_type& team) const for (int col_node=0; col_node < num_nodes; ++col_node) { // loop over equations - // Fix gcc 5/6 lambda bug by changing to capture by value (potentially less efficient) - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,num_equations),[=] (const int& col_eq) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,num_equations),[&] (const int& col_eq) { const int global_col_index = gids(cell_global_offset_index+cell,col_node) * num_equations + col_eq; const int derivative_index = col_node * num_equations + col_eq; global_jacobian.sumIntoValues(global_row_index,&global_col_index,1, diff --git a/packages/phalanx/test/EvaluatorUnitTester/AllRanksEvaluator_Def.hpp b/packages/phalanx/test/EvaluatorUnitTester/AllRanksEvaluator_Def.hpp index b69110b58d97..fd69b0154179 100644 --- a/packages/phalanx/test/EvaluatorUnitTester/AllRanksEvaluator_Def.hpp +++ b/packages/phalanx/test/EvaluatorUnitTester/AllRanksEvaluator_Def.hpp @@ -60,7 +60,7 @@ operator()(const Kokkos::TeamPolicy::member_type& team) const { const int i = team.league_rank(); - 
Kokkos::single(Kokkos::PerTeam(team), [=] () { + Kokkos::single(Kokkos::PerTeam(team), [&] () { x1(i) = f1(i) * f1(i); }); diff --git a/packages/phalanx/test/EvaluatorUnitTester/DuplicateFieldEvaluator_Def.hpp b/packages/phalanx/test/EvaluatorUnitTester/DuplicateFieldEvaluator_Def.hpp index 6c17b2eee83f..c6301328f516 100644 --- a/packages/phalanx/test/EvaluatorUnitTester/DuplicateFieldEvaluator_Def.hpp +++ b/packages/phalanx/test/EvaluatorUnitTester/DuplicateFieldEvaluator_Def.hpp @@ -42,7 +42,7 @@ operator()(const Kokkos::TeamPolicy::member_type& team) const const int cell = team.league_rank(); const int num_qp = static_cast(a.extent(1)); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,num_qp), [=] (const int& qp) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,num_qp), [&] (const int& qp) { const int num_dim = static_cast(c.extent(2)); a(cell,qp) = 0.0; for (int i = 0; i < num_dim; ++i) diff --git a/packages/phalanx/test/EvaluatorUnitTester/SimpleEvaluator_Def.hpp b/packages/phalanx/test/EvaluatorUnitTester/SimpleEvaluator_Def.hpp index 50b8eac74e77..847422c9ce6f 100644 --- a/packages/phalanx/test/EvaluatorUnitTester/SimpleEvaluator_Def.hpp +++ b/packages/phalanx/test/EvaluatorUnitTester/SimpleEvaluator_Def.hpp @@ -40,7 +40,7 @@ operator()(const Kokkos::TeamPolicy::member_type& team) const const int cell = team.league_rank(); const int num_qp = static_cast(a.extent(1)); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,num_qp), [=] (const int& qp) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,num_qp), [&] (const int& qp) { const int num_dim = static_cast(c.extent(2)); a(cell,qp) = 0.0; for (int i = 0; i < num_dim; ++i) diff --git a/packages/phalanx/test/Kokkos/tKokkos.cpp b/packages/phalanx/test/Kokkos/tKokkos.cpp index b42e8355e4b3..560fffcae102 100644 --- a/packages/phalanx/test/Kokkos/tKokkos.cpp +++ b/packages/phalanx/test/Kokkos/tKokkos.cpp @@ -954,7 +954,7 @@ namespace phalanx_test { n.count_ += 1; n.mean_ += ( a(i) - 
n_minus_one.mean_ ) / n.count_; n.M2_ += ( a(i) - n_minus_one.mean_ ) * ( a(i) - n.mean_ ); - success_local = Kokkos::atomic_compare_exchange_strong(&(values()),n_minus_one,n); + success_local = (n_minus_one == Kokkos::atomic_compare_exchange(&(values()),n_minus_one,n)); } while (!success_local); }); PHX::Device().fence(); diff --git a/packages/phalanx/test/Kokkos/tKokkosNestedLambda.cpp b/packages/phalanx/test/Kokkos/tKokkosNestedLambda.cpp index 1a5b2b7bd2d2..a744c5d1631c 100644 --- a/packages/phalanx/test/Kokkos/tKokkosNestedLambda.cpp +++ b/packages/phalanx/test/Kokkos/tKokkosNestedLambda.cpp @@ -34,9 +34,9 @@ class MyFunctor { { const int cell = team.league_rank(); const int num_pts = a_.extent(1); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,num_pts), [=] (const int& pt) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,num_pts), [&] (const int& pt) { const int num_eq = a_.extent(2); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,num_eq), [=] (const int& eq) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,num_eq), [&] (const int& eq) { c_(cell,pt,eq) = a_(cell,pt,eq) + b_(cell,pt,eq); }); }); diff --git a/packages/phalanx/test/Kokkos/tKokkosPerf.cpp b/packages/phalanx/test/Kokkos/tKokkosPerf.cpp index abf9001649d9..4c5d25cb9dbf 100644 --- a/packages/phalanx/test/Kokkos/tKokkosPerf.cpp +++ b/packages/phalanx/test/Kokkos/tKokkosPerf.cpp @@ -82,7 +82,7 @@ namespace phalanx_test { { const int i = thread.league_rank(); const int num_qp = rho_.extent(1); - Kokkos::parallel_for(Kokkos::TeamThreadRange(thread,0,num_qp), [=] (const int& ip) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(thread,0,num_qp), [&] (const int& ip) { rho_(i,ip) = k_(0) * P_(i,ip) / T_(i,ip); }); } diff --git a/packages/phalanx/test/Kokkos/tKokkosVirtualFunctionOnDevice.cpp b/packages/phalanx/test/Kokkos/tKokkosVirtualFunctionOnDevice.cpp index 5f5aca03b35b..07ffee0b5340 100644 --- a/packages/phalanx/test/Kokkos/tKokkosVirtualFunctionOnDevice.cpp +++ 
b/packages/phalanx/test/Kokkos/tKokkosVirtualFunctionOnDevice.cpp @@ -46,12 +46,13 @@ namespace phalanx_test { // Derived class class IdealGasLaw : public EquationOfState { - double mass_; // mass + // double mass_; // mass double gamma_; // ratio of specific heats - double r_; // Boltzmann constant + // double r_; // Boltzmann constant public: KOKKOS_FUNCTION - IdealGasLaw() : mass_(28.0), gamma_(5./3.), r_(1.38066e-23) {} + // IdealGasLaw() : mass_(28.0), gamma_(5./3.), r_(1.38066e-23) {} + IdealGasLaw() : gamma_(5./3.) {} KOKKOS_FUNCTION double a(const double& rho, diff --git a/packages/phalanx/test/UnmanagedFields/MDField/MDField_TestEvaluators_Def.hpp b/packages/phalanx/test/UnmanagedFields/MDField/MDField_TestEvaluators_Def.hpp index 0e6687c439cd..c24bdcfc3384 100644 --- a/packages/phalanx/test/UnmanagedFields/MDField/MDField_TestEvaluators_Def.hpp +++ b/packages/phalanx/test/UnmanagedFields/MDField/MDField_TestEvaluators_Def.hpp @@ -12,8 +12,8 @@ #include "Phalanx_DataLayout_MDALayout.hpp" #include "Phalanx_FieldTag_Tag.hpp" -class CELL; -class BASIS; +struct CELL; +struct BASIS; namespace PHX { diff --git a/packages/phalanx/test/Utilities/Evaluator_MockDAG_Def.hpp b/packages/phalanx/test/Utilities/Evaluator_MockDAG_Def.hpp index 4c6884f8cbdd..2ba4ec5ef18f 100644 --- a/packages/phalanx/test/Utilities/Evaluator_MockDAG_Def.hpp +++ b/packages/phalanx/test/Utilities/Evaluator_MockDAG_Def.hpp @@ -14,8 +14,8 @@ #include "Phalanx_FieldTag_Tag.hpp" #include "Phalanx_MDField.hpp" -class CELL; -class BASIS; +struct CELL; +struct BASIS; namespace PHX { diff --git a/packages/shylu/shylu_node/tacho/CMakeLists.txt b/packages/shylu/shylu_node/tacho/CMakeLists.txt index 2efce7b95211..239098f10aba 100644 --- a/packages/shylu/shylu_node/tacho/CMakeLists.txt +++ b/packages/shylu/shylu_node/tacho/CMakeLists.txt @@ -7,6 +7,12 @@ IF (Kokkos_ENABLE_CUDA) ENDIF() ENDIF() +IF (Kokkos_ENABLE_HIP) + IF (NOT (TPL_ENABLE_ROCBLAS AND TPL_ENABLE_ROCSPARSE AND TPL_ENABLE_ROCSOLVER)) + 
MESSAGE(FATAL_ERROR "Tacho can not be build with HIP without enabling ROCBLAS, ROCSPARSE, and ROCSOLVER TPLs. Please disable Tacho, or enable these three TPLs") + ENDIF() +ENDIF() + IF (Kokkos_ENABLE_THREADS) IF (NOT Kokkos_ENABLE_OPENMP) MESSAGE(FATAL_ERROR "Tacho can not be build with Pthreads as the Kokkos Host Backend.") diff --git a/packages/tpetra/core/src/Tpetra_Core.cpp b/packages/tpetra/core/src/Tpetra_Core.cpp index 51680bb8f75d..4dfd077a3796 100644 --- a/packages/tpetra/core/src/Tpetra_Core.cpp +++ b/packages/tpetra/core/src/Tpetra_Core.cpp @@ -19,6 +19,7 @@ #include "Tpetra_Details_checkLaunchBlocking.hpp" #include "Tpetra_Details_KokkosTeuchosTimerInjection.hpp" #include "Tpetra_Details_Behavior.hpp" +#include "KokkosKernels_EagerInitialize.hpp" namespace Tpetra { @@ -129,6 +130,9 @@ namespace Tpetra { (! kokkosIsInitialized, std::logic_error, "At the end of " "initKokkosIfNeeded, Kokkos is not initialized. " "Please report this bug to the Tpetra developers."); + // Now that the Kokkos backend(s) are initialized, + // initialize all KokkosKernels TPLs. + KokkosKernels::eager_initialize(); } #ifdef HAVE_TPETRACORE_MPI