diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 76c4acc93a7b..8145908d28ac 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -35,17 +35,22 @@ jobs: # We specifically DO NOT exempt PR's from autoclosing. #exempt-pr-labels: '' remove-stale-when-updated: true - operations-per-run: 55 + operations-per-run: 70 stale-issue-message: > This issue has had no activity for **365** days and is marked for closure. It will be closed after an additional **30** days of inactivity. - If you would like to keep this issue open please add a comment and remove + If you would like to keep this issue open please add a comment and/or remove the `MARKED_FOR_CLOSURE` label. If this issue should be kept open even with no activity beyond the time limits you can add the label `DO_NOT_AUTOCLOSE`. + If it is ok for this issue to be closed, feel free to go ahead and close it. + Please **do not** add any comments or change any labels or otherwise touch + this issue unless your intention is to reset the inactivity counter for an + additional year. + close-issue-message: > This issue was closed due to inactivity for **395** days. @@ -57,9 +62,14 @@ jobs: considered to be abandoned and will be automatically closed after **30** additional days of inactivity from when it was marked inactive. - If this should be kept open, please post a comment and remove the + If this should be kept open, please post a comment and/or remove the label `MARKED_FOR_CLOSURE` to reset the inactivity timer. + If it is ok for this pull request to be closed, feel free to go ahead and close it. + Please **do not** add any comments or change any labels or otherwise touch + this pull request unless your intention is to reset the inactivity counter for an + additional year. + close-pr-message: > This Pull Request has been automatically closed due to **395** days of inactivity. 
diff --git a/cmake/ctest/drivers/ascicgpu031/CMakeLists.txt b/cmake/ctest/drivers/ascicgpu031/CMakeLists.txt index d1d004d8ba75..99e07ec7c074 100644 --- a/cmake/ctest/drivers/ascicgpu031/CMakeLists.txt +++ b/cmake/ctest/drivers/ascicgpu031/CMakeLists.txt @@ -1,30 +1,5 @@ TRILINOS_DRIVER_SETUP() -TRILINOS_DRIVER_ADD_DASHBOARD( - RELEASE_CUDA_NOUVM_NODEPRECATED_DUALVIEW - ctest_linux_nightly_mpi_release_nouvm_ascicgpu031.cmake - CTEST_INSTALLER_TYPE release - RUN_SERIAL - TIMEOUT_MINUTES 420 - ) - - -TRILINOS_DRIVER_ADD_DASHBOARD( - RELEASE_CUDA_UVM_NODEPRECATED_DUALVIEW - ctest_linux_nightly_mpi_release_uvm_nodeprecated_ascicgpu031.cmake - CTEST_INSTALLER_TYPE release - RUN_SERIAL - TIMEOUT_MINUTES 420 - ) - -TRILINOS_DRIVER_ADD_DASHBOARD( - RELEASE_CUDA_UVM_DEPRECATED_DUALVIEW - ctest_linux_nightly_mpi_release_uvm_deprecated_ascicgpu031.cmake - CTEST_INSTALLER_TYPE release - RUN_SERIAL - TIMEOUT_MINUTES 420 - ) - TRILINOS_DRIVER_ADD_DASHBOARD( DEBUG_CUDA_NOUVM_NODEPRECATED_CRS ctest_linux_nightly_mpi_debug_nouvm_ascicgpu031_crs.cmake diff --git a/cmake/ctest/drivers/ascicgpu031/TrilinosCTestDriverCore.ascicgpu031.gcc-cuda.cmake b/cmake/ctest/drivers/ascicgpu031/TrilinosCTestDriverCore.ascicgpu031.gcc-cuda.cmake deleted file mode 100644 index 4b9fc54beacc..000000000000 --- a/cmake/ctest/drivers/ascicgpu031/TrilinosCTestDriverCore.ascicgpu031.gcc-cuda.cmake +++ /dev/null @@ -1,165 +0,0 @@ -# @HEADER -# ************************************************************************ -# -# Trilinos: An Object-Oriented Solver Framework -# Copyright (2001) Sandia Corporation -# -# -# Copyright (2001) Sandia Corporation. Under the terms of Contract -# DE-AC04-94AL85000, there is a non-exclusive license for use of this -# work by or on behalf of the U.S. Government. Export of this program -# may require a license from the United States Government. -# -# 1. 
Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# -# 3. Neither the name of the Corporation nor the names of the -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -# NOTICE: The United States Government is granted for itself and others -# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide -# license in this data to reproduce, prepare derivative works, and -# perform publicly and display publicly. Beginning five (5) years from -# July 25, 2001, the United States Government is granted for itself and -# others acting on its behalf a paid-up, nonexclusive, irrevocable -# worldwide license in this data to reproduce, prepare derivative works, -# distribute copies to the public, perform publicly and display -# publicly, and to permit others to do so. 
-# -# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT -# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES -# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR -# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY -# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS -# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS. -# -# ************************************************************************ -# @HEADER - - -INCLUDE("${CTEST_SCRIPT_DIRECTORY}/../../TrilinosCTestDriverCore.cmake") - -# -# Platform/compiler specific options for geminga using gcc -# - -MACRO(TRILINOS_SYSTEM_SPECIFIC_CTEST_DRIVER) - - # Base of Trilinos/cmake/ctest then BUILD_DIR_NAME - IF(COMM_TYPE STREQUAL MPI) - string(TOUPPER $ENV{SIERRA_MPI} UC_MPI_NAME) - SET(BUILD_DIR_NAME ${UC_MPI_NAME}_${BUILD_TYPE}_${BUILD_NAME_DETAILS}) - ELSE() - SET(BUILD_DIR_NAME ${COMM_TYPE}-${BUILD_TYPE}_${BUILD_NAME_DETAILS}) - ENDIF() - - SET(Trilinos_REPOSITORY_LOCATION_NIGHTLY_DEFAULT "git@gitlab-ex.sandia.gov:trilinos-project/Trilinos.git") - SET(Trilinos_BRANCH "TpetraDualViewRefactor" ) - - SET(CTEST_DASHBOARD_ROOT "${TRILINOS_CMAKE_DIR}/../../${BUILD_DIR_NAME}" ) - SET(CTEST_NOTES_FILES "${CTEST_SCRIPT_DIRECTORY}/${CTEST_SCRIPT_NAME}" ) - SET(CTEST_BUILD_FLAGS "-j45 -i" ) - - SET_DEFAULT(CTEST_PARALLEL_LEVEL "45" ) - SET_DEFAULT(Trilinos_ENABLE_SECONDARY_TESTED_CODE ON) - SET(Trilinos_CTEST_DO_ALL_AT_ONCE FALSE) - SET_DEFAULT(Trilinos_EXCLUDE_PACKAGES ${EXTRA_EXCLUDE_PACKAGES} TriKota Optika Pamgen) - - # Select package disables - set (Trilinos_ENABLE_Gtest OFF CACHE BOOL "Gtest just does not build" FORCE) - set (Trilinos_ENABLE_ShyLU_NodeTacho OFF CACHE BOOL "Can't test Tacho with CUDA without RDC" FORCE) - set (Trilinos_ENABLE_Shards OFF CACHE BOOL "Shards does not build" FORCE) - set (Trilinos_ENABLE_Epetra OFF CACHE BOOL "We do not want Epetra" FORCE) - - SET(EXTRA_SYSTEM_CONFIGURE_OPTIONS - 
"-DCMAKE_BUILD_TYPE:STRING=${BUILD_TYPE}" - - # Adding the following as a possible fix for github issue #2115. - #KDD This flag appears to be unnecessary in April 2021, and it - #KDD breaks building of Zoltan tests - #KDD "-DCMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS:BOOL=ON" - - ### ALWAYS AND EVERYWHERE ### - "-DTrilinos_ENABLE_EXPLICIT_INSTANTIATION:BOOL=ON" - "-DBUILD_SHARED_LIBS:BOOL=ON" - "-DTrilinos_ENABLE_TESTS:BOOL=ON" - "-DTrilinos_ENABLE_EXAMPLES:BOOL=ON" - "-DTrilinos_ENABLE_DEPENDENCY_UNIT_TESTS:BOOL=OFF" - "-DTeuchos_GLOBALLY_REDUCE_UNITTEST_RESULTS:BOOL=ON" - - "-DTrilinos_ENABLE_COMPLEX=ON" - "-DTeuchos_ENABLE_COMPLEX=ON" - "-DTpetra_INST_COMPLEX_DOUBLE=ON" - - ### COMPILERS AND FLAGS ### - "-DCMAKE_CXX_FLAGS:STRING='-Wall -Wno-unknown-pragmas -Wno-unused-but-set-variable -Wno-inline -Wshadow'" - "-DTrilinos_ENABLE_Fortran:BOOL=OFF" - - ### TPLS ### - "-DTPL_ENABLE_CUDA:BOOL=ON" - "-DCMAKE_POLICY_DEFAULT_CMP0074=NEW" - "-DTPL_ENABLE_CUSPARSE:BOOL=ON" - "-DTPL_ENABLE_HWLOC:BOOL=OFF" - - - # Host Blas is required (https://github.com/kokkos/kokkos-kernels/issues/347) for Kokkos-Kernels to build correctly - "-DTPL_ENABLE_BLAS:BOOL=ON" - "-DTPL_ENABLE_LAPACK:BOOL=ON" - "-DTPL_BLAS_LIBRARIES=/usr/lib64/libblas.so" - "-DTPL_LAPACK_LIBRARIES=/usr/lib64/liblapack.so" - - ### PACKAGE CONFIGURATION ### - "-DKokkos_ENABLE_CUDA:BOOL=ON" - "-DKokkos_ENABLE_CUDA_LAMBDA:BOOL=ON" - "-DKokkos_ARCH_SKX:BOOL=ON" - "-DKokkos_ARCH_VOLTA70:BOOL=ON" - "-DKokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE=OFF" - - "-DTrilinos_ENABLE_Epetra:BOOL=OFF" - "-DTrilinos_ENABLE_Gtest:BOOL=OFF" - "-DTrilinos_ENABLE_Pamgen:BOOL=OFF" - "-DTrilinos_ENABLE_Shards:BOOL=OFF" - "-DTrilinos_ENABLE_ShyLU_Node:BOOL=OFF" - "-DTrilinos_ENABLE_ShyLU_NodeTacho:BOOL=OFF" - "-DTrilinos_ENABLE_ShyLU:BOOL=OFF" - "-DTrilinos_ENABLE_ShyLU_DD:BOOL=OFF" - "-DAmesos2_ENABLE_ShyLU_NodeTacho:BOOL=OFF" - "-DAmesos2_ENABLE_ShyLU_NodeBasker:BOOL=OFF" - - ### MISC ### - "-DCMAKE_VERBOSE_MAKEFILE:BOOL=ON" - ) - - 
SET_DEFAULT(COMPILER_VERSION "$ENV{SIERRA_PLATFORM}") - - # Ensure that MPI is on for all parallel builds that might be run. - IF(COMM_TYPE STREQUAL MPI) - - SET(EXTRA_SYSTEM_CONFIGURE_OPTIONS - ${EXTRA_SYSTEM_CONFIGURE_OPTIONS} - "-DTPL_ENABLE_MPI:BOOL=ON" - "-DMPI_BASE_DIR:PATH=$ENV{MPIHOME}" - ) - - ENDIF() - - TRILINOS_CTEST_DRIVER() - -ENDMACRO() diff --git a/cmake/ctest/drivers/ascicgpu031/ctest_linux_nightly_mpi_release_nouvm_ascicgpu031.cmake b/cmake/ctest/drivers/ascicgpu031/ctest_linux_nightly_mpi_release_nouvm_ascicgpu031.cmake deleted file mode 100644 index 2c009e7980da..000000000000 --- a/cmake/ctest/drivers/ascicgpu031/ctest_linux_nightly_mpi_release_nouvm_ascicgpu031.cmake +++ /dev/null @@ -1,98 +0,0 @@ -# @HEADER -# ************************************************************************ -# -# Trilinos: An Object-Oriented Solver Framework -# Copyright (2001) Sandia Corporation -# -# -# Copyright (2001) Sandia Corporation. Under the terms of Contract -# DE-AC04-94AL85000, there is a non-exclusive license for use of this -# work by or on behalf of the U.S. Government. Export of this program -# may require a license from the United States Government. -# -# 1. Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# -# 3. Neither the name of the Corporation nor the names of the -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -# NOTICE: The United States Government is granted for itself and others -# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide -# license in this data to reproduce, prepare derivative works, and -# perform publicly and display publicly. Beginning five (5) years from -# July 25, 2001, the United States Government is granted for itself and -# others acting on its behalf a paid-up, nonexclusive, irrevocable -# worldwide license in this data to reproduce, prepare derivative works, -# distribute copies to the public, perform publicly and display -# publicly, and to permit others to do so. -# -# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT -# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES -# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR -# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY -# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS -# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS. -# -# ************************************************************************ -# @HEADER - - -INCLUDE("${CTEST_SCRIPT_DIRECTORY}/TrilinosCTestDriverCore.ascicgpu031.gcc-cuda.cmake") - -# -# Set the options specific to this build case -# - -# The variable BUILD_DIR_NAME is based COMM_TYPE, BUILD_TYPE, and BUILD_NAME_DETAILS. 
-# Tribits creates the variable listed under "Build Name" by prepending the OS type and compiler -# details to BUILD_DIR_NAME. -SET(COMM_TYPE MPI) -SET(BUILD_TYPE RELEASE) -SET(BUILD_NAME_DETAILS NOUVM_NODEPRECATED_DUALVIEW) - -SET(CTEST_PARALLEL_LEVEL 8) -SET(CTEST_TEST_TYPE Nightly) -SET(Trilinos_TRACK Experimental) # Set the CDash track -SET(CTEST_TEST_TIMEOUT 900) - -SET(Trilinos_PACKAGES Amesos2 Belos Tpetra Ifpack2 MueLu Xpetra Zoltan2) - -SET(EXTRA_CONFIGURE_OPTIONS - ### TPLS ### - "-DTPL_ENABLE_SuperLU:BOOL=OFF" - "-DTPL_ENABLE_HWLOC:BOOL=OFF" - - ### PACKAGES CONFIGURATION ### - "-DTpetra_INST_INT_INT:BOOL=OFF" - "-DTpetra_INST_INT_LONG_LONG:BOOL=ON" - "-DTpetra_INST_COMPLEX_FLOAT:BOOL=OFF" - - "-DKokkos_ENABLE_CUDA_UVM:BOOL=OFF" - "-DTpetra_ENABLE_CUDA_UVM:BOOL=OFF" - - "-DTpetra_ENABLE_DEPRECATED_CODE:BOOL=OFF" - -) - -# -# Set the rest of the system-specific options and run the dashboard build/test -# - -TRILINOS_SYSTEM_SPECIFIC_CTEST_DRIVER() diff --git a/cmake/ctest/drivers/ascicgpu031/ctest_linux_nightly_mpi_release_uvm_deprecated_ascicgpu031.cmake b/cmake/ctest/drivers/ascicgpu031/ctest_linux_nightly_mpi_release_uvm_deprecated_ascicgpu031.cmake deleted file mode 100644 index ec785001e3ea..000000000000 --- a/cmake/ctest/drivers/ascicgpu031/ctest_linux_nightly_mpi_release_uvm_deprecated_ascicgpu031.cmake +++ /dev/null @@ -1,98 +0,0 @@ -# @HEADER -# ************************************************************************ -# -# Trilinos: An Object-Oriented Solver Framework -# Copyright (2001) Sandia Corporation -# -# -# Copyright (2001) Sandia Corporation. Under the terms of Contract -# DE-AC04-94AL85000, there is a non-exclusive license for use of this -# work by or on behalf of the U.S. Government. Export of this program -# may require a license from the United States Government. -# -# 1. Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# 2. 
Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# -# 3. Neither the name of the Corporation nor the names of the -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -# NOTICE: The United States Government is granted for itself and others -# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide -# license in this data to reproduce, prepare derivative works, and -# perform publicly and display publicly. Beginning five (5) years from -# July 25, 2001, the United States Government is granted for itself and -# others acting on its behalf a paid-up, nonexclusive, irrevocable -# worldwide license in this data to reproduce, prepare derivative works, -# distribute copies to the public, perform publicly and display -# publicly, and to permit others to do so. 
-# -# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT -# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES -# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR -# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY -# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS -# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS. -# -# ************************************************************************ -# @HEADER - - -INCLUDE("${CTEST_SCRIPT_DIRECTORY}/TrilinosCTestDriverCore.ascicgpu031.gcc-cuda.cmake") - -# -# Set the options specific to this build case -# - -# The variable BUILD_DIR_NAME is based COMM_TYPE, BUILD_TYPE, and BUILD_NAME_DETAILS. -# Tribits creates the variable listed under "Build Name" by prepending the OS type and compiler -# details to BUILD_DIR_NAME. -SET(COMM_TYPE MPI) -SET(BUILD_TYPE RELEASE) -SET(BUILD_NAME_DETAILS UVM_DEPRECATED_DUALVIEW) - -SET(CTEST_PARALLEL_LEVEL 8) -SET(CTEST_TEST_TYPE Nightly) -SET(Trilinos_TRACK Experimental) # Set the CDash track -SET(CTEST_TEST_TIMEOUT 900) - -SET(Trilinos_PACKAGES Amesos2 Belos Tpetra Ifpack2 MueLu Xpetra Zoltan2) - -SET(EXTRA_CONFIGURE_OPTIONS - ### TPLS ### - "-DTPL_ENABLE_SuperLU:BOOL=OFF" - "-DTPL_ENABLE_HWLOC:BOOL=OFF" - - ### PACKAGES CONFIGURATION ### - "-DTpetra_INST_INT_INT:BOOL=OFF" - "-DTpetra_INST_INT_LONG_LONG:BOOL=ON" - "-DTpetra_INST_COMPLEX_FLOAT:BOOL=OFF" - - "-DKokkos_ENABLE_CUDA_UVM:BOOL=ON" - "-DTpetra_ENABLE_CUDA_UVM:BOOL=ON" - - "-DTpetra_ENABLE_DEPRECATED_CODE:BOOL=ON" - -) - -# -# Set the rest of the system-specific options and run the dashboard build/test -# - -TRILINOS_SYSTEM_SPECIFIC_CTEST_DRIVER() diff --git a/cmake/ctest/drivers/ascicgpu031/ctest_linux_nightly_mpi_release_uvm_nodeprecated_ascicgpu031.cmake b/cmake/ctest/drivers/ascicgpu031/ctest_linux_nightly_mpi_release_uvm_nodeprecated_ascicgpu031.cmake deleted file mode 100644 index 113d6d89a98f..000000000000 --- 
a/cmake/ctest/drivers/ascicgpu031/ctest_linux_nightly_mpi_release_uvm_nodeprecated_ascicgpu031.cmake +++ /dev/null @@ -1,98 +0,0 @@ -# @HEADER -# ************************************************************************ -# -# Trilinos: An Object-Oriented Solver Framework -# Copyright (2001) Sandia Corporation -# -# -# Copyright (2001) Sandia Corporation. Under the terms of Contract -# DE-AC04-94AL85000, there is a non-exclusive license for use of this -# work by or on behalf of the U.S. Government. Export of this program -# may require a license from the United States Government. -# -# 1. Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# -# 3. Neither the name of the Corporation nor the names of the -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# -# NOTICE: The United States Government is granted for itself and others -# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide -# license in this data to reproduce, prepare derivative works, and -# perform publicly and display publicly. Beginning five (5) years from -# July 25, 2001, the United States Government is granted for itself and -# others acting on its behalf a paid-up, nonexclusive, irrevocable -# worldwide license in this data to reproduce, prepare derivative works, -# distribute copies to the public, perform publicly and display -# publicly, and to permit others to do so. -# -# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT -# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES -# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR -# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY -# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS -# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS. -# -# ************************************************************************ -# @HEADER - - -INCLUDE("${CTEST_SCRIPT_DIRECTORY}/TrilinosCTestDriverCore.ascicgpu031.gcc-cuda.cmake") - -# -# Set the options specific to this build case -# - -# The variable BUILD_DIR_NAME is based COMM_TYPE, BUILD_TYPE, and BUILD_NAME_DETAILS. -# Tribits creates the variable listed under "Build Name" by prepending the OS type and compiler -# details to BUILD_DIR_NAME. 
-SET(COMM_TYPE MPI) -SET(BUILD_TYPE RELEASE) -SET(BUILD_NAME_DETAILS UVM_NODEPRECATED_DUALVIEW) - -SET(CTEST_PARALLEL_LEVEL 8) -SET(CTEST_TEST_TYPE Nightly) -SET(Trilinos_TRACK Experimental) # Set the CDash track -SET(CTEST_TEST_TIMEOUT 900) - -SET(Trilinos_PACKAGES Amesos2 Belos Tpetra Ifpack2 MueLu Xpetra Zoltan2) - -SET(EXTRA_CONFIGURE_OPTIONS - ### TPLS ### - "-DTPL_ENABLE_SuperLU:BOOL=OFF" - "-DTPL_ENABLE_HWLOC:BOOL=OFF" - - ### PACKAGES CONFIGURATION ### - "-DTpetra_INST_INT_INT:BOOL=OFF" - "-DTpetra_INST_INT_LONG_LONG:BOOL=ON" - "-DTpetra_INST_COMPLEX_FLOAT:BOOL=OFF" - - "-DKokkos_ENABLE_CUDA_UVM:BOOL=ON" - "-DTpetra_ENABLE_CUDA_UVM:BOOL=ON" - - "-DTpetra_ENABLE_DEPRECATED_CODE:BOOL=OFF" - -) - -# -# Set the rest of the system-specific options and run the dashboard build/test -# - -TRILINOS_SYSTEM_SPECIFIC_CTEST_DRIVER() diff --git a/cmake/ctest/drivers/ascicgpu031/drakify-email.pl b/cmake/ctest/drivers/ascicgpu031/drakify-email.pl index 4783c6734189..1bd756b64f75 100755 --- a/cmake/ctest/drivers/ascicgpu031/drakify-email.pl +++ b/cmake/ctest/drivers/ascicgpu031/drakify-email.pl @@ -16,7 +16,7 @@ # If you want to reuse this script somewhere else, # this should be the only line you need to change -@packages=("Tpetra", "Amesos2","Ifpack2","Xpetra","MueLu","Zoltan2"); +@packages=("Tpetra", "Amesos2","Belos","Ifpack2","Xpetra","MueLu","Zoltan2","Thyra"); $num_packages = scalar @packages; diff --git a/cmake/std/PullRequestLinuxCuda10.1.105uvmTestingSettings.cmake b/cmake/std/PullRequestLinuxCuda10.1.105uvmTestingSettings.cmake index 656298349c27..02cea9c12f05 100644 --- a/cmake/std/PullRequestLinuxCuda10.1.105uvmTestingSettings.cmake +++ b/cmake/std/PullRequestLinuxCuda10.1.105uvmTestingSettings.cmake @@ -127,7 +127,6 @@ set (EpetraExt_inout_test_LL_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for set (EpetraExt_inout_test_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") set (Teko_testdriver_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for 
CUDA PR testing") set (Zoltan2_fix4785_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") -set (Intrepid2_unit-test_Discretization_Basis_HierarchicalBases_Hierarchical_Basis_Tests_MPI_1_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") # UVM = OFF @@ -135,7 +134,6 @@ set (Kokkos_ENABLE_CUDA_UVM OFF CACHE BOOL "Set by default for CUDA PR testing") set (Tpetra_ENABLE_CUDA_UVM OFF CACHE BOOL "Set by default for CUDA PR testing") # Turn off packages currently failing with UVM = OFF -set (Trilinos_ENABLE_Intrepid2 OFF CACHE BOOL "Turn off packages for non-UVM build") set (Trilinos_ENABLE_Panzer OFF CACHE BOOL "Turn off packages for non-UVM build") set (Trilinos_ENABLE_Stokhos OFF CACHE BOOL "Turn off packages for non-UVM build") set (Trilinos_ENABLE_TrilinosCouplings OFF CACHE BOOL "Turn off packages for non-UVM build") @@ -158,10 +156,132 @@ set (SEACAS_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") set (ShyLU_DD_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") set (STK_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") set (Teko_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") -set (Tpetra_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") set (Xpetra_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") set (Zoltan2_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") + +# Tpetra UVM = OFF tests +set (TpetraCore_BlockCrsMatrix_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_Bug5072_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_BlankRowBugTest_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_iallreduce_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_idot_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set 
(TpetraCore_CrsGraph_UnitTests0_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsGraph_UnitTests1_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsGraph_UnitTests_Swap_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsGraph_ReindexColumns_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_Issue601_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_Issue601_MPI_8_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsGraph_insertGlobalIndicesFiltered_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsGraph_PackUnpack_MPI_1_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsGraph_getNumDiags_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsGraph_UnpackIntoStaticGraph_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsGraph_StaticImportExport_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsGraph_UnpackMerge_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_UnitTests_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_UnitTests2_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_UnitTests3_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_UnitTests4_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_UnitTests_Swap_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_NonlocalAfterResume_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_LeftRightScale_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set 
(TpetraCore_CrsMatrix_2DRandomDist_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_WithGraph_Cuda_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_ReplaceDomainMapAndImporter_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_NonlocalSumInto_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_NonlocalSumInto_Ignore_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_Bug5978_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_Bug6069_1_MPI_3_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_Bug6069_2_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_Bug6171_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_ReplaceLocalValues_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_ReplaceDiagonal_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_MultipleFillCompletes_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_ReindexColumns_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_TransformValues_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_GetRowCopy_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_PackUnpack_MPI_1_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_Equilibration_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_StaticImportExport_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_sumIntoStaticProfileExtraSpace_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for 
non-UVM build") +set (TpetraCore_CrsMatrix_createDeepCopy_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_UnpackMerge_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_Bug7745_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_Bug8794_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_RemoveEmptyProcesses_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_Albany182_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_Distributor_CreateFromSendsAndRecvs_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_Distributor_CreateFromSendsAndRecvs_MPI_8_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_Issue1752_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_FECrsGraph_UnitTests_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_FECrsMatrix_UnitTests_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_FEMultiVector_UnitTests_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_FixedHashTableTest_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_computeOffsetsFromCounts_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_ImportExport_ImportConstructExpert_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_UnpackLongRows_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_ExportToStaticGraphCrsMatrix_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_ImportExport2_UnitTests_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_InOutTest_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for 
non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_simple_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_simple_MPI_3_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_simple_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_simple_MPI_6_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_simple_MPI_10_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_MPI_3_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_MPI_6_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_MPI_10_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_nodiag_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_nodiag_MPI_3_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_nodiag_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_nodiag_MPI_6_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_nodiag_MPI_10_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_simple_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for 
non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_simple_MPI_3_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_simple_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_simple_MPI_6_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_simple_MPI_10_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_rmat_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_rmat_MPI_3_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_rmat_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_rmat_MPI_6_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_rmat_MPI_10_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_simple_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_simple_MPI_3_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_simple_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_simple_MPI_6_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_simple_MPI_10_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_rmat_MPI_1_DISABLE ON CACHE BOOL "Turn off tests 
for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_rmat_MPI_3_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_rmat_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_rmat_MPI_6_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_rmat_MPI_10_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Tpetra_CrsGraph_InOutTest_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMarket_Operator_Test_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_MatrixMatrix_UnitTests_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_FECrs_MatrixMatrix_UnitTests_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_copyConvert_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_StaticView_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_RowMatrixTransposer_test_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_RowMatrixTransposer_UnitTests_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_CrsMatrix_transpose_sortedRows_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_lesson03_power_method_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_lesson05_redistribution_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_FEMAssembly_InsertGlobalIndicesFESP_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_FEMAssembly_InsertGlobalIndicesFESPKokkos_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set 
(TpetraCore_FEMAssembly_TotalElementLoopSP_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_FEMAssembly_TotalElementLoopSPKokkos_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_AdditiveSchwarzHalo_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_BlockCrsPerfTest_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_NewReaderExample_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_NewReaderExample_rmat_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_guide_power_method_1_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_guide_matrix_fill_1_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_guide_data_redist_1_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") +set (TpetraCore_EpetraRowMatrix_UnitTests_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") + + # ShyLU_DD UVM = OFF tests set (ShyLU_DDFROSch_test_thyra_xpetra_laplace_one_rank_TLP_IPOU_DIM3_TPETRA_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") set (ShyLU_DDFROSch_test_thyra_xpetra_laplace_one_rank_TLP_GDSW_DIM2_TPETRA_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") diff --git a/cmake/std/atdm/ATDMDevEnvSettings.cmake b/cmake/std/atdm/ATDMDevEnvSettings.cmake index 3735bd10b2d3..06a5356844f3 100644 --- a/cmake/std/atdm/ATDMDevEnvSettings.cmake +++ b/cmake/std/atdm/ATDMDevEnvSettings.cmake @@ -342,7 +342,7 @@ ENDIF() # Disable/hide deprecated code ATDM_SET_CACHE(Kokkos_ENABLE_DEPRECATED_CODE OFF CACHE BOOL) -ATDM_SET_CACHE(Tpetra_ENABLE_DEPRECATED_CODE OFF CACHE BOOL) +ATDM_SET_CACHE(Tpetra_ENABLE_DEPRECATED_CODE ON CACHE BOOL) ATDM_SET_CACHE(Belos_HIDE_DEPRECATED_CODE ON CACHE BOOL) ATDM_SET_CACHE(Epetra_HIDE_DEPRECATED_CODE ON CACHE BOOL) ATDM_SET_CACHE(Ifpack2_HIDE_DEPRECATED_CODE ON CACHE BOOL) diff 
--git a/cmake/std/atdm/contributed/blake/environment.sh b/cmake/std/atdm/contributed/blake/environment.sh index 64d6c31d3618..d5bff9e1f41e 100755 --- a/cmake/std/atdm/contributed/blake/environment.sh +++ b/cmake/std/atdm/contributed/blake/environment.sh @@ -99,7 +99,7 @@ fi if [[ "$ATDM_CONFIG_COMPILER" == "INTEL-18.1.163" ]] ; then module load devpack/20171203/openmpi/2.1.2/intel/18.1.163 - module swap cmake/3.9.0 cmake/3.12.3 + module swap cmake/3.9.0 cmake/3.19.3 module load ninja/1.7.2 module load python/2.7.13 @@ -120,7 +120,7 @@ if [[ "$ATDM_CONFIG_COMPILER" == "INTEL-18.1.163" ]] ; then export ATDM_CONFIG_MPI_POST_FLAGS="-bind-to;socket;-map-by;socket" elif [[ "$ATDM_CONFIG_COMPILER" == "GNU-7.2.0" ]] ; then module load devpack/20171203/openmpi/2.1.2/gcc/7.2.0 - module swap cmake/3.9.0 cmake/3.12.3 + module swap cmake/3.9.0 cmake/3.19.3 module load ninja/1.7.2 module load python/2.7.13 diff --git a/packages/amesos2/src/Amesos2_TpetraMultiVecAdapter_def.hpp b/packages/amesos2/src/Amesos2_TpetraMultiVecAdapter_def.hpp index d73a580c1f15..ec669a888e3b 100644 --- a/packages/amesos2/src/Amesos2_TpetraMultiVecAdapter_def.hpp +++ b/packages/amesos2/src/Amesos2_TpetraMultiVecAdapter_def.hpp @@ -91,7 +91,7 @@ namespace Amesos2 { typedef typename multivec_t::dual_view_type dual_view_type; typedef typename dual_view_type::host_mirror_space host_execution_space; mv_->template sync (); - auto contig_local_view_2d = mv_->template getLocalView(); + auto contig_local_view_2d = mv_->getLocalViewHost(); auto contig_local_view_1d = Kokkos::subview (contig_local_view_2d, Kokkos::ALL (), 0); return contig_local_view_1d.data(); } @@ -191,7 +191,7 @@ namespace Amesos2 { typedef typename dual_view_type::host_mirror_space host_execution_space; redist_mv.template sync < host_execution_space > (); - auto contig_local_view_2d = redist_mv.template getLocalView(); + auto contig_local_view_2d = redist_mv.getLocalViewHost(); if ( redist_mv.isConstantStride() ) { for ( size_t j = 0; j < 
num_vecs; ++j) { auto av_j = av(lda*j, lda); @@ -208,7 +208,7 @@ namespace Amesos2 { const size_t lclNumRows = redist_mv.getLocalLength(); for (size_t j = 0; j < redist_mv.getNumVectors(); ++j) { auto av_j = av(lda*j, lclNumRows); - auto X_lcl_j_2d = redist_mv.template getLocalView (); + auto X_lcl_j_2d = redist_mv.getLocalViewHost(); auto X_lcl_j_1d = Kokkos::subview (X_lcl_j_2d, Kokkos::ALL (), j); using val_type = typename decltype( X_lcl_j_1d )::value_type; @@ -425,7 +425,7 @@ namespace Amesos2 { if ( num_vecs == 1 && this->getComm()->getRank() == 0 && this->getComm()->getSize() == 1 ) { typedef typename multivec_t::dual_view_type::host_mirror_space host_execution_space; // num_vecs = 1; stride does not matter - auto mv_view_to_modify_2d = mv_->template getLocalView(); + auto mv_view_to_modify_2d = mv_->getLocalViewHost(); for ( size_t i = 0; i < lda; ++i ) { mv_view_to_modify_2d(i,0) = new_data[i]; // Only one vector } @@ -464,7 +464,7 @@ namespace Amesos2 { redist_mv.template modify< host_execution_space > (); if ( redist_mv.isConstantStride() ) { - auto contig_local_view_2d = redist_mv.template getLocalView(); + auto contig_local_view_2d = redist_mv.getLocalViewHost(); for ( size_t j = 0; j < num_vecs; ++j) { auto av_j = new_data(lda*j, lda); for ( size_t i = 0; i < lda; ++i ) { @@ -480,7 +480,7 @@ namespace Amesos2 { const size_t lclNumRows = redist_mv.getLocalLength(); for (size_t j = 0; j < redist_mv.getNumVectors(); ++j) { auto av_j = new_data(lda*j, lclNumRows); - auto X_lcl_j_2d = redist_mv.template getLocalView (); + auto X_lcl_j_2d = redist_mv.getLocalViewHost(); auto X_lcl_j_1d = Kokkos::subview (X_lcl_j_2d, Kokkos::ALL (), j); using val_type = typename decltype( X_lcl_j_1d )::value_type; @@ -535,7 +535,7 @@ namespace Amesos2 { // num_vecs = 1; stride does not matter // If this is the optimized path then kokkos_new_data will be the dst - auto mv_view_to_modify_2d = mv_->getLocalViewDevice(); + auto mv_view_to_modify_2d = 
mv_->getLocalViewDevice(Tpetra::Access::OverwriteAll); deep_copy_or_assign_view(mv_view_to_modify_2d, kokkos_new_data); } else { @@ -592,7 +592,7 @@ namespace Amesos2 { auto host_kokkos_new_data = Kokkos::create_mirror_view(kokkos_new_data); Kokkos::deep_copy(host_kokkos_new_data, kokkos_new_data); if ( redist_mv.isConstantStride() ) { - auto contig_local_view_2d = redist_mv.template getLocalView(); + auto contig_local_view_2d = redist_mv.getLocalViewHost(); for ( size_t j = 0; j < num_vecs; ++j) { auto av_j = Kokkos::subview(host_kokkos_new_data, Kokkos::ALL, j); for ( size_t i = 0; i < lda; ++i ) { diff --git a/packages/belos/tpetra/src/BelosMultiVecTraits_Tpetra.hpp b/packages/belos/tpetra/src/BelosMultiVecTraits_Tpetra.hpp index e0c94c5d66e1..0330c4b56e0f 100644 --- a/packages/belos/tpetra/src/BelosMultiVecTraits_Tpetra.hpp +++ b/packages/belos/tpetra/src/BelosMultiVecTraits_Tpetra.hpp @@ -409,6 +409,12 @@ namespace Belos { mv.multiply (Teuchos::NO_TRANS, Teuchos::NO_TRANS, alpha, A, B_mv, beta); } + Kokkos::fence(); // Belos with Thyra's MvTimesMatAddMv allowed failures + // when fence was not applied after mv.multiply; + // adding the fence fixed the tests in Thyra. + // Out of an abundance of caution (and with blessing + // from @hkthorn), we add the fence here as well. + // #8821 KDD } /// \brief mv := alpha*A + beta*B diff --git a/packages/belos/tpetra/src/solvers/Belos_Tpetra_GmresSstep.hpp b/packages/belos/tpetra/src/solvers/Belos_Tpetra_GmresSstep.hpp index 85234e2dcba9..6ab2d4705851 100644 --- a/packages/belos/tpetra/src/solvers/Belos_Tpetra_GmresSstep.hpp +++ b/packages/belos/tpetra/src/solvers/Belos_Tpetra_GmresSstep.hpp @@ -76,13 +76,8 @@ class CholQR { // A^T * A = R^T * R, where R is ncols by ncols upper // triangular. 
int info = 0; - if (R_mv.need_sync_host()) { - // sync R to host before modifying it in place on host - R_mv.sync_host(); - } - R_mv.modify_host (); { - auto R_h = R_mv.getLocalViewHost (); + auto R_h = R_mv.getLocalViewHost (Tpetra::Access::ReadWrite); int ldr = int (R_h.extent (0)); SC *Rdata = reinterpret_cast (R_h.data ()); lapack.POTRF ('U', ncols, Rdata, ldr, &info); @@ -114,11 +109,9 @@ class CholQR { // triangle of R. // Compute A_cur / R (Matlab notation for A_cur * R^{-1}) in place. - A.sync_device (); - A.modify_device (); { - auto A_d = A.getLocalViewDevice (); - auto R_d = R_mv.getLocalViewDevice (); + auto A_d = A.getLocalViewDevice (Tpetra::Access::ReadWrite); + auto R_d = R_mv.getLocalViewDevice (Tpetra::Access::ReadOnly); KokkosBlas::trsm ("R", "U", "N", "N", one, R_d, A_d); } diff --git a/packages/ifpack2/example/RelaxationWithEquilibration.cpp b/packages/ifpack2/example/RelaxationWithEquilibration.cpp index a8fa847d4f10..c045c27ed439 100644 --- a/packages/ifpack2/example/RelaxationWithEquilibration.cpp +++ b/packages/ifpack2/example/RelaxationWithEquilibration.cpp @@ -171,12 +171,12 @@ deep_copy (Epetra_Vector& X_e, host_view_type X_e_lcl (X_e_lcl_raw, lclNumRows); if (X_t.need_sync_device ()) { - auto X_t_lcl_2d = X_t.getLocalViewHost (); + auto X_t_lcl_2d = X_t.getLocalViewHost (Tpetra::Access::ReadOnly); auto X_t_lcl = Kokkos::subview (X_t_lcl_2d, Kokkos::ALL (), 0); Kokkos::deep_copy (X_e_lcl, X_t_lcl); } else { - auto X_t_lcl_2d = X_t.getLocalViewDevice (); + auto X_t_lcl_2d = X_t.getLocalViewDevice (Tpetra::Access::ReadOnly); auto X_t_lcl = Kokkos::subview (X_t_lcl_2d, Kokkos::ALL (), 0); Kokkos::deep_copy (X_e_lcl, X_t_lcl); } @@ -229,14 +229,12 @@ deep_copy (Tpetra::Vector& X_t, host_view_type X_e_lcl (X_e_lcl_raw, lclNumRows); if (X_t.need_sync_device ()) { - X_t.modify_host (); - auto X_t_lcl_2d = X_t.getLocalViewHost (); + auto X_t_lcl_2d = X_t.getLocalViewHost (Tpetra::Access::OverwriteAll); auto X_t_lcl = Kokkos::subview (X_t_lcl_2d, 
Kokkos::ALL (), 0); Kokkos::deep_copy (X_t_lcl, X_e_lcl); } else { - X_t.modify_device (); - auto X_t_lcl_2d = X_t.getLocalViewDevice (); + auto X_t_lcl_2d = X_t.getLocalViewDevice (Tpetra::Access::OverwriteAll); auto X_t_lcl = Kokkos::subview (X_t_lcl_2d, Kokkos::ALL (), 0); Kokkos::deep_copy (X_t_lcl, X_e_lcl); } @@ -546,11 +544,9 @@ typename MV::dot_type accurate_dot (const MV& X, const MV& Y) using dot_type = typename MV::dot_type; const LO lclNumRows = X.getLocalLength (); - const_cast (X).sync_host (); - auto X_lcl_2d = X.getLocalViewHost(); + auto X_lcl_2d = X.getLocalViewHost(Tpetra::Access::ReadOnly); auto X_lcl = Kokkos::subview (X_lcl_2d, Kokkos::ALL (), 0); - const_cast (Y).sync_host (); - auto Y_lcl_2d = Y.getLocalViewHost(); + auto Y_lcl_2d = Y.getLocalViewHost(Tpetra::Access::ReadOnly); auto Y_lcl = Kokkos::subview (Y_lcl_2d, Kokkos::ALL (), 0); long double sum = 0.0; @@ -560,9 +556,6 @@ typename MV::dot_type accurate_dot (const MV& X, const MV& Y) sum = std::fma (x_i, y_i, sum); } - const_cast (X).sync_device (); - const_cast (Y).sync_device (); - return dot_type (sum); } @@ -952,12 +945,10 @@ copyGatheredMultiVector (Tpetra::MultiVector& X, { using dense_matrix_type = HostDenseMatrix; - X.sync_host (); - auto X_lcl = X.getLocalViewHost (); + auto X_lcl = X.getLocalViewHost (Tpetra::Access::ReadOnly); dense_matrix_type X_copy (label, X.getLocalLength (), X.getNumVectors ()); Kokkos::deep_copy (X_copy, X_lcl); - X.sync_device (); return X_copy; } @@ -1324,12 +1315,7 @@ elementWiseMultiplyMultiVector (MultiVectorType& X, using index_type = typename MultiVectorType::local_ordinal_type; const index_type lclNumRows = static_cast (X.getLocalLength ()); - if (X.need_sync_device ()) { - X.sync_device (); - } - X.modify_device (); - - auto X_lcl = X.getLocalViewDevice (); + auto X_lcl = X.getLocalViewDevice (Tpetra::Access::ReadWrite); if (static_cast (X.getNumVectors ()) == std::size_t (1)) { using pair_type = Kokkos::pair; auto X_lcl_1d = Kokkos::subview 
(X_lcl, pair_type (0, lclNumRows), 0); @@ -1527,12 +1513,7 @@ elementWiseDivideMultiVector (MultiVectorType& X, using index_type = typename MultiVectorType::local_ordinal_type; const index_type lclNumRows = static_cast (X.getLocalLength ()); - if (X.need_sync_device ()) { - X.sync_device (); - } - X.modify_device (); - - auto X_lcl = X.getLocalViewDevice (); + auto X_lcl = X.getLocalViewDevice (Tpetra::Access::ReadWrite); if (static_cast (X.getNumVectors ()) == std::size_t (1)) { using pair_type = Kokkos::pair; auto X_lcl_1d = Kokkos::subview (X_lcl, pair_type (0, lclNumRows), 0); diff --git a/packages/ifpack2/src/Ifpack2_BandedContainer_decl.hpp b/packages/ifpack2/src/Ifpack2_BandedContainer_decl.hpp index d221ba72ff01..d36b348160f9 100644 --- a/packages/ifpack2/src/Ifpack2_BandedContainer_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_BandedContainer_decl.hpp @@ -136,6 +136,7 @@ class BandedContainer using typename Container::HostView; using typename ContainerImpl::HostSubviewLocal; + using typename ContainerImpl::ConstHostSubviewLocal; using HostViewLocal = typename local_mv_type::dual_view_type::t_host; static_assert(std::is_same void BandedContainer:: -solveBlock(HostSubviewLocal X, +solveBlock(ConstHostSubviewLocal X, HostSubviewLocal Y, int blockIndex, Teuchos::ETransp mode, diff --git a/packages/ifpack2/src/Ifpack2_BlockRelaxation_def.hpp b/packages/ifpack2/src/Ifpack2_BlockRelaxation_def.hpp index 29a3b26c33d7..bdd55f2079a0 100644 --- a/packages/ifpack2/src/Ifpack2_BlockRelaxation_def.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockRelaxation_def.hpp @@ -496,10 +496,7 @@ apply (const Tpetra::MultiVector X_copy; { - auto X_lcl_host = X.getLocalViewHost (); - auto Y_lcl_host = Y.getLocalViewHost (); - - if (X_lcl_host.data () == Y_lcl_host.data ()) { + if (X.aliases(Y)) { X_copy = rcp (new MV (X, Teuchos::Copy)); } else { X_copy = rcpFromRef (X); @@ -804,20 +801,18 @@ BlockRelaxation:: ApplyInverseJacobi (const MV& X, MV& Y) const { const size_t NumVectors = 
X.getNumVectors (); - auto XView = X.getLocalViewHost (); - auto YView = Y.getLocalViewHost (); MV AY (Y.getMap (), NumVectors); - auto AYView = AY.getLocalViewHost (); - // Initial matvec not needed int starting_iteration = 0; if (OverlapLevel_ > 0) { //Overlapping jacobi, with view of W_ - auto WView = W_->getLocalViewHost (); + auto WView = W_->getLocalViewHost (Tpetra::Access::ReadOnly); if(ZeroStartingSolution_) { + auto XView = X.getLocalViewHost (Tpetra::Access::ReadOnly); + auto YView = Y.getLocalViewHost (Tpetra::Access::ReadWrite); Container_->DoOverlappingJacobi(XView, YView, WView, DampingFactor_); starting_iteration = 1; } @@ -826,7 +821,11 @@ ApplyInverseJacobi (const MV& X, MV& Y) const { applyMat (Y, AY); AY.update (ONE, X, -ONE); - Container_->DoOverlappingJacobi (AYView, YView, WView, DampingFactor_); + { + auto AYView = AY.getLocalViewHost (Tpetra::Access::ReadOnly); + auto YView = Y.getLocalViewHost (Tpetra::Access::ReadWrite); + Container_->DoOverlappingJacobi (AYView, YView, WView, DampingFactor_); + } } } else @@ -834,6 +833,8 @@ ApplyInverseJacobi (const MV& X, MV& Y) const //Non-overlapping if(ZeroStartingSolution_) { + auto XView = X.getLocalViewHost (Tpetra::Access::ReadOnly); + auto YView = Y.getLocalViewHost (Tpetra::Access::ReadWrite); Container_->DoJacobi (XView, YView, DampingFactor_); starting_iteration = 1; } @@ -842,7 +843,11 @@ ApplyInverseJacobi (const MV& X, MV& Y) const { applyMat (Y, AY); AY.update (ONE, X, -ONE); - Container_->DoJacobi (AYView, YView, DampingFactor_); + { + auto AYView = AY.getLocalViewHost (Tpetra::Access::ReadOnly); + auto YView = Y.getLocalViewHost (Tpetra::Access::ReadWrite); + Container_->DoJacobi (AYView, YView, DampingFactor_); + } } } } @@ -856,8 +861,8 @@ ApplyInverseGS (const MV& X, MV& Y) const using Teuchos::ptr; size_t numVecs = X.getNumVectors(); //Get view of X (is never modified in this function) - auto XView = X.getLocalViewHost (); - auto YView = Y.getLocalViewHost (); + auto XView = 
X.getLocalViewHost (Tpetra::Access::ReadOnly); + auto YView = Y.getLocalViewHost (Tpetra::Access::ReadWrite); //Pre-import Y, if parallel Ptr Y2; bool deleteY2 = false; @@ -874,13 +879,13 @@ ApplyInverseGS (const MV& X, MV& Y) const { //do import once per sweep Y2->doImport(Y, *Importer_, Tpetra::INSERT); - auto Y2View = Y2->getLocalViewHost (); + auto Y2View = Y2->getLocalViewHost (Tpetra::Access::ReadWrite); Container_->DoGaussSeidel(XView, YView, Y2View, DampingFactor_); } } else { - auto Y2View = Y2->getLocalViewHost (); + auto Y2View = Y2->getLocalViewHost (Tpetra::Access::ReadWrite); for(int j = 0; j < NumSweeps_; ++j) { Container_->DoGaussSeidel(XView, YView, Y2View, DampingFactor_); @@ -898,8 +903,8 @@ ApplyInverseSGS (const MV& X, MV& Y) const using Teuchos::Ptr; using Teuchos::ptr; //Get view of X (is never modified in this function) - auto XView = X.getLocalViewHost (); - auto YView = Y.getLocalViewHost (); + auto XView = X.getLocalViewHost (Tpetra::Access::ReadOnly); + auto YView = Y.getLocalViewHost (Tpetra::Access::ReadWrite); //Pre-import Y, if parallel Ptr Y2; bool deleteY2 = false; @@ -916,13 +921,13 @@ ApplyInverseSGS (const MV& X, MV& Y) const { //do import once per sweep Y2->doImport(Y, *Importer_, Tpetra::INSERT); - auto Y2View = Y2->getLocalViewHost (); + auto Y2View = Y2->getLocalViewHost (Tpetra::Access::ReadWrite); Container_->DoSGS(XView, YView, Y2View, DampingFactor_); } } else { - auto Y2View = Y2->getLocalViewHost (); + auto Y2View = Y2->getLocalViewHost (Tpetra::Access::ReadWrite); for(int j = 0; j < NumSweeps_; ++j) { Container_->DoSGS(XView, YView, Y2View, DampingFactor_); diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_decl.hpp b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_decl.hpp index d65d8aed00dd..6cbcc0f839d7 100644 --- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_decl.hpp @@ -168,7 +168,9 @@ namespace Ifpack2 { typedef typename 
Container::import_type import_type; typedef typename Container::HostView host_view_type; + typedef typename Container::ConstHostView const_host_view_type; typedef host_view_type HostView; + typedef const_host_view_type ConstHostView; //typedef Tpetra::MultiVector local_mv_type; //typedef typename Kokkos::View HostViewLocal; @@ -336,7 +338,7 @@ namespace Ifpack2 { //! Compute Y := (1 - a) Y + a D^{-1} (X - R*Y). Not supported. Call //! applyInverseJacobi instead. void - apply (host_view_type X, + apply (const_host_view_type X, host_view_type Y, int blockIndex, Teuchos::ETransp mode = Teuchos::NO_TRANS, @@ -346,9 +348,9 @@ namespace Ifpack2 { //! Compute Y := alpha * diag(D) * M^{-1} (diag(D) * X) + beta*Y. Not //! supported. void - weightedApply (host_view_type X, + weightedApply (const_host_view_type X, host_view_type Y, - host_view_type W, + const_host_view_type W, int blockIndex, Teuchos::ETransp mode = Teuchos::NO_TRANS, scalar_type alpha = Teuchos::ScalarTraits::one(), @@ -418,6 +420,7 @@ namespace Ifpack2 { typedef typename Container::import_type import_type; typedef typename Container::HostView host_view_type; + typedef typename Container::ConstHostView const_host_view_type; typedef typename Container::row_matrix_type row_matrix_type; static_assert (std::is_same::value, @@ -443,7 +446,7 @@ namespace Ifpack2 { int numSweeps = 1) const override {} void - apply (host_view_type X, + apply (const_host_view_type X, host_view_type Y, int blockIndex, Teuchos::ETransp mode = Teuchos::NO_TRANS, @@ -451,9 +454,9 @@ namespace Ifpack2 { scalar_type beta = Teuchos::ScalarTraits::zero()) const override {} void - weightedApply (host_view_type X, + weightedApply (const_host_view_type X, host_view_type Y, - host_view_type W, + const_host_view_type W, int blockIndex, Teuchos::ETransp mode = Teuchos::NO_TRANS, scalar_type alpha = Teuchos::ScalarTraits::one(), diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp 
b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp index fe6edfaea34b..201f82b393cb 100644 --- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp @@ -336,7 +336,7 @@ namespace Ifpack2 { template void BlockTriDiContainer - ::apply (HostView /* X */, HostView /* Y */, int /* blockIndex */, Teuchos::ETransp /* mode */, + ::apply (ConstHostView /* X */, HostView /* Y */, int /* blockIndex */, Teuchos::ETransp /* mode */, scalar_type /* alpha */, scalar_type /* beta */) const { TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "BlockTriDiContainer::apply is not implemented. You may have reached this message " @@ -347,7 +347,7 @@ namespace Ifpack2 { template void BlockTriDiContainer - ::weightedApply (HostView /* X */, HostView /* Y */, HostView /* D */, int /* blockIndex */, + ::weightedApply (ConstHostView /* X */, HostView /* Y */, ConstHostView /* D */, int /* blockIndex */, Teuchos::ETransp /* mode */, scalar_type /* alpha */, scalar_type /* beta */) const { TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "BlockTriDiContainer::weightedApply is not implemented."); diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp index c5121790ff6b..aad796ef8d8b 100644 --- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp @@ -2184,6 +2184,7 @@ namespace Ifpack2 { using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view; using vector_type_3d_view = typename impl_type::vector_type_3d_view; using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra; + using const_impl_scalar_type_2d_view_tpetra = typename impl_scalar_type_2d_view_tpetra::const_type; static constexpr int vector_length = impl_type::vector_length; using member_type = typename Kokkos::TeamPolicy::member_type; @@ -2200,7 +2201,7 @@ namespace Ifpack2 { // packed 
multivector output (or input) vector_type_3d_view packed_multivector; - impl_scalar_type_2d_view_tpetra scalar_multivector; + const_impl_scalar_type_2d_view_tpetra scalar_multivector; template KOKKOS_INLINE_FUNCTION @@ -2282,7 +2283,7 @@ namespace Ifpack2 { }); } - void run(const impl_scalar_type_2d_view_tpetra &scalar_multivector_) { + void run(const const_impl_scalar_type_2d_view_tpetra &scalar_multivector_) { IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN; IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::MultiVectorConverter"); @@ -3810,9 +3811,9 @@ namespace Ifpack2 { // wrap the workspace with 3d view vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors); - const auto XX = X.template getLocalView(); - const auto YY = Y.template getLocalView(); - const auto ZZ = Z.template getLocalView(); + const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly); + const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite); + const auto ZZ = Z.getLocalViewDevice(Tpetra::Access::ReadWrite); if (is_y_zero) Kokkos::deep_copy(YY, zero); MultiVectorConverter multivector_converter(interf, pmv); diff --git a/packages/ifpack2/src/Ifpack2_Chebyshev_def.hpp b/packages/ifpack2/src/Ifpack2_Chebyshev_def.hpp index 3d387b5153b0..195aced2b43b 100644 --- a/packages/ifpack2/src/Ifpack2_Chebyshev_def.hpp +++ b/packages/ifpack2/src/Ifpack2_Chebyshev_def.hpp @@ -460,15 +460,11 @@ applyImpl (const MV& X, // optimize for it by caching X_copy. RCP X_copy; bool copiedInput = false; - { - auto X_lcl_host = X.getLocalViewHost (); - auto Y_lcl_host = Y.getLocalViewHost (); - if (X_lcl_host.data () == Y_lcl_host.data ()) { - X_copy = rcp (new MV (X, Teuchos::Copy)); - copiedInput = true; - } else { - X_copy = rcpFromRef (X); - } + if (X.aliases(Y)) { + X_copy = rcp (new MV (X, Teuchos::Copy)); + copiedInput = true; + } else { + X_copy = rcpFromRef (X); } // If alpha != 1, fold alpha into (a deep copy of) X. 
diff --git a/packages/ifpack2/src/Ifpack2_Container_decl.hpp b/packages/ifpack2/src/Ifpack2_Container_decl.hpp index cdcc1c77b65a..82063ab27de8 100644 --- a/packages/ifpack2/src/Ifpack2_Container_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_Container_decl.hpp @@ -137,6 +137,7 @@ class Container : public Teuchos::Describable //! HostView (the host-space internal representation for Tpetra::Multivector) is the //! type of the vector arguments of DoJacobi, DoGaussSeidel, and DoSGS. using HostView = typename mv_type::dual_view_type::t_host; + using ConstHostView = typename HostView::const_type; public: /// \brief Constructor. @@ -204,10 +205,10 @@ class Container : public Teuchos::Describable /// before calling compute(). virtual void compute () = 0; - void DoJacobi(HostView X, HostView Y, SC dampingFactor) const; - void DoOverlappingJacobi(HostView X, HostView Y, HostView W, SC dampingFactor) const; - void DoGaussSeidel(HostView X, HostView Y, HostView Y2, SC dampingFactor) const; - void DoSGS(HostView X, HostView Y, HostView Y2, SC dampingFactor) const; + void DoJacobi(ConstHostView X, HostView Y, SC dampingFactor) const; + void DoOverlappingJacobi(ConstHostView X, HostView Y, ConstHostView W, SC dampingFactor) const; + void DoGaussSeidel(ConstHostView X, HostView Y, HostView Y2, SC dampingFactor) const; + void DoSGS(ConstHostView X, HostView Y, HostView Y2, SC dampingFactor) const; //! Set parameters, if any. virtual void setParameters (const Teuchos::ParameterList& List) = 0; @@ -225,7 +226,7 @@ class Container : public Teuchos::Describable /// Tpetra::Operator's method of the same name. This might require /// subclasses to mark some of their instance data as \c mutable. virtual void - apply(HostView X, + apply(ConstHostView X, HostView Y, int blockIndex, Teuchos::ETransp mode = Teuchos::NO_TRANS, @@ -234,9 +235,9 @@ class Container : public Teuchos::Describable //! Compute Y := alpha * diag(D) * M^{-1} (diag(D) * X) + beta*Y. 
virtual void - weightedApply(HostView X, + weightedApply(ConstHostView X, HostView Y, - HostView D, + ConstHostView D, int blockIndex, Teuchos::ETransp mode = Teuchos::NO_TRANS, SC alpha = Teuchos::ScalarTraits::one(), @@ -256,10 +257,10 @@ class Container : public Teuchos::Describable int /* numSweeps = 1 */) const = 0; //! Wrapper for apply with MultiVector - virtual void applyMV (mv_type& X, mv_type& Y) const; + virtual void applyMV (const mv_type& X, mv_type& Y) const; //! Wrapper for weightedApply with MultiVector - virtual void weightedApplyMV (mv_type& X, + virtual void weightedApplyMV (const mv_type& X, mv_type& Y, vector_type& W) const; @@ -275,7 +276,7 @@ class Container : public Teuchos::Describable protected: //! Do one step of Gauss-Seidel on block i (used by DoGaussSeidel and DoSGS) - virtual void DoGSBlock(HostView X, HostView Y, HostView Y2, HostView Resid, + virtual void DoGSBlock(ConstHostView X, HostView Y, HostView Y2, HostView Resid, SC dampingFactor, LO i) const; //! The input matrix to the constructor. @@ -365,8 +366,10 @@ class ContainerImpl : public Container using local_mv_type = Tpetra::MultiVector; using typename Container::HostView; + using typename Container::ConstHostView; using HostViewLocal = typename local_mv_type::dual_view_type::t_host; using HostSubviewLocal = Kokkos::View; + using ConstHostSubviewLocal = Kokkos::View; static_assert(std::is_same::value, "Ifpack2::Container: Please use MatrixType = Tpetra::RowMatrix."); @@ -435,7 +438,7 @@ class ContainerImpl : public Container /// Tpetra::Operator's method of the same name. This might require /// subclasses to mark some of their instance data as \c mutable. virtual void - apply(HostView X, + apply(ConstHostView X, HostView Y, int blockIndex, Teuchos::ETransp mode = Teuchos::NO_TRANS, @@ -444,9 +447,9 @@ class ContainerImpl : public Container //! Compute Y := alpha * diag(D) * M^{-1} (diag(D) * X) + beta*Y. 
virtual void - weightedApply(HostView X, + weightedApply(ConstHostView X, HostView Y, - HostView D, + ConstHostView D, int blockIndex, Teuchos::ETransp mode = Teuchos::NO_TRANS, SC alpha = Teuchos::ScalarTraits::one(), @@ -466,10 +469,10 @@ class ContainerImpl : public Container int /* numSweeps = 1 */) const; //! Wrapper for apply with MVs, used in unit tests (never called by BlockRelaxation) - void applyMV (mv_type& X, mv_type& Y) const; + void applyMV (const mv_type& X, mv_type& Y) const; //! Wrapper for weightedApply with MVs, used in unit tests (never called by BlockRelaxation) - void weightedApplyMV (mv_type& X, + void weightedApplyMV (const mv_type& X, mv_type& Y, vector_type& W) const; @@ -484,7 +487,7 @@ class ContainerImpl : public Container protected: //Do Gauss-Seidel on only block i (this is used by DoGaussSeidel and DoSGS) - void DoGSBlock(HostView X, HostView Y, HostView Y2, HostView Resid, + void DoGSBlock(ConstHostView X, HostView Y, HostView Y2, HostView Resid, SC dampingFactor, LO i) const; //! Exactly solves the linear system By = x, where B is a diagonal block matrix @@ -493,7 +496,7 @@ class ContainerImpl : public Container //! The Dense, Banded and TriDi containers all implement this and it is used in ContainerImpl::apply(). //! The Sparse and BlockTriDi containers have their own implementation of apply() and do not use solveBlock. 
virtual void - solveBlock(HostSubviewLocal X, + solveBlock(ConstHostSubviewLocal X, HostSubviewLocal Y, int blockIndex, Teuchos::ETransp mode, diff --git a/packages/ifpack2/src/Ifpack2_Container_def.hpp b/packages/ifpack2/src/Ifpack2_Container_def.hpp index 0889fc9fbdaa..b5c9b220a7af 100644 --- a/packages/ifpack2/src/Ifpack2_Container_def.hpp +++ b/packages/ifpack2/src/Ifpack2_Container_def.hpp @@ -173,14 +173,14 @@ bool Container::isComputed () const { template void Container:: -applyMV(mv_type& X, mv_type& Y) const +applyMV(const mv_type& X, mv_type& Y) const { TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Not implemented."); } template void Container:: -weightedApplyMV(mv_type& X, mv_type& Y, vector_type& W) const +weightedApplyMV(const mv_type& X, mv_type& Y, vector_type& W) const { TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Not implemented."); } @@ -193,14 +193,14 @@ getName() } template -void Container::DoGSBlock(HostView X, HostView Y, HostView Y2, HostView Resid, +void Container::DoGSBlock(ConstHostView X, HostView Y, HostView Y2, HostView Resid, SC dampingFactor, LO i) const { TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Not implemented."); } template -void Container::DoJacobi(HostView X, HostView Y, SC dampingFactor) const +void Container::DoJacobi(ConstHostView X, HostView Y, SC dampingFactor) const { using STS = Teuchos::ScalarTraits; const ISC one = STS::one(); @@ -220,7 +220,7 @@ void Container::DoJacobi(HostView X, HostView Y, SC dampingFactor) c { LO LRID = blockRows_[blockOffsets_[i]]; getMatDiag(); - HostView diagView = Diag_->getLocalViewHost(); + auto diagView = Diag_->getLocalViewHost(Tpetra::Access::ReadOnly); ISC d = one / diagView(LRID, 0); for(size_t nv = 0; nv < numVecs; nv++) { @@ -232,7 +232,7 @@ void Container::DoJacobi(HostView X, HostView Y, SC dampingFactor) c } template -void Container::DoOverlappingJacobi(HostView X, HostView Y, HostView W, SC dampingFactor) const +void Container::DoOverlappingJacobi(ConstHostView X, HostView Y, ConstHostView W, SC 
dampingFactor) const { using STS = Teuchos::ScalarTraits; // Overlapping Jacobi @@ -250,7 +250,7 @@ void Container::DoOverlappingJacobi(HostView X, HostView Y, HostView //This is used 3 times: once in DoGaussSeidel and twice in DoSGS template void ContainerImpl::DoGSBlock( - HostView X, HostView Y, HostView Y2, HostView Resid, + ConstHostView X, HostView Y, HostView Y2, HostView Resid, SC dampingFactor, LO i) const { using Teuchos::ArrayView; @@ -302,7 +302,7 @@ void ContainerImpl::DoGSBlock( // singleton, can't access Containers_[i] as it was never filled and may be null. // a singleton calculation (just using matrix diagonal) is exact, all residuals should be zero. LO LRID = this->blockOffsets_[i]; // by definition, a singleton 1 row in block. - HostView diagView = this->Diag_->getLocalViewHost(); + ConstHostView diagView = this->Diag_->getLocalViewHost(Tpetra::Access::ReadOnly); ISC d = one / diagView(LRID, 0); for(size_t m = 0; m < numVecs; m++) { @@ -377,7 +377,7 @@ void ContainerImpl::DoGSBlock( template void Container:: -DoGaussSeidel(HostView X, HostView Y, HostView Y2, SC dampingFactor) const +DoGaussSeidel(ConstHostView X, HostView Y, HostView Y2, SC dampingFactor) const { using Teuchos::Array; using Teuchos::ArrayRCP; @@ -410,7 +410,7 @@ DoGaussSeidel(HostView X, HostView Y, HostView Y2, SC dampingFactor) const template void Container:: -DoSGS(HostView X, HostView Y, HostView Y2, SC dampingFactor) const +DoSGS(ConstHostView X, HostView Y, HostView Y2, SC dampingFactor) const { // X = RHS, Y = initial guess using Teuchos::Array; @@ -489,22 +489,22 @@ applyInverseJacobi (const mv_type& /* X */, mv_type& /* Y */, template void ContainerImpl:: -applyMV (mv_type& X, mv_type& Y) const +applyMV (const mv_type& X, mv_type& Y) const { - HostView XView = X.getLocalViewHost(); - HostView YView = Y.getLocalViewHost(); + ConstHostView XView = X.getLocalViewHost(Tpetra::Access::ReadOnly); + HostView YView = Y.getLocalViewHost(Tpetra::Access::ReadWrite); this->apply 
(XView, YView, 0); } template void ContainerImpl:: -weightedApplyMV (mv_type& X, +weightedApplyMV (const mv_type& X, mv_type& Y, vector_type& W) const { - HostView XView = X.getLocalViewHost(); - HostView YView = Y.getLocalViewHost(); - HostView WView = W.getLocalViewHost(); + ConstHostView XView = X.getLocalViewHost(Tpetra::Access::ReadOnly); + HostView YView = Y.getLocalViewHost(Tpetra::Access::ReadWrite); + ConstHostView WView = W.getLocalViewHost(Tpetra::Access::ReadOnly); weightedApply (XView, YView, WView, 0); } @@ -517,7 +517,7 @@ getName() template void ContainerImpl:: -solveBlock(HostSubviewLocal X, +solveBlock(ConstHostSubviewLocal X, HostSubviewLocal Y, int blockIndex, Teuchos::ETransp mode, @@ -573,7 +573,7 @@ translateRowToCol(LO row) template void ContainerImpl:: -apply (HostView X, +apply (ConstHostView X, HostView Y, int blockIndex, Teuchos::ETransp mode, @@ -684,9 +684,9 @@ apply (HostView X, template void ContainerImpl:: -weightedApply(HostView X, +weightedApply(ConstHostView X, HostView Y, - HostView D, + ConstHostView D, int blockIndex, Teuchos::ETransp mode, SC alpha, diff --git a/packages/ifpack2/src/Ifpack2_DenseContainer_decl.hpp b/packages/ifpack2/src/Ifpack2_DenseContainer_decl.hpp index 9fd1135f2e3b..10f87b08c4b9 100644 --- a/packages/ifpack2/src/Ifpack2_DenseContainer_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_DenseContainer_decl.hpp @@ -136,6 +136,7 @@ class DenseContainer using typename Container::HostView; using typename ContainerImpl::HostViewLocal; using typename ContainerImpl::HostSubviewLocal; + using typename ContainerImpl::ConstHostSubviewLocal; static_assert(std::is_same>::value, "Ifpack2::DenseContainer: Please use MatrixType = Tpetra::RowMatrix."); @@ -236,9 +237,9 @@ class DenseContainer /// linear system with the diagonal block. /// /// \param X [in] Subset permutation of the input X of apply(). - /// \param Y [in] Subset permutation of the input/output Y of apply(). 
+ /// \param Y [in/out] Subset permutation of the input/output Y of apply(). void - solveBlock(HostSubviewLocal X, + solveBlock(ConstHostSubviewLocal X, HostSubviewLocal Y, int blockIndex, Teuchos::ETransp mode, diff --git a/packages/ifpack2/src/Ifpack2_DenseContainer_def.hpp b/packages/ifpack2/src/Ifpack2_DenseContainer_def.hpp index aed816c40642..058ffac13387 100644 --- a/packages/ifpack2/src/Ifpack2_DenseContainer_def.hpp +++ b/packages/ifpack2/src/Ifpack2_DenseContainer_def.hpp @@ -271,7 +271,7 @@ factor () template void DenseContainer:: -solveBlock(HostSubviewLocal X, +solveBlock(ConstHostSubviewLocal X, HostSubviewLocal Y, int blockIndex, Teuchos::ETransp mode, diff --git a/packages/ifpack2/src/Ifpack2_Details_Amesos2Wrapper_def.hpp b/packages/ifpack2/src/Ifpack2_Details_Amesos2Wrapper_def.hpp index 658bdc6ea5eb..963f61f3a46e 100644 --- a/packages/ifpack2/src/Ifpack2_Details_Amesos2Wrapper_def.hpp +++ b/packages/ifpack2/src/Ifpack2_Details_Amesos2Wrapper_def.hpp @@ -467,10 +467,7 @@ apply (const Tpetra::MultiVector X_temp; { - auto X_lcl_host = X.getLocalViewHost (); - auto Y_lcl_host = Y.getLocalViewHost (); - - if (X_lcl_host.data () == Y_lcl_host.data ()) { + if (X.aliases(Y)) { X_temp = rcp (new MV (X, Teuchos::Copy)); } else { X_temp = rcpFromRef (X); diff --git a/packages/ifpack2/src/Ifpack2_Details_ChebyshevKernel_decl.hpp b/packages/ifpack2/src/Ifpack2_Details_ChebyshevKernel_decl.hpp index fb3d4dd7c763..609f94391509 100644 --- a/packages/ifpack2/src/Ifpack2_Details_ChebyshevKernel_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_Details_ChebyshevKernel_decl.hpp @@ -111,7 +111,6 @@ class ChebyshevKernel { std::unique_ptr X_colMap_; std::unique_ptr V1_; - typename multivector_type::dual_view_type::t_host viewW_, viewB_, viewX_; Teuchos::RCP W_vec_, B_vec_, X_vec_; // Do the Import, if needed, and return the column Map version of X. 
diff --git a/packages/ifpack2/src/Ifpack2_Details_ChebyshevKernel_def.hpp b/packages/ifpack2/src/Ifpack2_Details_ChebyshevKernel_def.hpp index cae9a9f982b5..3b12e3a058ba 100644 --- a/packages/ifpack2/src/Ifpack2_Details_ChebyshevKernel_def.hpp +++ b/packages/ifpack2/src/Ifpack2_Details_ChebyshevKernel_def.hpp @@ -46,7 +46,6 @@ #include "Tpetra_MultiVector.hpp" #include "Tpetra_Operator.hpp" #include "Tpetra_Vector.hpp" -#include "Tpetra_withLocalAccess_MultiVector.hpp" #include "Tpetra_Export_decl.hpp" #include "Tpetra_Import_decl.hpp" #include "Kokkos_ArithTraits.hpp" @@ -351,18 +350,9 @@ compute (multivector_type& W, if (canFuse (B)) { // "nonconst" here has no effect other than on the return type. - if (W_vec_.is_null() || W.getLocalViewHost().data() != viewW_.data()) { - viewW_ = W.getLocalViewHost(); - W_vec_ = W.getVectorNonConst (0); - } - if (B_vec_.is_null() || B.getLocalViewHost().data() != viewB_.data()) { - viewB_ = B.getLocalViewHost(); - B_vec_ = B.getVectorNonConst (0); - } - if (X_vec_.is_null() || X.getLocalViewHost().data() != viewX_.data()) { - viewX_ = X.getLocalViewHost(); - X_vec_ = X.getVectorNonConst (0); - } + W_vec_ = W.getVectorNonConst (0); + B_vec_ = B.getVectorNonConst (0); + X_vec_ = X.getVectorNonConst (0); TEUCHOS_ASSERT( ! A_crs_.is_null () ); fusedCase (*W_vec_, alpha, D_inv, *B_vec_, *A_crs_, *X_vec_, beta); } @@ -443,64 +433,32 @@ fusedCase (vector_type& W, { vector_type& X_colMap = importVector (X); - // Only need these aliases because we lack C++14 generic lambdas. 
- using Tpetra::with_local_access_function_argument_type; - using ro_lcl_vec_type = - with_local_access_function_argument_type< - decltype (readOnly (B))>; - using wo_lcl_vec_type = - with_local_access_function_argument_type< - decltype (writeOnly (B))>; - using rw_lcl_vec_type = - with_local_access_function_argument_type< - decltype (readWrite (B))>; - - using Tpetra::withLocalAccess; - using Tpetra::readOnly; - using Tpetra::readWrite; - using Tpetra::writeOnly; using Impl::chebyshev_kernel_vector; using STS = Teuchos::ScalarTraits; auto A_lcl = A.getLocalMatrix (); + //D_inv, B, X and W are all Vectors, so it's safe to take the first column only + auto Dinv_lcl = Kokkos::subview(D_inv.getLocalViewDevice(Tpetra::Access::ReadOnly), Kokkos::ALL(), 0); + auto B_lcl = Kokkos::subview(B.getLocalViewDevice(Tpetra::Access::ReadOnly), Kokkos::ALL(), 0); + auto X_domMap_lcl = Kokkos::subview(X.getLocalViewDevice(Tpetra::Access::ReadWrite), Kokkos::ALL(), 0); + auto X_colMap_lcl = Kokkos::subview(X_colMap.getLocalViewDevice(Tpetra::Access::ReadOnly), Kokkos::ALL(), 0); + const bool do_X_update = !imp_.is_null (); if (beta == STS::zero ()) { - withLocalAccess - ([&] (const wo_lcl_vec_type& W_lcl, - const ro_lcl_vec_type& D_lcl, - const ro_lcl_vec_type& B_lcl, - const ro_lcl_vec_type& X_colMap_lcl, - const wo_lcl_vec_type& X_domMap_lcl) { - chebyshev_kernel_vector (alpha, W_lcl, D_lcl, - B_lcl, A_lcl, - X_colMap_lcl, X_domMap_lcl, - beta, - do_X_update); - }, - writeOnly (W), - readOnly (D_inv), - readOnly (B), - readOnly (X_colMap), - writeOnly (X)); + auto W_lcl = Kokkos::subview(W.getLocalViewDevice(Tpetra::Access::OverwriteAll), Kokkos::ALL(), 0); + chebyshev_kernel_vector (alpha, W_lcl, Dinv_lcl, + B_lcl, A_lcl, + X_colMap_lcl, X_domMap_lcl, + beta, + do_X_update); } else { // need to read _and_ write W if beta != 0 - withLocalAccess - ([&] (const rw_lcl_vec_type& W_lcl, - const ro_lcl_vec_type& D_lcl, - const ro_lcl_vec_type& B_lcl, - const ro_lcl_vec_type& 
X_colMap_lcl, - const wo_lcl_vec_type& X_domMap_lcl) { - chebyshev_kernel_vector (alpha, W_lcl, D_lcl, - B_lcl, A_lcl, - X_colMap_lcl, X_domMap_lcl, - beta, - do_X_update); - }, - readWrite (W), - readOnly (D_inv), - readOnly (B), - readOnly (X_colMap), - writeOnly (X)); + auto W_lcl = Kokkos::subview(W.getLocalViewDevice(Tpetra::Access::ReadWrite), Kokkos::ALL(), 0); + chebyshev_kernel_vector (alpha, W_lcl, Dinv_lcl, + B_lcl, A_lcl, + X_colMap_lcl, X_domMap_lcl, + beta, + do_X_update); } if (!do_X_update) X.update(STS::one (), W, STS::one ()); diff --git a/packages/ifpack2/src/Ifpack2_Details_Chebyshev_def.hpp b/packages/ifpack2/src/Ifpack2_Details_Chebyshev_def.hpp index 5a9ca35f0ba7..eb9cb504f722 100644 --- a/packages/ifpack2/src/Ifpack2_Details_Chebyshev_def.hpp +++ b/packages/ifpack2/src/Ifpack2_Details_Chebyshev_def.hpp @@ -166,13 +166,9 @@ struct GlobalReciprocalThreshold { const typename TpetraVectorType::scalar_type& minVal) { typedef typename TpetraVectorType::impl_scalar_type value_type; - typedef typename TpetraVectorType::device_type::memory_space memory_space; - - X.template sync (); - X.template modify (); const value_type minValS = static_cast (minVal); - auto X_0 = Kokkos::subview (X.template getLocalView (), + auto X_0 = Kokkos::subview (X.getLocalViewDevice (Tpetra::Access::ReadWrite), Kokkos::ALL (), 0); LocalReciprocalThreshold::compute (X_0, minValS); } @@ -782,7 +778,7 @@ Chebyshev::compute () Teuchos::rcp_dynamic_cast (A_); if (D_.is_null ()) { // We haven't computed D_ before - if (! A_crsMat.is_null () && A_crsMat->isStaticGraph ()) { + if (! A_crsMat.is_null () && A_crsMat->isFillComplete ()) { // It's a CrsMatrix with a const graph; cache diagonal offsets. const size_t lclNumRows = A_crsMat->getNodeNumRows (); if (diagOffsets_.extent (0) < lclNumRows) { @@ -798,7 +794,7 @@ Chebyshev::compute () } } else if (! assumeMatrixUnchanged_) { // D_ exists but A_ may have changed - if (! 
A_crsMat.is_null () && A_crsMat->isStaticGraph ()) { + if (! A_crsMat.is_null () && A_crsMat->isFillComplete ()) { // It's a CrsMatrix with a const graph; cache diagonal offsets // if we haven't already. if (! savedDiagOffsets_) { @@ -1052,21 +1048,22 @@ makeInverseDiagonal (const row_matrix_type& A, const bool useDiagOffsets) const // In debug mode, make sure that all diagonal entries are // positive, on all processes. Note that *out_ only prints on // Process 0 of the matrix's communicator. - D_rangeMap->sync_host (); - auto D_lcl = D_rangeMap->getLocalViewHost (); - auto D_lcl_1d = Kokkos::subview (D_lcl, Kokkos::ALL (), 0); - - typedef typename MV::impl_scalar_type IST; - typedef typename MV::local_ordinal_type LO; - typedef Kokkos::Details::ArithTraits STS; - typedef Kokkos::Details::ArithTraits STM; - - const LO lclNumRows = static_cast (D_rangeMap->getLocalLength ()); bool foundNonpositiveValue = false; - for (LO i = 0; i < lclNumRows; ++i) { - if (STS::real (D_lcl_1d(i)) <= STM::zero ()) { - foundNonpositiveValue = true; - break; + { + auto D_lcl = D_rangeMap->getLocalViewHost (Tpetra::Access::ReadOnly); + auto D_lcl_1d = Kokkos::subview (D_lcl, Kokkos::ALL (), 0); + + typedef typename MV::impl_scalar_type IST; + typedef typename MV::local_ordinal_type LO; + typedef Kokkos::Details::ArithTraits STS; + typedef Kokkos::Details::ArithTraits STM; + + const LO lclNumRows = static_cast (D_rangeMap->getLocalLength ()); + for (LO i = 0; i < lclNumRows; ++i) { + if (STS::real (D_lcl_1d(i)) <= STM::zero ()) { + foundNonpositiveValue = true; + break; + } } } @@ -1433,14 +1430,12 @@ Chebyshev:: computeInitialGuessForPowerMethod (V& x, const bool nonnegativeRealParts) const { typedef typename MV::device_type::execution_space dev_execution_space; - typedef typename MV::device_type::memory_space dev_memory_space; typedef typename MV::local_ordinal_type LO; x.randomize (); if (nonnegativeRealParts) { - x.template modify (); - auto x_lcl = x.template getLocalView (); + auto 
x_lcl = x.getLocalViewDevice (Tpetra::Access::ReadWrite); auto x_lcl_1d = Kokkos::subview (x_lcl, Kokkos::ALL (), 0); const LO lclNumRows = static_cast (x.getLocalLength ()); diff --git a/packages/ifpack2/src/Ifpack2_Details_FastILU_Base_def.hpp b/packages/ifpack2/src/Ifpack2_Details_FastILU_Base_def.hpp index 218146e54805..802991269cdd 100644 --- a/packages/ifpack2/src/Ifpack2_Details_FastILU_Base_def.hpp +++ b/packages/ifpack2/src/Ifpack2_Details_FastILU_Base_def.hpp @@ -119,8 +119,8 @@ apply (const Tpetra::MultiVector &X, int nvecs = X.getNumVectors(); if(nvecs == 1) { - auto x2d = X.template getLocalView(); - auto y2d = Y.template getLocalView(); + auto x2d = X.template getLocalView(Tpetra::Access::ReadWrite); + auto y2d = Y.template getLocalView(Tpetra::Access::ReadWrite); auto x1d = Kokkos::subview(x2d, Kokkos::ALL(), 0); auto y1d = Kokkos::subview(y2d, Kokkos::ALL(), 0); applyLocalPrec(x1d, y1d); @@ -132,8 +132,8 @@ apply (const Tpetra::MultiVector &X, { auto Xcol = X.getVector(i); auto Ycol = Y.getVector(i); - auto xColView2d = Xcol->template getLocalView(); - auto yColView2d = Ycol->template getLocalView(); + auto xColView2d = Xcol->template getLocalView(Tpetra::Access::ReadWrite); + auto yColView2d = Ycol->template getLocalView(Tpetra::Access::ReadWrite); ScalarArray xColView1d = Kokkos::subview(xColView2d, Kokkos::ALL(), 0); ScalarArray yColView1d = Kokkos::subview(yColView2d, Kokkos::ALL(), 0); applyLocalPrec(xColView1d, yColView1d); diff --git a/packages/ifpack2/src/Ifpack2_Details_GaussSeidel.hpp b/packages/ifpack2/src/Ifpack2_Details_GaussSeidel.hpp index 3305a1405583..e4817b0c3183 100644 --- a/packages/ifpack2/src/Ifpack2_Details_GaussSeidel.hpp +++ b/packages/ifpack2/src/Ifpack2_Details_GaussSeidel.hpp @@ -71,7 +71,6 @@ namespace Details { numRows = A.getNodeNumRows(); inverseDiagVec = inverseDiagVec_; - inverseDiagVec->sync_host(); applyRows = applyRows_; blockSize = 1; omega = omega_; @@ -88,7 +87,6 @@ namespace Details { numRows = 
A.getNodeNumRows(); inverseDiagVec = inverseDiagVec_; - inverseDiagVec->sync_host(); applyRows = applyRows_; blockSize = 1; omega = omega_; @@ -141,7 +139,7 @@ namespace Details //note: direction is either Forward or Backward (Symmetric is handled in apply()) LO numApplyRows = useApplyRows ? (LO) applyRows.size() : numRows; //note: inverseDiagMV always has only one column - auto inverseDiag = inverseDiagVec->get1dView(); + auto inverseDiag = Kokkos::subview(inverseDiagVec->getLocalViewHost(Tpetra::Access::ReadOnly), Kokkos::ALL(), 0); bool forward = direction == Tpetra::Forward; if(multipleRHS) { @@ -172,7 +170,7 @@ namespace Details } } //Update x - IST dinv = inverseDiag[row]; + IST dinv = inverseDiag(row); for(LO k = 0; k < numVecs; k++) { if(omegaNotOne) @@ -184,9 +182,9 @@ namespace Details } else { - auto xlcl = Kokkos::subview(x.getLocalViewHost(), Kokkos::ALL(), 0); - auto blcl = Kokkos::subview(b.getLocalViewHost(), Kokkos::ALL(), 0); - auto dlcl = Kokkos::subview(inverseDiagVec->getLocalViewHost(), Kokkos::ALL(), 0); + auto xlcl = Kokkos::subview(x.getLocalViewHost(Tpetra::Access::ReadWrite), Kokkos::ALL(), 0); + auto blcl = Kokkos::subview(b.getLocalViewHost(Tpetra::Access::ReadOnly), Kokkos::ALL(), 0); + auto dlcl = Kokkos::subview(inverseDiagVec->getLocalViewHost(Tpetra::Access::ReadOnly), Kokkos::ALL(), 0); for(LO i = 0; i < numApplyRows; i++) { LO row; @@ -238,7 +236,7 @@ namespace Details row = useApplyRows ? 
applyRows[numApplyRows - 1 - i] : numApplyRows - 1 - i; for(LO v = 0; v < numVecs; v++) { - auto bRow = b.getLocalBlock (row, v); + auto bRow = b.getLocalBlock (row, v, Tpetra::Access::ReadOnly); for(LO k = 0; k < blockSize; k++) { accum(k, v) = KAT::zero(); @@ -252,7 +250,7 @@ namespace Details IST* blk = &Avalues(j * bs2); for(LO v = 0; v < numVecs; v++) { - auto xCol = x.getLocalBlock (col, v); + auto xCol = x.getLocalBlock (col, v, Tpetra::Access::ReadOnly); for(LO br = 0; br < blockSize; br++) { for(LO bc = 0; bc < blockSize; bc++) @@ -268,7 +266,7 @@ namespace Details Kokkos::deep_copy(dinv_accum, KAT::zero()); for(LO v = 0; v < numVecs; v++) { - auto bRow = b.getLocalBlock (row, v); + auto bRow = b.getLocalBlock (row, v, Tpetra::Access::ReadOnly); for(LO br = 0; br < blockSize; br++) { accum(br, v) = bRow(br) - accum(br, v); @@ -287,7 +285,7 @@ namespace Details //Update x for(LO v = 0; v < numVecs; v++) { - auto xRow = x.getLocalBlock (row, v); + auto xRow = x.getLocalBlock (row, v, Tpetra::Access::ReadWrite); for(LO k = 0; k < blockSize; k++) { xRow(k) += omega * dinv_accum(k, v); diff --git a/packages/ifpack2/src/Ifpack2_Details_ScaledDampedResidual_decl.hpp b/packages/ifpack2/src/Ifpack2_Details_ScaledDampedResidual_decl.hpp index 60e465943b19..f367ccbfc6d9 100644 --- a/packages/ifpack2/src/Ifpack2_Details_ScaledDampedResidual_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_Details_ScaledDampedResidual_decl.hpp @@ -111,7 +111,6 @@ class ScaledDampedResidual { std::unique_ptr X_colMap_; std::unique_ptr V1_; - typename multivector_type::dual_view_type::t_host viewW_, viewB_, viewX_; Teuchos::RCP W_vec_, B_vec_, X_vec_; // Do the Import, if needed, and return the column Map version of X. 
diff --git a/packages/ifpack2/src/Ifpack2_Details_ScaledDampedResidual_def.hpp b/packages/ifpack2/src/Ifpack2_Details_ScaledDampedResidual_def.hpp index ae4920f04c4a..79d14e65a9d3 100644 --- a/packages/ifpack2/src/Ifpack2_Details_ScaledDampedResidual_def.hpp +++ b/packages/ifpack2/src/Ifpack2_Details_ScaledDampedResidual_def.hpp @@ -46,7 +46,6 @@ #include "Tpetra_MultiVector.hpp" #include "Tpetra_Operator.hpp" #include "Tpetra_Vector.hpp" -#include "Tpetra_withLocalAccess_MultiVector.hpp" #include "Tpetra_Export_decl.hpp" #include "Tpetra_Import_decl.hpp" #include "Kokkos_ArithTraits.hpp" @@ -303,18 +302,9 @@ compute (multivector_type& W, if (canFuse (B)) { // "nonconst" here has no effect other than on the return type. - if (W_vec_.is_null() || W.getLocalViewHost().data() != viewW_.data()) { - viewW_ = W.getLocalViewHost(); - W_vec_ = W.getVectorNonConst (0); - } - if (B_vec_.is_null() || B.getLocalViewHost().data() != viewB_.data()) { - viewB_ = B.getLocalViewHost(); - B_vec_ = B.getVectorNonConst (0); - } - if (X_vec_.is_null() || X.getLocalViewHost().data() != viewX_.data()) { - viewX_ = X.getLocalViewHost(); - X_vec_ = X.getVectorNonConst (0); - } + W_vec_ = W.getVectorNonConst (0); + B_vec_ = B.getVectorNonConst (0); + X_vec_ = X.getVectorNonConst (0); TEUCHOS_ASSERT( ! A_crs_.is_null () ); fusedCase (*W_vec_, alpha, D_inv, *B_vec_, *A_crs_, *X_vec_, beta); } @@ -391,53 +381,22 @@ fusedCase (vector_type& W, { vector_type& X_colMap = importVector (X); - // Only need these aliases because we lack C++14 generic lambdas. 
- using Tpetra::with_local_access_function_argument_type; - using ro_lcl_vec_type = - with_local_access_function_argument_type< - decltype (readOnly (B))>; - using wo_lcl_vec_type = - with_local_access_function_argument_type< - decltype (writeOnly (B))>; - using rw_lcl_vec_type = - with_local_access_function_argument_type< - decltype (readWrite (B))>; - - using Tpetra::withLocalAccess; - using Tpetra::readOnly; - using Tpetra::readWrite; - using Tpetra::writeOnly; using Impl::scaled_damped_residual_vector; using STS = Teuchos::ScalarTraits; auto A_lcl = A.getLocalMatrix (); + auto Dinv_lcl = Kokkos::subview(D_inv.getLocalViewDevice(Tpetra::Access::ReadOnly), Kokkos::ALL(), 0); + auto B_lcl = Kokkos::subview(B.getLocalViewDevice(Tpetra::Access::ReadOnly), Kokkos::ALL(), 0); + auto X_lcl = Kokkos::subview(X_colMap.getLocalViewDevice(Tpetra::Access::ReadOnly), Kokkos::ALL(), 0); if (beta == STS::zero ()) { - withLocalAccess - ([&] (const wo_lcl_vec_type& W_lcl, - const ro_lcl_vec_type& D_lcl, - const ro_lcl_vec_type& B_lcl, - const ro_lcl_vec_type& X_lcl) { - scaled_damped_residual_vector (alpha, W_lcl, D_lcl, - B_lcl, A_lcl, X_lcl, beta); - }, - writeOnly (W), - readOnly (D_inv), - readOnly (B), - readOnly (X_colMap)); + auto W_lcl = Kokkos::subview(W.getLocalViewDevice(Tpetra::Access::OverwriteAll), Kokkos::ALL(), 0); + scaled_damped_residual_vector (alpha, W_lcl, Dinv_lcl, + B_lcl, A_lcl, X_lcl, beta); } else { // need to read _and_ write W if beta != 0 - withLocalAccess - ([&] (const rw_lcl_vec_type& W_lcl, - const ro_lcl_vec_type& D_lcl, - const ro_lcl_vec_type& B_lcl, - const ro_lcl_vec_type& X_lcl) { - scaled_damped_residual_vector (alpha, W_lcl, D_lcl, - B_lcl, A_lcl, X_lcl, beta); - }, - readWrite (W), - readOnly (D_inv), - readOnly (B), - readOnly (X_colMap)); + auto W_lcl = Kokkos::subview(W.getLocalViewDevice(Tpetra::Access::ReadWrite), Kokkos::ALL(), 0); + scaled_damped_residual_vector (alpha, W_lcl, Dinv_lcl, + B_lcl, A_lcl, X_lcl, beta); } } diff --git 
a/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_decl.hpp b/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_decl.hpp index dd8e05b5e2fb..2cf6a45defb6 100644 --- a/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_decl.hpp @@ -334,7 +334,8 @@ class RBILUK : virtual public Ifpack2::RILUK< Tpetra::RowMatrix< typename Matrix typedef Teuchos::ScalarTraits STM; typedef typename block_crs_matrix_type::little_block_type little_block_type; typedef typename block_crs_matrix_type::little_vec_type little_vec_type; - typedef typename little_vec_type::HostMirror little_host_vec_type; + typedef typename block_crs_matrix_type::little_host_vec_type little_host_vec_type; + typedef typename block_crs_matrix_type::const_host_little_vec_type const_host_little_vec_type; void allocate_L_and_U_blocks(); void initAllValues (const block_crs_matrix_type& A); diff --git a/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_def.hpp b/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_def.hpp index a2169c64a01b..490a48efe7ca 100644 --- a/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_def.hpp +++ b/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_def.hpp @@ -341,6 +341,8 @@ initAllValues (const block_crs_matrix_type& A) // This is ok, as the *order* of the GIDs in the rowmap is a better // expression of the user's intent than the GIDs themselves. + //TODO BMK: Revisit this fence when BlockCrsMatrix is refactored. 
+ Kokkos::fence(); for (size_t myRow=0; myRowgetGraph ()->getDomainMap ()), blockSize_, numVectors); BMV rBlock (* (A_block_->getGraph ()->getDomainMap ()), blockSize_, numVectors); - cBlock.sync_host(); for (local_ordinal_type imv = 0; imv < numVectors; ++imv) { for (size_t i = 0; i < D_block_->getNodeNumRows(); ++i) { local_ordinal_type local_row = i; - little_host_vec_type xval = xBlock.getLocalBlock(local_row,imv); - little_host_vec_type cval = cBlock.getLocalBlock(local_row,imv); + const_host_little_vec_type xval = xBlock.getLocalBlock(local_row, imv, Tpetra::Access::ReadOnly); + little_host_vec_type cval = cBlock.getLocalBlock(local_row, imv, Tpetra::Access::OverwriteAll); //cval.assign(xval); Tpetra::COPY (xval, cval); @@ -859,7 +860,7 @@ apply (const Tpetra::MultiVectorapplyBlock(cBlock, rBlock); // Solve U Y = R. - rBlock.sync_host(); for (local_ordinal_type imv = 0; imv < numVectors; ++imv) { const local_ordinal_type numRows = D_block_->getNodeNumRows(); for (local_ordinal_type i = 0; i < numRows; ++i) { local_ordinal_type local_row = (numRows-1)-i; - little_host_vec_type rval = rBlock.getLocalBlock(local_row,imv); - little_host_vec_type yval = yBlock.getLocalBlock(local_row,imv); + const_host_little_vec_type rval = rBlock.getLocalBlock(local_row, imv, Tpetra::Access::ReadOnly); + little_host_vec_type yval = yBlock.getLocalBlock(local_row, imv, Tpetra::Access::OverwriteAll); //yval.assign(rval); Tpetra::COPY (rval, yval); @@ -895,7 +895,7 @@ apply (const Tpetra::MultiVector Xcopy; { - auto X_lcl_host = X.getLocalViewHost (); - auto Y_lcl_host = Y.getLocalViewHost (); - - if (X_lcl_host.data () == Y_lcl_host.data ()) { + if (X.aliases(Y)) { Xcopy = rcp (new MV (X, Teuchos::Copy)); } else { Xcopy = rcpFromRef (X); diff --git a/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_def.hpp b/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_def.hpp index 54b1657e5cc2..8851131e6236 100644 --- 
a/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_def.hpp +++ b/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_def.hpp @@ -194,9 +194,8 @@ class LocalSparseTriangularSolver::HtsImpl { (void)alpha; (void)beta; #ifdef HAVE_IFPACK2_SHYLU_NODEHTS - const auto& X_view = X.getLocalViewHost (); - const auto& Y_view = Y.getLocalViewHost (); - Kokkos::fence(); + const auto& X_view = X.getLocalViewHost (Tpetra::Access::ReadOnly); + const auto& Y_view = Y.getLocalViewHost (Tpetra::Access::ReadWrite); // Only does something if #rhs > current capacity. HTST::reset_max_nrhs(Timpl_.get(), X_view.extent(1)); @@ -725,13 +724,6 @@ localTriangularSolve (const MV& Y, "not currently support non-conjugated transposed solve (mode == " "Teuchos::TRANS) for complex scalar types."); - // FIXME (mfh 19 May 2016) This makes some Ifpack2 tests fail. - // - // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC - // (Y.template need_sync () && ! - // Y.template need_sync (), std::runtime_error, - // "Y must be sync'd to device memory before you may call this method."); - const std::string uplo = this->uplo_; const std::string trans = (mode == Teuchos::CONJ_TRANS) ? "C" : (mode == Teuchos::TRANS ? 
"T" : "N"); @@ -744,43 +736,28 @@ localTriangularSolve (const MV& Y, auto ind = A_lclk.graph.entries; auto val = A_lclk.values; - X.sync_device (); - const_cast (Y).sync_device (); - X.modify_device (); // we will write to X - const size_t numVecs = std::min (X.getNumVectors (), Y.getNumVectors ()); for (size_t j = 0; j < numVecs; ++j) { - auto X_j = X.getVector (j); + auto X_j = X.getVectorNonConst (j); auto Y_j = Y.getVector (j); - auto X_lcl = X_j->getLocalViewDevice (); - auto Y_lcl = Y_j->getLocalViewDevice (); + auto X_lcl = X_j->getLocalViewDevice (Tpetra::Access::ReadWrite); + auto Y_lcl = Y_j->getLocalViewDevice (Tpetra::Access::ReadOnly); auto X_lcl_1d = Kokkos::subview (X_lcl, Kokkos::ALL (), 0); auto Y_lcl_1d = Kokkos::subview (Y_lcl, Kokkos::ALL (), 0); KokkosSparse::Experimental::sptrsv_solve(kh_.getRawPtr(), ptr, ind, val, Y_lcl_1d, X_lcl_1d); // TODO is this fence needed... typename k_handle::HandleExecSpace().fence(); } - // TODO: This forces a sync on host that may be unnecessary, but unclear where users may need to check for this... - X.sync_host (); - const_cast (Y).sync_host (); } else { const std::string diag = this->diag_; - // NOTE (mfh 20 Aug 2017): KokkosSparse::trsv currently is a - // sequential, host-only code. See - // https://github.com/kokkos/kokkos-kernels/issues/48. This - // means that we need to sync to host, then sync back to device - // when done. 
auto A_lcl = this->A_crs_->getLocalMatrix (); - X.sync_host (); - const_cast (Y).sync_host (); - X.modify_host (); // we will write to X if (X.isConstantStride () && Y.isConstantStride ()) { - auto X_lcl = X.getLocalViewHost (); - auto Y_lcl = Y.getLocalViewHost (); + auto X_lcl = X.getLocalViewHost (Tpetra::Access::ReadWrite); + auto Y_lcl = Y.getLocalViewHost (Tpetra::Access::ReadOnly); KokkosSparse::trsv (uplo.c_str (), trans.c_str (), diag.c_str (), A_lcl, Y_lcl, X_lcl); } @@ -788,17 +765,14 @@ localTriangularSolve (const MV& Y, const size_t numVecs = std::min (X.getNumVectors (), Y.getNumVectors ()); for (size_t j = 0; j < numVecs; ++j) { - auto X_j = X.getVector (j); - auto Y_j = X.getVector (j); - auto X_lcl = X_j->getLocalViewHost (); - auto Y_lcl = Y_j->getLocalViewHost (); + auto X_j = X.getVectorNonConst (j); + auto Y_j = Y.getVector (j); + auto X_lcl = X_j->getLocalViewHost (Tpetra::Access::ReadWrite); + auto Y_lcl = Y_j->getLocalViewHost (Tpetra::Access::ReadOnly); KokkosSparse::trsv (uplo.c_str (), trans.c_str (), diag.c_str (), A_lcl, Y_lcl, X_lcl); } } - - X.sync_device (); - const_cast (Y).sync_device (); } } diff --git a/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_def.hpp b/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_def.hpp index c79646cfe1e2..8e220719cbd3 100644 --- a/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_def.hpp +++ b/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_def.hpp @@ -563,19 +563,12 @@ apply (const Tpetra::MultiVector= Y_d.data () + Y_d.span ()) { - aliases = false; // X starts after Y ends; no overlap - } - else if (Y_d.data () >= X_d.data () + X_d.span ()) { - aliases = false; // Y starts after X ends; no overlap - } + // If X aliases Y, we'll need to copy X. 
+ bool aliases = X.aliases(Y); if (aliases) { MV X_copy (X, Teuchos::Copy); this->apply (X_copy, Y, mode, alpha, beta); + return; } const auto& rowMap0 = * (A_->getRowMap ()); diff --git a/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp b/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp index b2b341d430ae..a7be84980ff3 100644 --- a/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp @@ -289,6 +289,9 @@ class RILUK: global_ordinal_type, node_type> crs_matrix_type; + //! Scalar type stored in Kokkos::Views (CrsMatrix and MultiVector) + typedef typename crs_matrix_type::impl_scalar_type impl_scalar_type; + template friend class RILUK; //@} diff --git a/packages/ifpack2/src/Ifpack2_RILUK_def.hpp b/packages/ifpack2/src/Ifpack2_RILUK_def.hpp index 721ed6e03a4f..bc8c0481a8ed 100644 --- a/packages/ifpack2/src/Ifpack2_RILUK_def.hpp +++ b/packages/ifpack2/src/Ifpack2_RILUK_def.hpp @@ -622,7 +622,7 @@ initAllValues (const row_matrix_type& A) } D_->putScalar (STS::zero ()); // Set diagonal values to zero - ArrayRCP DV = D_->get1dViewNonConst (); // Get view of diagonal + auto DV = Kokkos::subview(D_->getLocalViewHost(Tpetra::Access::ReadWrite), Kokkos::ALL(), 0); RCP rowMap = L_->getRowMap (); @@ -654,7 +654,7 @@ initAllValues (const row_matrix_type& A) if (k == local_row) { DiagFound = true; // Store perturbed diagonal in Tpetra::Vector D_ - DV[local_row] += Rthresh_ * InV[j] + IFPACK2_SGN(InV[j]) * Athresh_; + DV(local_row) += Rthresh_ * InV[j] + IFPACK2_SGN(InV[j]) * Athresh_; } else if (k < 0) { // Out of range TEUCHOS_TEST_FOR_EXCEPTION( @@ -682,7 +682,7 @@ initAllValues (const row_matrix_type& A) if (DiagFound) { ++NumNonzeroDiags; } else { - DV[local_row] = Athresh_; + DV(local_row) = Athresh_; } if (NumL) { @@ -771,8 +771,8 @@ void RILUK::compute () size_t num_cols = U_->getColMap()->getNodeNumElements(); Teuchos::Array colflag(num_cols); - Teuchos::ArrayRCP DV = D_->get1dViewNonConst(); // Get view of diagonal - + auto DV = 
Kokkos::subview(D_->getLocalViewHost(Tpetra::Access::ReadWrite), Kokkos::ALL(), 0); + // Now start the factorization. // Need some integer workspace and pointers @@ -791,7 +791,7 @@ void RILUK::compute () NumIn = MaxNumEntries; L_->getLocalRowCopy (local_row, InI (), InV (), NumL); - InV[NumL] = DV[i]; // Put in diagonal + InV[NumL] = DV(i); // Put in diagonal InI[NumL] = local_row; U_->getLocalRowCopy (local_row, InI (NumL+1, MaxNumEntries-NumL-1), @@ -809,7 +809,7 @@ void RILUK::compute () local_ordinal_type j = InI[jj]; scalar_type multiplier = InV[jj]; // current_mults++; - InV[jj] *= DV[j]; + InV[jj] *= static_cast(DV(j)); U_->getLocalRowView(j, UUI, UUV); // View of row above NumUU = UUI.size(); @@ -845,26 +845,26 @@ void RILUK::compute () L_->replaceLocalValues (local_row, InI (0, NumL), InV (0, NumL)); } - DV[i] = InV[NumL]; // Extract Diagonal value + DV(i) = InV[NumL]; // Extract Diagonal value if (RelaxValue_ != STM::zero ()) { - DV[i] += RelaxValue_*diagmod; // Add off diagonal modifications + DV(i) += RelaxValue_*diagmod; // Add off diagonal modifications } - if (STS::magnitude (DV[i]) > STS::magnitude (MaxDiagonalValue)) { - if (STS::real (DV[i]) < STM::zero ()) { - DV[i] = -MinDiagonalValue; + if (STS::magnitude (DV(i)) > STS::magnitude (MaxDiagonalValue)) { + if (STS::real (DV(i)) < STM::zero ()) { + DV(i) = -MinDiagonalValue; } else { - DV[i] = MinDiagonalValue; + DV(i) = MinDiagonalValue; } } else { - DV[i] = STS::one () / DV[i]; // Invert diagonal value + DV(i) = static_cast(STS::one ()) / DV(i); // Invert diagonal value } for (size_t j = 0; j < NumU; ++j) { - InV[NumL+1+j] *= DV[i]; // Scale U by inverse of diagonal + InV[NumL+1+j] *= static_cast(DV(i)); // Scale U by inverse of diagonal } if (NumU) { diff --git a/packages/ifpack2/src/Ifpack2_Relaxation_def.hpp b/packages/ifpack2/src/Ifpack2_Relaxation_def.hpp index cb942129156e..17a491472aa7 100644 --- a/packages/ifpack2/src/Ifpack2_Relaxation_def.hpp +++ 
b/packages/ifpack2/src/Ifpack2_Relaxation_def.hpp @@ -48,8 +48,6 @@ #include "Tpetra_BlockView.hpp" #include "Ifpack2_Utilities.hpp" #include "MatrixMarket_Tpetra.hpp" -#include "Tpetra_transform_MultiVector.hpp" -#include "Tpetra_withLocalAccess_MultiVector.hpp" #include "Tpetra_Details_residual.hpp" #include #include @@ -583,12 +581,8 @@ apply (const Tpetra::MultiVector Xcopy; - // FIXME (mfh 12 Sep 2014) This test for aliasing is incomplete. { - auto X_lcl_host = X.getLocalViewHost (); - auto Y_lcl_host = Y.getLocalViewHost (); - - if (X_lcl_host.data () == Y_lcl_host.data ()) { + if (X.aliases(Y)) { Xcopy = rcp (new MV (X, Teuchos::Copy)); } else { Xcopy = rcpFromRef (X); @@ -972,7 +966,6 @@ void Relaxation::computeBlockCrs () template void Relaxation::compute () { - using Tpetra::readWrite; using Teuchos::Array; using Teuchos::ArrayRCP; using Teuchos::ArrayView; @@ -1055,7 +1048,7 @@ void Relaxation::compute () // method (not inherited from RowMatrix's interface). It's // perfectly valid to do relaxation on a RowMatrix which is not // a CrsMatrix. - if (crsMat == nullptr || ! crsMat->isStaticGraph ()) { + if (crsMat == nullptr || ! crsMat->isFillComplete ()) { A_->getLocalDiagCopy (*Diagonal_); // slow path } else { @@ -1114,36 +1107,26 @@ void Relaxation::compute () // combine diagonal extraction above with L1 modification into a // single parallel loop. if (DoL1Method_ && IsParallel_) { - vector_type& gblDiag = *Diagonal_; - using rw_type = - decltype (readWrite (gblDiag).on (Kokkos::HostSpace ())); - // Once we have C++14, we can get rid of this alias and use - // "auto" in the lambda below. 
- using lcl_vec_type = - Tpetra::with_local_access_function_argument_type; const row_matrix_type& A_row = *A_; - const magnitude_type L1_eta = L1Eta_; - Tpetra::withLocalAccess - ([&A_row, L1_eta, numMyRows] (const lcl_vec_type& diag) { - const magnitude_type two = STM::one () + STM::one (); - const size_t maxLength = A_row.getNodeMaxNumRowEntries (); - Array indices (maxLength); - Array values (maxLength); - size_t numEntries; - - for (LO i = 0; i < numMyRows; ++i) { - A_row.getLocalRowCopy (i, indices (), values (), numEntries); - magnitude_type diagonal_boost = STM::zero (); - for (size_t k = 0 ; k < numEntries; ++k) { - if (indices[k] > numMyRows) { - diagonal_boost += STS::magnitude (values[k] / two); - } - } - if (KAT::magnitude (diag[i]) < L1_eta * diagonal_boost) { - diag[i] += diagonal_boost; - } - } - }, readWrite (gblDiag).on (Kokkos::HostSpace ())); + auto diag = Diagonal_->getLocalViewHost(Tpetra::Access::ReadWrite); + const magnitude_type two = STM::one () + STM::one (); + const size_t maxLength = A_row.getNodeMaxNumRowEntries (); + Array indices (maxLength); + Array values (maxLength); + size_t numEntries; + + for (LO i = 0; i < numMyRows; ++i) { + A_row.getLocalRowCopy (i, indices (), values (), numEntries); + magnitude_type diagonal_boost = STM::zero (); + for (size_t k = 0 ; k < numEntries; ++k) { + if (indices[k] > numMyRows) { + diagonal_boost += STS::magnitude (values[k] / two); + } + } + if (KAT::magnitude (diag(i, 0)) < L1Eta_ * diagonal_boost) { + diag(i, 0) += diagonal_boost; + } + } } // @@ -1176,77 +1159,69 @@ void Relaxation::compute () magnitude_type minMagDiagEntryMag = STM::zero (); magnitude_type maxMagDiagEntryMag = STM::zero (); - vector_type& gblDiag = *Diagonal_; - // Once we have C++14, we can get rid of these two aliases and - // use "auto" in the lambda below. 
- using rw_type = - decltype (readWrite (gblDiag).on (Kokkos::HostSpace ())); - using lcl_vec_type = - Tpetra::with_local_access_function_argument_type; - Tpetra::withLocalAccess - ([&] (const lcl_vec_type& diag) { - // As we go, keep track of the diagonal entries with the - // least and greatest magnitude. We could use the trick of - // starting min with +Inf and max with -Inf, but that - // doesn't work if scalar_type is a built-in integer type. - // Thus, we have to start by reading the first diagonal - // entry redundantly. - if (numMyRows != 0) { - const magnitude_type d_0_mag = KAT::abs (diag[0]); - minMagDiagEntryMag = d_0_mag; - maxMagDiagEntryMag = d_0_mag; - } - - // Go through all the diagonal entries. Compute counts of - // small-magnitude, zero, and negative-real-part entries. - // Invert the diagonal entries that aren't too small. For - // those too small in magnitude, replace them with - // 1/MinDiagonalValue_ (or 1/eps if MinDiagonalValue_ - // happens to be zero). - for (LO i = 0; i < numMyRows; ++i) { - const IST d_i = diag[i]; - const magnitude_type d_i_mag = KAT::abs (d_i); - // Work-around for GitHub Issue #5269. - //const magnitude_type d_i_real = KAT::real (d_i); - const auto d_i_real = getRealValue (d_i); - - // We can't compare complex numbers, but we can compare their - // real parts. - if (d_i_real < STM::zero ()) { - ++numNegDiagEntries; - } - if (d_i_mag < minMagDiagEntryMag) { - minMagDiagEntryMag = d_i_mag; - } - if (d_i_mag > maxMagDiagEntryMag) { - maxMagDiagEntryMag = d_i_mag; - } - - if (fixTinyDiagEntries_) { - // <= not <, in case minDiagValMag is zero. - if (d_i_mag <= minDiagValMag) { - ++numSmallDiagEntries; - if (d_i_mag == STM::zero ()) { - ++numZeroDiagEntries; - } - diag[i] = oneOverMinDiagVal; - } - else { - diag[i] = KAT::one () / d_i; - } - } - else { // Don't fix zero or tiny diagonal entries. - // <= not <, in case minDiagValMag is zero. 
- if (d_i_mag <= minDiagValMag) { - ++numSmallDiagEntries; - if (d_i_mag == STM::zero ()) { - ++numZeroDiagEntries; - } - } - diag[i] = KAT::one () / d_i; - } - } - }, readWrite (gblDiag).on (Kokkos::HostSpace ())); + auto diag2d = Diagonal_->getLocalViewHost(Tpetra::Access::ReadWrite); + auto diag = Kokkos::subview(diag2d, Kokkos::ALL(), 0); + // As we go, keep track of the diagonal entries with the + // least and greatest magnitude. We could use the trick of + // starting min with +Inf and max with -Inf, but that + // doesn't work if scalar_type is a built-in integer type. + // Thus, we have to start by reading the first diagonal + // entry redundantly. + if (numMyRows != 0) { + const magnitude_type d_0_mag = KAT::abs (diag(0)); + minMagDiagEntryMag = d_0_mag; + maxMagDiagEntryMag = d_0_mag; + } + + // Go through all the diagonal entries. Compute counts of + // small-magnitude, zero, and negative-real-part entries. + // Invert the diagonal entries that aren't too small. For + // those too small in magnitude, replace them with + // 1/MinDiagonalValue_ (or 1/eps if MinDiagonalValue_ + // happens to be zero). + for (LO i = 0; i < numMyRows; ++i) { + const IST d_i = diag(i); + const magnitude_type d_i_mag = KAT::abs (d_i); + // Work-around for GitHub Issue #5269. + //const magnitude_type d_i_real = KAT::real (d_i); + const auto d_i_real = getRealValue (d_i); + + // We can't compare complex numbers, but we can compare their + // real parts. + if (d_i_real < STM::zero ()) { + ++numNegDiagEntries; + } + if (d_i_mag < minMagDiagEntryMag) { + minMagDiagEntryMag = d_i_mag; + } + if (d_i_mag > maxMagDiagEntryMag) { + maxMagDiagEntryMag = d_i_mag; + } + + if (fixTinyDiagEntries_) { + // <= not <, in case minDiagValMag is zero. 
+ if (d_i_mag <= minDiagValMag) { + ++numSmallDiagEntries; + if (d_i_mag == STM::zero ()) { + ++numZeroDiagEntries; + } + diag(i) = oneOverMinDiagVal; + } + else { + diag(i) = KAT::one () / d_i; + } + } + else { // Don't fix zero or tiny diagonal entries. + // <= not <, in case minDiagValMag is zero. + if (d_i_mag <= minDiagValMag) { + ++numSmallDiagEntries; + if (d_i_mag == STM::zero ()) { + ++numZeroDiagEntries; + } + } + diag(i) = KAT::one () / d_i; + } + } // Count floating-point operations of computing the inverse diagonal. // @@ -1302,9 +1277,6 @@ void Relaxation::compute () // diagonal, and the original diagonal's inverse. vector_type diff (A_->getRowMap ()); diff.reciprocal (*origDiag); - if (Diagonal_->need_sync_device ()) { - Diagonal_->sync_device (); - } diff.update (-one, *Diagonal_, one); globalDiagNormDiff_ = diff.norm2 (); } @@ -1313,28 +1285,22 @@ void Relaxation::compute () // Go through all the diagonal entries. Invert those that // aren't too small in magnitude. For those that are too // small in magnitude, replace them with oneOverMinDiagVal. - vector_type& gblDiag = *Diagonal_; - Tpetra::transform - ("Ifpack2::Relaxation::compute: Invert & fix diagonal", - gblDiag, gblDiag, - KOKKOS_LAMBDA (const IST& d_i) { - const magnitude_type d_i_mag = KAT::magnitude (d_i); - - // <= not <, in case minDiagValMag is zero. - if (d_i_mag <= minDiagValMag) { - return oneOverMinDiagVal; - } - else { - // For Stokhos types, operator/ returns an expression - // type. Explicitly convert to IST before returning. - return IST (KAT::one () / d_i); - } - }); + auto localDiag = Diagonal_->getLocalViewDevice(Tpetra::Access::ReadWrite); + Kokkos::parallel_for(Kokkos::RangePolicy(0, localDiag.extent(0)), + KOKKOS_LAMBDA (const IST& d_i) { + const magnitude_type d_i_mag = KAT::magnitude (d_i); + // <= not <, in case minDiagValMag is zero. 
+ if (d_i_mag <= minDiagValMag) { + return oneOverMinDiagVal; + } + else { + // For Stokhos types, operator/ returns an expression + // type. Explicitly convert to IST before returning. + return IST (KAT::one () / d_i); + } + }); } else { // don't fix tiny or zero diagonal entries - if (Diagonal_->need_sync_device ()) { - Diagonal_->sync_device (); - } Diagonal_->reciprocal (*Diagonal_); } @@ -1346,10 +1312,6 @@ void Relaxation::compute () } } - if (Diagonal_->need_sync_device ()) { - Diagonal_->sync_device (); - } - if (PrecType_ == Ifpack2::Details::MTGS || PrecType_ == Ifpack2::Details::MTSGS || PrecType_ == Ifpack2::Details::GS2 || @@ -1363,7 +1325,9 @@ void Relaxation::compute () "when the input matrix is a Tpetra::CrsMatrix."); local_matrix_type kcsr = crsMat->getLocalMatrix (); - auto diagView_2d = Diagonal_->getLocalViewDevice (); + //TODO BMK: This should be ReadOnly, and KokkosKernels should accept a + //const-valued view for user-provided D^-1. OK for now, Diagonal_ is nonconst. + auto diagView_2d = Diagonal_->getLocalViewDevice (Tpetra::Access::ReadWrite); auto diagView_1d = Kokkos::subview (diagView_2d, Kokkos::ALL (), 0); using KokkosSparse::Experimental::gauss_seidel_numeric; gauss_seidel_numeric(X).sync_host(); for (int j = 0; j < NumSweeps_; ++j) { // data exchange is here, once per sweep if (IsParallel_) { @@ -1674,9 +1637,7 @@ ApplyInverseSerialGS_RowMatrix (const Tpetra::MultiVectordoImport (Y, *Importer_, Tpetra::INSERT); } } - Y2->sync_host(); serialGaussSeidel_->apply(*Y2, X, direction); - Y2->modify_host(); // FIXME (mfh 02 Jan 2013) This is only correct if row Map == range Map. 
if (IsParallel_) { @@ -1787,45 +1748,6 @@ ApplyInverseSerialGS_CrsMatrix(const crs_matrix_type& A, X_colMap = cachedMV_; X_domainMap = X_colMap->offsetViewNonConst (domainMap, 0); -#ifdef HAVE_TPETRA_DEBUG - auto X_colMap_host_view = X_colMap->getLocalViewHost (); - auto X_domainMap_host_view = X_domainMap->getLocalViewHost (); - - if (X_colMap->getLocalLength () != 0 && X_domainMap->getLocalLength ()) { - TEUCHOS_TEST_FOR_EXCEPTION - (X_colMap_host_view.data () != X_domainMap_host_view.data (), - std::logic_error, "Tpetra::CrsMatrix::gaussSeidelCopy: Pointer to " - "start of column Map view of X is not equal to pointer to start of " - "(domain Map view of) X. This may mean that Tpetra::MultiVector::" - "offsetViewNonConst is broken. " - "Please report this bug to the Tpetra developers."); - } - - TEUCHOS_TEST_FOR_EXCEPTION( - X_colMap_host_view.extent (0) < X_domainMap_host_view.extent (0) || - X_colMap->getLocalLength () < X_domainMap->getLocalLength (), - std::logic_error, "Tpetra::CrsMatrix::gaussSeidelCopy: " - "X_colMap has fewer local rows than X_domainMap. " - "X_colMap_host_view.extent(0) = " << X_colMap_host_view.extent (0) - << ", X_domainMap_host_view.extent(0) = " - << X_domainMap_host_view.extent (0) - << ", X_colMap->getLocalLength() = " << X_colMap->getLocalLength () - << ", and X_domainMap->getLocalLength() = " - << X_domainMap->getLocalLength () - << ". This means that Tpetra::MultiVector::offsetViewNonConst " - "is broken. Please report this bug to the Tpetra developers."); - - TEUCHOS_TEST_FOR_EXCEPTION( - X_colMap->getNumVectors () != X_domainMap->getNumVectors (), - std::logic_error, "Tpetra::CrsMatrix::gaussSeidelCopy: " - "X_colMap has a different number of columns than X_domainMap. " - "X_colMap->getNumVectors() = " << X_colMap->getNumVectors () - << " != X_domainMap->getNumVectors() = " - << X_domainMap->getNumVectors () - << ". This means that Tpetra::MultiVector::offsetViewNonConst " - "is broken. 
Please report this bug to the Tpetra developers."); -#endif // HAVE_TPETRA_DEBUG - if (ZeroStartingSolution_) { // No need for an Import, since we're filling with zeros. X_colMap->putScalar (ZERO); @@ -1843,17 +1765,14 @@ ApplyInverseSerialGS_CrsMatrix(const crs_matrix_type& A, copyBackOutput = true; // Don't forget to copy back at end. } // if column and domain Maps are (not) the same - const_cast(B).sync_host(); for (int sweep = 0; sweep < NumSweeps_; ++sweep) { if (! importer.is_null () && sweep > 0) { // We already did the first Import for the zeroth sweep above, // if it was necessary. X_colMap->doImport (*X_domainMap, *importer, Tpetra::INSERT); } - X_colMap->sync_host (); // Do local Gauss-Seidel (forward, backward or symmetric) serialGaussSeidel_->apply(*X_colMap, B, direction); - X_colMap->modify_host (); } if (copyBackOutput) { @@ -1947,14 +1866,11 @@ ApplyInverseSerialGS_BlockCrsMatrix (const block_crs_matrix_type& A, yBlockCol_mv.doImport(yBlock_mv, *pointImporter_, Tpetra::INSERT); } - const_cast(xBlock).sync_host(); for (int sweep = 0; sweep < NumSweeps_; ++sweep) { if (performImport && sweep > 0) { yBlockCol_mv.doImport(yBlock_mv, *pointImporter_, Tpetra::INSERT); } - yBlockCol->sync_host(); serialGaussSeidel_->applyBlock(*yBlockCol, xBlock, direction); - yBlockCol->modify_host(); if (performImport) { Tpetra::deep_copy(Y, *yBlockColPointDomain); } @@ -2131,45 +2047,6 @@ ApplyInverseMTGS_CrsMatrix( X_domainMap = X_colMap->offsetViewNonConst (domainMap, 0); -#ifdef HAVE_IFPACK2_DEBUG - auto X_colMap_host_view = X_colMap->template getLocalView (); - auto X_domainMap_host_view = X_domainMap->template getLocalView (); - - if (X_colMap->getLocalLength () != 0 && X_domainMap->getLocalLength ()) { - TEUCHOS_TEST_FOR_EXCEPTION( - X_colMap_host_view.data () != X_domainMap_host_view.data (), - std::logic_error, "Ifpack2::Relaxation::MTGaussSeidel: " - "Pointer to start of column Map view of X is not equal to pointer to " - "start of (domain Map view of) X. 
This may mean that " - "Tpetra::MultiVector::offsetViewNonConst is broken. " - "Please report this bug to the Tpetra developers."); - } - - TEUCHOS_TEST_FOR_EXCEPTION( - X_colMap_host_view.extent (0) < X_domainMap_host_view.extent (0) || - X_colMap->getLocalLength () < X_domainMap->getLocalLength (), - std::logic_error, "Ifpack2::Relaxation::MTGaussSeidel: " - "X_colMap has fewer local rows than X_domainMap. " - "X_colMap_host_view.extent(0) = " << X_colMap_host_view.extent (0) - << ", X_domainMap_host_view.extent(0) = " - << X_domainMap_host_view.extent (0) - << ", X_colMap->getLocalLength() = " << X_colMap->getLocalLength () - << ", and X_domainMap->getLocalLength() = " - << X_domainMap->getLocalLength () - << ". This means that Tpetra::MultiVector::offsetViewNonConst " - "is broken. Please report this bug to the Tpetra developers."); - - TEUCHOS_TEST_FOR_EXCEPTION( - X_colMap->getNumVectors () != X_domainMap->getNumVectors (), - std::logic_error, "Ifpack2::Relaxation::MTGaussSeidel: " - "X_colMap has a different number of columns than X_domainMap. " - "X_colMap->getNumVectors() = " << X_colMap->getNumVectors () - << " != X_domainMap->getNumVectors() = " - << X_domainMap->getNumVectors () - << ". This means that Tpetra::MultiVector::offsetViewNonConst " - "is broken. Please report this bug to the Tpetra developers."); -#endif // HAVE_IFPACK2_DEBUG - if (ZeroStartingSolution_) { // No need for an Import, since we're filling with zeros. 
X_colMap->putScalar (ZERO); @@ -2239,24 +2116,24 @@ ApplyInverseMTGS_CrsMatrix( KokkosSparse::Experimental::symmetric_gauss_seidel_apply (mtKernelHandle_.getRawPtr(), A_->getNodeNumRows(), A_->getNodeNumCols(), kcsr.graph.row_map, kcsr.graph.entries, kcsr.values, - X_colMap->getLocalViewDevice(), - B_in->getLocalViewDevice(), + X_colMap->getLocalViewDevice(Tpetra::Access::ReadWrite), + B_in->getLocalViewDevice(Tpetra::Access::ReadOnly), zero_x_vector, update_y_vector, DampingFactor_, 1); } else if (direction == Tpetra::Forward) { KokkosSparse::Experimental::forward_sweep_gauss_seidel_apply (mtKernelHandle_.getRawPtr(), A_->getNodeNumRows(), A_->getNodeNumCols(), kcsr.graph.row_map,kcsr.graph.entries, kcsr.values, - X_colMap->getLocalViewDevice (), - B_in->getLocalViewDevice(), + X_colMap->getLocalViewDevice(Tpetra::Access::ReadWrite), + B_in->getLocalViewDevice(Tpetra::Access::ReadOnly), zero_x_vector, update_y_vector, DampingFactor_, 1); } else if (direction == Tpetra::Backward) { KokkosSparse::Experimental::backward_sweep_gauss_seidel_apply (mtKernelHandle_.getRawPtr(), A_->getNodeNumRows(), A_->getNodeNumCols(), kcsr.graph.row_map,kcsr.graph.entries, kcsr.values, - X_colMap->getLocalViewDevice(), - B_in->getLocalViewDevice(), + X_colMap->getLocalViewDevice(Tpetra::Access::ReadWrite), + B_in->getLocalViewDevice(Tpetra::Access::ReadOnly), zero_x_vector, update_y_vector, DampingFactor_, 1); } else { diff --git a/packages/ifpack2/src/Ifpack2_SparseContainer_decl.hpp b/packages/ifpack2/src/Ifpack2_SparseContainer_decl.hpp index ec90dcf730bd..c931c621e870 100644 --- a/packages/ifpack2/src/Ifpack2_SparseContainer_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_SparseContainer_decl.hpp @@ -172,6 +172,7 @@ class SparseContainer using InverseMap = typename Tpetra::Map; using typename Container::HostView; + using typename Container::ConstHostView; using HostViewInverse = typename inverse_mv_type::dual_view_type::t_host; static_assert(std::is_sameY := alpha * M^{-1} X + beta*Y. 
virtual void - apply (HostView X, + apply (ConstHostView X, HostView Y, int blockIndex, Teuchos::ETransp mode = Teuchos::NO_TRANS, @@ -243,9 +244,9 @@ class SparseContainer //! Compute Y := alpha * diag(D) * M^{-1} (diag(D) * X) + beta*Y. virtual void - weightedApply (HostView X, + weightedApply (ConstHostView X, HostView Y, - HostView W, + ConstHostView W, int blockIndex, Teuchos::ETransp mode = Teuchos::NO_TRANS, SC alpha = Teuchos::ScalarTraits::one(), @@ -297,7 +298,7 @@ class SparseContainer /// \param Y [in] Subset permutation of the input/output Y of apply(), /// suitable for the second argument of Inverse_->apply(). void - solveBlockMV(inverse_mv_type& X, + solveBlockMV(const inverse_mv_type& X, inverse_mv_type& Y, int blockIndex, Teuchos::ETransp mode, diff --git a/packages/ifpack2/src/Ifpack2_SparseContainer_def.hpp b/packages/ifpack2/src/Ifpack2_SparseContainer_def.hpp index d1b71f5b34fb..b83700ff0dd2 100644 --- a/packages/ifpack2/src/Ifpack2_SparseContainer_def.hpp +++ b/packages/ifpack2/src/Ifpack2_SparseContainer_def.hpp @@ -139,7 +139,7 @@ void SparseContainer::clearBlocks () //============================================================================== template void SparseContainer:: -solveBlockMV(inverse_mv_type& X, +solveBlockMV(const inverse_mv_type& X, inverse_mv_type& Y, int blockIndex, Teuchos::ETransp mode, @@ -165,7 +165,7 @@ solveBlockMV(inverse_mv_type& X, template void SparseContainer:: -apply (HostView X, +apply (ConstHostView X, HostView Y, int blockIndex, Teuchos::ETransp mode, @@ -285,9 +285,9 @@ apply (HostView X, //============================================================================== template void SparseContainer:: -weightedApply (HostView X, +weightedApply (ConstHostView X, HostView Y, - HostView D, + ConstHostView D, int blockIndex, Teuchos::ETransp mode, SC alpha, diff --git a/packages/ifpack2/src/Ifpack2_TriDiContainer_decl.hpp b/packages/ifpack2/src/Ifpack2_TriDiContainer_decl.hpp index 3116e43034c5..df2d0aca7e76 
100644 --- a/packages/ifpack2/src/Ifpack2_TriDiContainer_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_TriDiContainer_decl.hpp @@ -137,6 +137,7 @@ class TriDiContainer using local_mv_type = Tpetra::MultiVector; using HostViewLocal = typename Kokkos::View; using typename ContainerImpl::HostSubviewLocal; + using typename ContainerImpl::ConstHostSubviewLocal; static_assert (std::is_same>::value, "Ifpack2::TriDiContainer: MatrixType must be a Tpetra::RowMatrix specialization."); @@ -191,7 +192,7 @@ class TriDiContainer void clearBlocks(); - void solveBlock(HostSubviewLocal X, + void solveBlock(ConstHostSubviewLocal X, HostSubviewLocal Y, int blockIndex, Teuchos::ETransp mode, diff --git a/packages/ifpack2/src/Ifpack2_TriDiContainer_def.hpp b/packages/ifpack2/src/Ifpack2_TriDiContainer_def.hpp index dd124768f373..a6f345273f47 100644 --- a/packages/ifpack2/src/Ifpack2_TriDiContainer_def.hpp +++ b/packages/ifpack2/src/Ifpack2_TriDiContainer_def.hpp @@ -277,7 +277,7 @@ void TriDiContainer::factor () template void TriDiContainer:: -solveBlock(HostSubviewLocal X, +solveBlock(ConstHostSubviewLocal X, HostSubviewLocal Y, int blockIndex, Teuchos::ETransp mode, diff --git a/packages/ifpack2/src/supportgraph/Ifpack2_SupportGraph_def.hpp b/packages/ifpack2/src/supportgraph/Ifpack2_SupportGraph_def.hpp index 58e383e5935d..8f39d7b117ef 100644 --- a/packages/ifpack2/src/supportgraph/Ifpack2_SupportGraph_def.hpp +++ b/packages/ifpack2/src/supportgraph/Ifpack2_SupportGraph_def.hpp @@ -642,9 +642,7 @@ apply (const Tpetra::MultiVector Xcopy; { - auto X_lcl_host = X.getLocalView (); - auto Y_lcl_host = Y.getLocalView (); - if (X_lcl_host.data () == Y_lcl_host.data ()) { + if (X.aliases(Y)) { Xcopy = rcp (new MV (X, Teuchos::Copy)); } else { Xcopy = rcpFromRef (X); diff --git a/packages/ifpack2/test/belos/tpetra_native.cpp b/packages/ifpack2/test/belos/tpetra_native.cpp index 0df91fb8b370..0be088d9732b 100644 --- a/packages/ifpack2/test/belos/tpetra_native.cpp +++ 
b/packages/ifpack2/test/belos/tpetra_native.cpp @@ -225,12 +225,7 @@ elementWiseMultiplyMultiVector (MultiVectorType& X, const index_type lclNumRows = static_cast (X.getLocalLength ()); - if (X.template need_sync ()) { - X.template sync (); - } - X.template modify (); - - auto X_lcl = X.template getLocalView (); + auto X_lcl = X.template getLocalView (Tpetra::Access::ReadWrite); if (static_cast (X.getNumVectors ()) == std::size_t (1)) { using pair_type = Kokkos::pair; auto X_lcl_1d = Kokkos::subview (X_lcl, pair_type (0, lclNumRows), 0); @@ -431,12 +426,7 @@ elementWiseDivideMultiVector (MultiVectorType& X, const index_type lclNumRows = static_cast (X.getLocalLength ()); - if (X.template need_sync ()) { - X.template sync (); - } - X.template modify (); - - auto X_lcl = X.template getLocalView (); + auto X_lcl = X.template getLocalView (Tpetra::Access::ReadWrite); if (static_cast (X.getNumVectors ()) == std::size_t (1)) { using pair_type = Kokkos::pair; auto X_lcl_1d = Kokkos::subview (X_lcl, pair_type (0, lclNumRows), 0); diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockCrsUtil.hpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockCrsUtil.hpp index 2393db0156f1..feee75b31aad 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockCrsUtil.hpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockCrsUtil.hpp @@ -259,24 +259,26 @@ struct BlockCrsMatrixMaker { new Tpetra_Map(Tpetra_BlockMultiVector::makePointMap(*col_map, bs))); Tpetra_MultiVector_Magnitude colsum_mv(cpm, 1); colsum_mv.putScalar(0); - auto colsum = colsum_mv.getLocalViewHost(); - - // Get off-diag 1-norms. 
- Kokkos::fence(); // uvm access - for (LO r = 0; r < nrows; ++r) { - const auto rgid = row_map->getGlobalElement(r); - for (size_t j = rowptr(r); j < rowptr(r+1); ++j) { - const LO c = colidx(j); - const auto cgid = col_map->getGlobalElement(c); - const bool diag_block = cgid == rgid; - auto* const block = &values(j*bs2); - for (Int bi = 0; bi < bs; ++bi) - for (Int bj = 0; bj < bs; ++bj) { - if (diag_block && bj == bi) continue; - const auto e = abs(block[bi*bs + bj]); - rowsum[bs*r + bi] += e; - colsum(bs*c + bj, 0) += e; - } + { + auto colsum = colsum_mv.getLocalViewHost(Tpetra::Access::ReadWrite); + + // Get off-diag 1-norms. + Kokkos::fence(); // uvm access + for (LO r = 0; r < nrows; ++r) { + const auto rgid = row_map->getGlobalElement(r); + for (size_t j = rowptr(r); j < rowptr(r+1); ++j) { + const LO c = colidx(j); + const auto cgid = col_map->getGlobalElement(c); + const bool diag_block = cgid == rgid; + auto* const block = &values(j*bs2); + for (Int bi = 0; bi < bs; ++bi) + for (Int bj = 0; bj < bs; ++bj) { + if (diag_block && bj == bi) continue; + const auto e = abs(block[bi*bs + bj]); + rowsum[bs*r + bi] += e; + colsum(bs*c + bj, 0) += e; + } + } } } @@ -291,18 +293,21 @@ struct BlockCrsMatrixMaker { colsum_mv.doImport(d, importer, Tpetra::REPLACE); } - // Modify diag entries. - for (LO r = 0; r < nrows; ++r) { - const auto rgid = row_map->getGlobalElement(r); - for (size_t j = rowptr(r); j < rowptr(r+1); ++j) { - const LO c = colidx(j); - const auto cgid = col_map->getGlobalElement(c); - const bool diag_block = cgid == rgid; - if ( ! diag_block) continue; - auto* const block = &values(j*bs2); - for (Int bi = 0; bi < bs; ++bi) { - auto& e = block[bi*bs + bi]; - e = Magnitude(1.01)*std::max(rowsum[bs*r + bi], colsum(bs*c + bi, 0))*signof(e); + { + auto colsum = colsum_mv.getLocalViewHost(Tpetra::Access::ReadOnly); + // Modify diag entries. 
+ for (LO r = 0; r < nrows; ++r) { + const auto rgid = row_map->getGlobalElement(r); + for (size_t j = rowptr(r); j < rowptr(r+1); ++j) { + const LO c = colidx(j); + const auto cgid = col_map->getGlobalElement(c); + const bool diag_block = cgid == rgid; + if ( ! diag_block) continue; + auto* const block = &values(j*bs2); + for (Int bi = 0; bi < bs; ++bi) { + auto& e = block[bi*bs + bi]; + e = Magnitude(1.01)*std::max(rowsum[bs*r + bi], colsum(bs*c + bi, 0))*signof(e); + } } } } @@ -627,7 +632,7 @@ struct BlockCrsMatrixMaker { const Int bs, const Int nvec) { auto mv = Teuchos::rcp(new Tpetra_MultiVector(m->getDomainMap(), nvec)); - const auto v = mv->template getLocalView(); + auto v = mv->getLocalViewHost(Tpetra::Access::OverwriteAll); const auto map = mv->getMap(); for (GO lid = 0; lid < v.extent_int(0); ++lid) for (LO col = 0; col < v.extent_int(1); ++col) { diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockRelaxation.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockRelaxation.cpp index aaf0e6d4ff55..3a6d3673ca62 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockRelaxation.cpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockRelaxation.cpp @@ -218,12 +218,12 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2BlockRelaxation, Test2, Scalar, LO, GO) TEST_INEQUALITY(&x, &y); // vector x and y are different - x.sync_host (); - y.sync_host (); - auto x_lcl_host = x.getLocalViewHost (); - auto y_lcl_host = x.getLocalViewHost (); + { + auto x_lcl_host = x.getLocalViewHost(Tpetra::Access::ReadOnly); + auto y_lcl_host = y.getLocalViewHost(Tpetra::Access::ReadOnly); - TEST_EQUALITY( x_lcl_host.data (), y_lcl_host.data () ); // vector x and y are pointing to the same memory location (such test only works if num of local elements != 0) + TEST_EQUALITY( x_lcl_host.data (), y_lcl_host.data () ); // vector x and y are pointing to the same memory location (such test only works if num of local elements != 0) + } prec.apply(x, y); 
@@ -707,12 +707,10 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2BlockRelaxation, TestDiagonalBlockCrsMa const Scalar exactSol = 0.2; - yBlock.sync_host(); for (int k = 0; k < num_rows_per_proc; ++k) { - typename BMV::little_host_vec_type ylcl = yBlock.getLocalBlock(k,0); - Scalar* yb = ylcl.data(); + auto ylcl = yBlock.getLocalBlock(k, 0, Tpetra::Access::ReadOnly); for (int j = 0; j < blockSize; ++j) { - TEST_FLOATING_EQUALITY(yb[j],exactSol,1e-14); + TEST_FLOATING_EQUALITY(ylcl(j), exactSol, 1e-14); } } } @@ -1269,13 +1267,11 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2BlockRelaxation, TestLowerTriangularBlo exactSol[1] = -0.25; exactSol[2] = 0.625; - yBlock.sync_host(); for (size_t k = 0; k < num_rows_per_proc; ++k) { LO lcl_row = k; - typename BMV::little_host_vec_type ylcl = yBlock.getLocalBlock(lcl_row,0); - Scalar* yb = ylcl.data(); + auto ylcl = yBlock.getLocalBlock(lcl_row, 0, Tpetra::Access::ReadOnly); for (int j = 0; j < blockSize; ++j) { - TEST_FLOATING_EQUALITY(yb[j],exactSol[k],1e-14); + TEST_FLOATING_EQUALITY(ylcl(j), exactSol[k], 1e-14); } } } @@ -1332,12 +1328,10 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2BlockRelaxation, TestUpperTriangularBlo exactSol[1] = -0.25; exactSol[2] = 0.5; - yBlock.sync_host(); for (int k = 0; k < num_rows_per_proc; ++k) { - typename BMV::little_host_vec_type ylcl = yBlock.getLocalBlock(k,0); - auto yb = ylcl.data(); + auto ylcl = yBlock.getLocalBlock(k, 0, Tpetra::Access::ReadOnly); for (int j = 0; j < blockSize; ++j) { - TEST_FLOATING_EQUALITY(yb[j],exactSol[k],1e-14); + TEST_FLOATING_EQUALITY(ylcl(j), exactSol[k], 1e-14); } } } diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestChebyshev.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestChebyshev.cpp index 47f7f303985f..8b37c1417e90 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestChebyshev.cpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestChebyshev.cpp @@ -114,15 +114,16 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2Chebyshev, 
Test0, Scalar, LocalOrdinal, prec.applyMat(x, y); - Teuchos::ArrayRCP yview = y.get1dView(); + { + Teuchos::ArrayRCP yview = y.get1dView(); - //Since crsmatrix is a diagonal matrix with 2 on the diagonal, - //y should be full of 2's now. + //Since crsmatrix is a diagonal matrix with 2 on the diagonal, + //y should be full of 2's now. - Teuchos::ArrayRCP twos(num_rows_per_proc*2, 2); + Teuchos::ArrayRCP twos(num_rows_per_proc*2, 2); - y.sync_host(); - TEST_COMPARE_FLOATING_ARRAYS(yview, twos(), Teuchos::ScalarTraits::eps()); + TEST_COMPARE_FLOATING_ARRAYS(yview, twos(), Teuchos::ScalarTraits::eps()); + } prec.apply(x, y); @@ -133,8 +134,10 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2Chebyshev, Test0, Scalar, LocalOrdinal, typename Teuchos::ScalarTraits::magnitudeType trial_tol = 1.e-13; typename Teuchos::ScalarTraits::magnitudeType tol = std::max(trial_tol, Teuchos::ScalarTraits::eps()); - y.sync_host(); - TEST_COMPARE_FLOATING_ARRAYS(yview, halfs(), tol); + { + Teuchos::ArrayRCP yview = y.get1dView(); + TEST_COMPARE_FLOATING_ARRAYS(yview, halfs(), tol); + } //If I now increase the degree of the polynomial to 4 the solve won't be //exact, but it should still be within a tol of 1.e-4 for this trivial data. 
@@ -144,8 +147,10 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2Chebyshev, Test0, Scalar, LocalOrdinal, tol = 1.e-4; - y.sync_host(); - TEST_COMPARE_FLOATING_ARRAYS(yview, halfs(), tol); + { + Teuchos::ArrayRCP yview = y.get1dView(); + TEST_COMPARE_FLOATING_ARRAYS(yview, halfs(), tol); + } } #define UNIT_TEST_GROUP_SC_LO_GO(Scalar,LocalOrdinal,GlobalOrdinal) \ diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestLocalSparseTriangularSolver.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestLocalSparseTriangularSolver.cpp index 77406b4cea66..82a8e1edc081 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestLocalSparseTriangularSolver.cpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestLocalSparseTriangularSolver.cpp @@ -126,17 +126,9 @@ localSolve (Tpetra::MultiVector< auto A_lcl = A.getLocalMatrix (); - // NOTE (mfh 20 Aug 2017): KokkosSparse::trsv currently is a - // sequential, host-only code. See - // https://github.com/kokkos/kokkos-kernels/issues/48. This means - // that we need to sync to host, then sync back to device when done. 
- X.sync_host (); - const_cast (Y).sync_host (); - X.modify_host (); // we will write to X - if (X.isConstantStride () && Y.isConstantStride ()) { - auto X_lcl = X.getLocalViewHost (); - auto Y_lcl = Y.getLocalViewHost (); + auto X_lcl = X.getLocalViewHost (Tpetra::Access::OverwriteAll); + auto Y_lcl = Y.getLocalViewHost (Tpetra::Access::ReadOnly); KokkosSparse::trsv (uplo.c_str (), trans.c_str (), diag.c_str (), A_lcl, Y_lcl, X_lcl); } @@ -144,17 +136,14 @@ localSolve (Tpetra::MultiVector< const size_t numVecs = std::min (X.getNumVectors (), Y.getNumVectors ()); for (size_t j = 0; j < numVecs; ++j) { - auto X_j = X.getVector (j); - auto Y_j = X.getVector (j); - auto X_lcl = X_j->getLocalViewHost (); - auto Y_lcl = Y_j->getLocalViewHost (); + auto X_j = X.getVectorNonConst (j); + auto Y_j = Y.getVector (j); + auto X_lcl = X_j->getLocalViewHost (Tpetra::Access::OverwriteAll); + auto Y_lcl = Y_j->getLocalViewHost (Tpetra::Access::ReadOnly); KokkosSparse::trsv (uplo.c_str (), trans.c_str (), diag.c_str (), A_lcl, Y_lcl, X_lcl); } } - - X.template sync (); - const_cast (Y).template sync (); } template @@ -1077,9 +1066,7 @@ void testArrowMatrix (bool& success, Teuchos::FancyOStream& out) // Set up the right-hand side b. 
vec_type b (ranMap); { - b.sync_host (); - b.modify_host (); - auto b_lcl_2d = b.getLocalViewHost (); + auto b_lcl_2d = b.getLocalViewHost (Tpetra::Access::OverwriteAll); auto b_lcl_1d = Kokkos::subview (b_lcl_2d, Kokkos::ALL (), 0); for (LO i = 0; i < lclNumRows; ++i) { @@ -1093,7 +1080,6 @@ void testArrowMatrix (bool& success, Teuchos::FancyOStream& out) b_lcl_1d(i) = K; } } - b.template sync (); } // We solve Ax=b (with A = LU) by first solving Lc = b, and then @@ -1149,8 +1135,7 @@ void testArrowMatrix (bool& success, Teuchos::FancyOStream& out) { Teuchos::OSTab tab2 (out); - c.sync_host (); - auto c_lcl_2d = c.getLocalViewHost (); + auto c_lcl_2d = c.getLocalViewHost (Tpetra::Access::ReadOnly); auto c_lcl_1d = Kokkos::subview (c_lcl_2d, Kokkos::ALL (), 0); for (LO i = 0; i + 1 < lclNumRows; ++i) { @@ -1161,7 +1146,6 @@ void testArrowMatrix (bool& success, Teuchos::FancyOStream& out) TEST_EQUALITY( c_lcl_1d(i), c_i_expected ); } TEST_EQUALITY( c_lcl_1d(lclNumRows-1), c_n_expected ); - c.template sync (); } // lclSuccess = success ? 
1 : 0; // gblSuccess = 0; // to be revised @@ -1194,8 +1178,7 @@ void testArrowMatrix (bool& success, Teuchos::FancyOStream& out) { Teuchos::OSTab tab2 (out); - x.sync_host (); - auto x_lcl_2d = x.getLocalViewHost (); + auto x_lcl_2d = x.getLocalViewHost (Tpetra::Access::ReadOnly); auto x_lcl_1d = Kokkos::subview (x_lcl_2d, Kokkos::ALL (), 0); for (LO i = 0; i + 1 < lclNumRows; ++i) { @@ -1222,8 +1205,7 @@ void testArrowMatrix (bool& success, Teuchos::FancyOStream& out) { Teuchos::OSTab tab2 (out); - c.sync_host (); - auto c_lcl_2d = c.getLocalViewHost (); + auto c_lcl_2d = c.getLocalViewHost (Tpetra::Access::ReadOnly); auto c_lcl_1d = Kokkos::subview (c_lcl_2d, Kokkos::ALL (), 0); for (LO i = 0; i + 1 < lclNumRows; ++i) { @@ -1234,7 +1216,6 @@ void testArrowMatrix (bool& success, Teuchos::FancyOStream& out) TEST_EQUALITY( c_lcl_1d(i), c_i_expected ); } TEST_EQUALITY( c_lcl_1d(lclNumRows-1), c_n_expected ); - c.template sync (); } localSolve (x, *U, c, true, false, Teuchos::NO_TRANS); @@ -1242,8 +1223,7 @@ void testArrowMatrix (bool& success, Teuchos::FancyOStream& out) { Teuchos::OSTab tab2 (out); - x.sync_host (); - auto x_lcl_2d = x.getLocalViewHost (); + auto x_lcl_2d = x.getLocalViewHost (Tpetra::Access::ReadOnly); auto x_lcl_1d = Kokkos::subview (x_lcl_2d, Kokkos::ALL (), 0); for (LO i = 0; i + 1 < lclNumRows; ++i) { diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp index 0dff2c5bc87c..cb136bd10e61 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp @@ -113,11 +113,11 @@ typedef Tpetra::global_size_t GST; /***********************************************************************************/ -template +template void localReducedMatvec(const MatrixClass & A_lcl, - const MultiVectorClass & X_lcl, + const ConstMultiVectorClass & X_lcl, 
const int userNumRows, - MultiVectorClass & Y_lcl) { + const MultiVectorClass & Y_lcl) { using Teuchos::NO_TRANS; using execution_space = typename MatrixClass::execution_space; @@ -230,8 +230,8 @@ void reducedMatvec(const OverlappedMatrixClass & A, auto undA_lcl = undA->getLocalMatrix (); auto extA_lcl = extA->getLocalMatrix (); - auto X_lcl = X.getLocalViewDevice (); - auto Y_lcl = Y.getLocalViewDevice (); + auto X_lcl = X.getLocalViewDevice (Tpetra::Access::ReadOnly); + auto Y_lcl = Y.getLocalViewDevice (Tpetra::Access::OverwriteAll); // Do the "Local part" auto numLocalRows = undA->getNodeNumRows(); @@ -626,10 +626,9 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2OverlappingRowMatrix, reducedMatvec, Sc reducedMatvec(ovA,temp1,1,temp2); reducedMatvec(ovA,temp2,0,ovY); - // And yes, that int cast is really necessary - auto ovY_lcl = ovY.getLocalViewDevice(); - auto Y_lcl = y_overlap.getLocalViewDevice(); - auto ovYsub = Kokkos::subview(ovY_lcl,std::make_pair(0,(int)Y_lcl.extent(0)), Kokkos::ALL); + auto Y_lcl = y_overlap.getLocalViewDevice(Tpetra::Access::OverwriteAll); + auto ovY_lcl = ovY.getLocalViewDevice(Tpetra::Access::ReadOnly); + auto ovYsub = Kokkos::subview(ovY_lcl, std::make_pair(0, Y_lcl.extent(0)), Kokkos::ALL); Kokkos::deep_copy(Y_lcl,ovYsub); } diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestRBILUK.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestRBILUK.cpp index 3168278912d0..b55c06a1f0c7 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestRBILUK.cpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestRBILUK.cpp @@ -205,13 +205,11 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(RBILUK, LowerTriangularBlockCrsMatrix, Scalar, exactSol[1] = -0.25; exactSol[2] = 0.625; - yBlock.sync_host(); for (size_t k = 0; k < num_rows_per_proc; ++k) { LO lcl_row = k; - typename BMV::little_host_vec_type ylcl = yBlock.getLocalBlock(lcl_row,0); - Scalar* yb = ylcl.data(); + auto ylcl = yBlock.getLocalBlock(lcl_row, 0, Tpetra::Access::ReadOnly); for 
(int j = 0; j < blockSize; ++j) { - TEST_FLOATING_EQUALITY(yb[j],exactSol[k],1e-14); + TEST_FLOATING_EQUALITY(ylcl(j),exactSol[k],1e-14); } } } @@ -261,12 +259,10 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(RBILUK, UpperTriangularBlockCrsMatrix, Scalar, exactSol[1] = -0.25; exactSol[2] = 0.5; - yBlock.sync_host(); for (int k = 0; k < num_rows_per_proc; ++k) { - typename BMV::little_host_vec_type ylcl = yBlock.getLocalBlock(k,0); - Scalar* yb = ylcl.data(); + auto ylcl = yBlock.getLocalBlock(k, 0, Tpetra::Access::ReadOnly); for (int j = 0; j < blockSize; ++j) { - TEST_FLOATING_EQUALITY(yb[j],exactSol[k],1e-14); + TEST_FLOATING_EQUALITY(ylcl(j), exactSol[k], 1e-14); } } } @@ -316,12 +312,10 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(RBILUK, FullLocalBlockCrsMatrix, Scalar, Local exactSol[1] = -4.0/21.0; exactSol[2] = 2.0/7.0; - yBlock.sync_host(); for (int k = 0; k < num_rows_per_proc; ++k) { - typename BMV::little_host_vec_type ylcl = yBlock.getLocalBlock(k,0); - Scalar* yb = ylcl.data(); + auto ylcl = yBlock.getLocalBlock(k, 0, Tpetra::Access::ReadOnly); for (int j = 0; j < blockSize; ++j) { - TEST_FLOATING_EQUALITY(yb[j],exactSol[k],1e-14); + TEST_FLOATING_EQUALITY(ylcl(j), exactSol[k], 1e-14); } } } @@ -384,8 +378,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(RBILUK, BandedBlockCrsMatrixWithDropping, Scal prec_crs.apply(x, z); - y.sync_host(); - z.sync_host(); Teuchos::ArrayRCP zview = z.get1dView(); Teuchos::ArrayRCP yview = y.get1dView(); for (int k = 0; k < num_rows_per_proc; ++k) @@ -735,12 +727,10 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(RBILUK, DiagonalBlockCrsMatrix, Scalar, LocalO const Scalar exactSol = 0.2; - yBlock.sync_host(); for (int k = 0; k < num_rows_per_proc; ++k) { - typename BMV::little_host_vec_type ylcl = yBlock.getLocalBlock(k,0); - Scalar* yb = ylcl.data(); + auto ylcl = yBlock.getLocalBlock(k, 0, Tpetra::Access::ReadOnly); for (int j = 0; j < blockSize; ++j) { - TEST_FLOATING_EQUALITY(yb[j],exactSol,1e-14); + TEST_FLOATING_EQUALITY(ylcl(j), exactSol, 1e-14); } } 
} diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestRelaxation.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestRelaxation.cpp index 925b72a2f694..aaf7d573679e 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestRelaxation.cpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestRelaxation.cpp @@ -106,20 +106,25 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2Relaxation, Test0, Scalar, LocalOrdinal prec.applyMat (x, y); - Teuchos::ArrayRCP yview = y.get1dView(); + { + Teuchos::ArrayRCP yview = y.get1dView(); - //Since crsmatrix is a diagonal matrix with 2 on the diagonal, - //y should be full of 2's now. + //Since crsmatrix is a diagonal matrix with 2 on the diagonal, + //y should be full of 2's now. - Teuchos::ArrayRCP twos (num_rows_per_proc*2, 2); - TEST_COMPARE_FLOATING_ARRAYS(yview, twos(), Teuchos::ScalarTraits::eps()); + Teuchos::ArrayRCP twos (num_rows_per_proc*2, 2); + TEST_COMPARE_FLOATING_ARRAYS(yview, twos(), Teuchos::ScalarTraits::eps()); + } prec.apply(x, y); - //y should be full of 0.5's now. - Teuchos::ArrayRCP halfs(num_rows_per_proc*2, 0.5); + { + Teuchos::ArrayRCP yview = y.get1dView(); + + //y should be full of 0.5's now. + Teuchos::ArrayRCP halfs(num_rows_per_proc*2, 0.5); - y.sync_host(); - TEST_COMPARE_FLOATING_ARRAYS(yview, halfs(), Teuchos::ScalarTraits::eps()); + TEST_COMPARE_FLOATING_ARRAYS(yview, halfs(), Teuchos::ScalarTraits::eps()); + } } // Test apply() with x == y. @@ -194,8 +199,8 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2Relaxation, Test2, Scalar, LocalOrdinal TEST_INEQUALITY(&x, &y); // vector x and y are different // Vectors x and y point to the same data. 
- TEST_EQUALITY(x.getLocalViewHost ().data (), - y.getLocalViewHost ().data ()); + TEST_EQUALITY(x.getLocalViewHost (Tpetra::Access::ReadOnly).data (), + y.getLocalViewHost (Tpetra::Access::ReadOnly).data ()); prec.apply(x, y); @@ -909,12 +914,10 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2Relaxation, TestDiagonalBlockCrsMatrix, using mag_type = typename STS::magnitudeType; const auto tol = mag_type(100.0) * STS::eps(); - yBlock.sync_host(); for (int k = 0; k < num_rows_per_proc; ++k) { - typename BMV::little_host_vec_type ylcl = yBlock.getLocalBlock(k,0); - Scalar* yb = ylcl.data(); + auto ylcl = yBlock.getLocalBlock(k, 0, Tpetra::Access::ReadOnly); for (int j = 0; j < blockSize; ++j) { - TEST_FLOATING_EQUALITY(yb[j], exactSol, tol); + TEST_FLOATING_EQUALITY(ylcl(j), exactSol, tol); } } } @@ -1026,13 +1029,11 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2Relaxation, TestLowerTriangularBlockCrs exactSol[1] = -0.25; exactSol[2] = 0.625; - yBlock.sync_host(); for (size_t k = 0; k < num_rows_per_proc; ++k) { LO lcl_row = k; - typename BMV::little_host_vec_type ylcl = yBlock.getLocalBlock(lcl_row,0); - Scalar* yb = ylcl.data(); + auto ylcl = yBlock.getLocalBlock(lcl_row, 0, Tpetra::Access::ReadOnly); for (int j = 0; j < blockSize; ++j) { - TEST_FLOATING_EQUALITY(yb[j],exactSol[k],1e-14); + TEST_FLOATING_EQUALITY(ylcl(j), exactSol[k], 1e-14); } } } @@ -1079,12 +1080,10 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2Relaxation, TestUpperTriangularBlockCrs exactSol[1] = -0.25; exactSol[2] = 0.5; - yBlock.sync_host(); for (int k = 0; k < num_rows_per_proc; ++k) { - typename BMV::little_host_vec_type ylcl = yBlock.getLocalBlock(k,0); - auto yb = ylcl.data(); + auto ylcl = yBlock.getLocalBlock(k, 0, Tpetra::Access::ReadOnly); for (int j = 0; j < blockSize; ++j) { - TEST_FLOATING_EQUALITY(yb[j],exactSol[k],1e-14); + TEST_FLOATING_EQUALITY(ylcl(j), exactSol[k], 1e-14); } } } diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestSingleProcessRILUK.cpp 
b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestSingleProcessRILUK.cpp index f6f5cdb95caf..d9ac6214f887 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestSingleProcessRILUK.cpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestSingleProcessRILUK.cpp @@ -142,9 +142,8 @@ void remove_diags_and_scale(const MatrixType& L, const MatrixType& U, Ln_rowmap, Ln_entries, Ln_values)); Un = Teuchos::rcp (new MatrixType (U.getRowMap(), U.getColMap(), Un_rowmap, Un_entries, Un_values)); - auto Dn_view = Dn->getLocalViewDevice(); + auto Dn_view = Dn->getLocalViewDevice(Tpetra::Access::OverwriteAll); Kokkos::deep_copy(subview(Dn_view,Kokkos::ALL(), 0),Dn_values); - Dn->sync_host(); Ln->fillComplete(); Un->fillComplete(); diff --git a/packages/kokkos-kernels/.gitignore b/packages/kokkos-kernels/.gitignore deleted file mode 100644 index fa032cb2cb83..000000000000 --- a/packages/kokkos-kernels/.gitignore +++ /dev/null @@ -1,10 +0,0 @@ -# Standard ignores -*~ -*.pyc -\#*# -.#* -.*.swp -.cproject -.project -*.o -TAGS diff --git a/packages/kokkos-kernels/.jenkins/nightly.groovy b/packages/kokkos-kernels/.jenkins/nightly.groovy new file mode 100644 index 000000000000..41e4daf71e79 --- /dev/null +++ b/packages/kokkos-kernels/.jenkins/nightly.groovy @@ -0,0 +1,41 @@ +pipeline { + agent none + + stages { + stage('HIP-ROCm-3.10-C++14') { + agent { + dockerfile { + filename 'Dockerfile.hip' + dir 'scripts/docker' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:3.10' + label 'rocm-docker && vega' + args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' + } + } + steps { + sh '''rm -rf kokkos && + git clone -b develop https://github.com/kokkos/kokkos.git && cd kokkos && \ + mkdir build && cd build && \ + cmake \ + -DCMAKE_CXX_COMPILER=hipcc \ + -DKokkos_ENABLE_HIP=ON \ + -DKokkos_ARCH_VEGA906=ON \ + .. 
&& \ + make -j8 && make install && \ + cd ../.. && rm -rf kokkos''' + sh '''rm -rf build && mkdir -p build && cd build && \ + cmake \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_CXX_COMPILER=hipcc \ + -DKokkosKernels_ENABLE_TESTS=ON \ + -DKokkosKernels_ENABLE_EXAMPLES=ON \ + -DKokkos_ENABLE_HIP=ON \ + -DKokkosKernels_INST_DOUBLE=ON \ + -DKokkosKernels_INST_ORDINAL_INT=ON \ + -DKokkosKernels_INST_OFFSET_INT=ON \ + .. && \ + make -j8 && ctest --verbose''' + } + } + } +} diff --git a/packages/kokkos-kernels/BUILD.md b/packages/kokkos-kernels/BUILD.md index 19ea0fd57325..023cf96f4e77 100644 --- a/packages/kokkos-kernels/BUILD.md +++ b/packages/kokkos-kernels/BUILD.md @@ -125,6 +125,12 @@ endif() * CUSPARSE_LIBRARY_DIRS: STRING * Optional override for the library directories that comprise TPL CUSPARSE. * Default: None. Default common library locations will be searched +* ARMPL_LIBRARIES: STRING + * Optional override for the libraries that comprise TPL ARMPL. + * Default: None. Default common library names will be searched +* ARMPL_LIBRARY_DIRS: STRING + * Optional override for the library directories that comprise TPL ARMPL. + * Default: None. Default common library locations will be searched * KokkosKernels_BLAS_ROOT: PATH * Location of BLAS install root. * Default: None or the value of the environment variable BLAS_ROOT if set @@ -161,6 +167,9 @@ endif() * KokkosKernels_ENABLE_TPL_MKL: BOOL * Whether to enable MKL * Default: OFF +* KokkosKernels_ENABLE_TPL_ARMPL: BOOL + * Whether to enable ARMPL + * Default: OFF * KokkosKernels_ETI_ONLY: BOOL * Whether to restrict availability of kernels to ETI types only. Turning this on guarantees that kernels are never built inside of object files which simply call KokkosKernels functions. 
* Default: OFF diff --git a/packages/kokkos-kernels/CHANGELOG.md b/packages/kokkos-kernels/CHANGELOG.md index 05b18d1c8655..911bb3219754 100644 --- a/packages/kokkos-kernels/CHANGELOG.md +++ b/packages/kokkos-kernels/CHANGELOG.md @@ -1,5 +1,40 @@ # Change Log +## [3.4.00](https://github.com/kokkos/kokkos-kernels/tree/3.4.00) (2021-04-25) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.3.01...3.4.00) + +**Features:** +- SYCL: adding ETI and CMake logic for SYCL backend [\#924](https://github.com/kokkos/kokkos/pull/924) + +**Implemented enhancements Algorithms and Archs:** +- Two-stage GS: add damping factors [\#921](https://github.com/kokkos/kokkos/pull/921) +- Supernodal SpTRSV, improve symbolic performance [\#899](https://github.com/kokkos/kokkos/pull/899) +- Add MKL SpMV wrapper [\#895](https://github.com/kokkos/kokkos/pull/895) +- Serial code path for spmv [\#893](https://github.com/kokkos/kokkos/pull/893) + +**Implemented enhancements BuildSystem:** +- Cmake: Update ArmPL support [\#901](https://github.com/kokkos/kokkos/pull/901) +- Cmake: Add ARMPL TPL support [\#880](https://github.com/kokkos/kokkos/pull/880) +- IntelClang guarding __assume_aligned with !defined(__clang__) [\#878](https://github.com/kokkos/kokkos/pull/878) + +**Implemented enhancements Other:** +- Add static_assert/throw in batched eigendecomp [\#931](https://github.com/kokkos/kokkos/pull/931) +- Workaround using new/delete in kernel code [\#925](https://github.com/kokkos/kokkos/pull/925) +- Blas perf_test updates [\#892](https://github.com/kokkos/kokkos/pull/892) + +**Fixed bugs:** +- Fix ctor CrsMat mirror with CrsGraph mirror [\#918](https://github.com/kokkos/kokkos/pull/918) +- Fix nrm1, removed cublas nrminf, improved blas tests [\#915](https://github.com/kokkos/kokkos/pull/915) +- Fix and testing coverage mainly in graph coarsening [\#910](https://github.com/kokkos/kokkos/pull/910) +- Fix KokkosSparse for nightly test failure 
[\#898](https://github.com/kokkos/kokkos/pull/898) +- Fix view types across ternary operator [\#894](https://github.com/kokkos/kokkos/pull/894) +- Make work_view_t typedef consistent [\#885](https://github.com/kokkos/kokkos/pull/885) +- Fix supernodal SpTRSV build with serial+openmp+cuda [\#884](https://github.com/kokkos/kokkos/pull/884) +- Construct SpGEMM C with correct ncols [\#883](https://github.com/kokkos/kokkos/pull/883) +- Matrix Converter: fixing issue with deallocation after Kokkos::fininalize [\#882](https://github.com/kokkos/kokkos/pull/882) +- Fix >1024 team size error in sort_crs_* [\#872](https://github.com/kokkos/kokkos/pull/872) +- Fixing seg fault with empty matrix in kspiluk [\#871](https://github.com/kokkos/kokkos/pull/871) + ## [3.3.01](https://github.com/kokkos/kokkos-kernels/tree/3.3.01) (2021-01-18) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.3.00...3.3.01) diff --git a/packages/kokkos-kernels/CMakeLists.txt b/packages/kokkos-kernels/CMakeLists.txt index 1b9000cddb4e..1f698db6683a 100644 --- a/packages/kokkos-kernels/CMakeLists.txt +++ b/packages/kokkos-kernels/CMakeLists.txt @@ -24,8 +24,8 @@ IF(NOT KOKKOSKERNELS_HAS_TRILINOS) PROJECT(KokkosKernels CXX) ENDIF() SET(KokkosKernels_VERSION_MAJOR 3) - SET(KokkosKernels_VERSION_MINOR 3) - SET(KokkosKernels_VERSION_PATCH 1) + SET(KokkosKernels_VERSION_MINOR 4) + SET(KokkosKernels_VERSION_PATCH 0) ENDIF() IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.12.0") @@ -196,6 +196,10 @@ ELSE() MESSAGE("") # Skip building Kokkos Kernels if we are doing an installation test ADD_SUBDIRECTORY(src) + IF(KokkosKernels_ENABLE_INSTALL_TEST) + ADD_SUBDIRECTORY(install_test) + MESSAGE("The install test has been enabled, you will need to peform: make install before running the tests otherwise install_test will fail") + ENDIF() KOKKOSKERNELS_ADD_TEST_DIRECTORIES(test_common) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(perf_test) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(unit_test) diff --git 
a/packages/kokkos-kernels/CheckHostBlasReturnComplex.cmake b/packages/kokkos-kernels/CheckHostBlasReturnComplex.cmake index 78ae33515b03..30063b1cc3d3 100644 --- a/packages/kokkos-kernels/CheckHostBlasReturnComplex.cmake +++ b/packages/kokkos-kernels/CheckHostBlasReturnComplex.cmake @@ -5,7 +5,12 @@ FUNCTION(CHECK_HOST_BLAS_RETURN_COMPLEX VARNAME) IF (KOKKOSKERNELS_HAS_TRILINOS) SET(CMAKE_REQUIRED_LIBRARIES ${TPL_BLAS_LIBRARIES}) ELSE() - SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES}) + # For TPLs, just pull out the required libraries from the target properies. + IF (KOKKOSKERNELS_ENABLE_TPL_ARMPL) + GET_TARGET_PROPERTY(CMAKE_REQUIRED_LIBRARIES KokkosKernels::ARMPL INTERFACE_LINK_LIBRARIES) + ELSE() + SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES}) + ENDIF() ENDIF() SET(SOURCE diff --git a/packages/kokkos-kernels/cm_generate_makefile.bash b/packages/kokkos-kernels/cm_generate_makefile.bash index bb246df3c676..bb9913b05b0e 100755 --- a/packages/kokkos-kernels/cm_generate_makefile.bash +++ b/packages/kokkos-kernels/cm_generate_makefile.bash @@ -225,6 +225,8 @@ display_help_text() { echo "" echo "--with-cuda[=/Path/To/Cuda]: Enable Cuda and set path to Cuda Toolkit." echo "--with-hip[=/Path/To/Hip]: Enable Hip and set path to ROCM Toolkit." + echo "--with-openmptarget: Enable OpenMPTarget backend." + echo "--with-sycl: Enable Sycl backend." echo "--with-openmp: Enable OpenMP backend." echo "--with-pthread: Enable Pthreads backend." echo "--with-serial: Enable Serial backend." @@ -313,7 +315,7 @@ display_help_text() { echo "--with-tpls=[TPLS]: Set tpls to be instantiated (Proper support requies that appropriate compiler and device must be enabled)." 
echo " This may require providing paths and the library name if using custom installs not on a default path" echo " that CMake searches" - echo " Options: blas, mkl, cublas, cusparse, magma" + echo " Options: blas, mkl, cublas, cusparse, magma, armpl" echo "--user-blas-path=[PATH]: Set path to location of user-specified BLAS library." echo "--user-blas-lib=[LIB]: Library name of desired BLAS install." echo " Example: For the typical \"libblas.a\" provide \"blas\"" @@ -396,6 +398,12 @@ do --with-openmp) update_kokkos_devices OpenMP ;; + --with-openmptarget) + update_kokkos_devices OpenMPTarget + ;; + --with-sycl) + update_kokkos_devices Sycl + ;; --with-pthread) update_kokkos_devices Pthread ;; @@ -569,7 +577,7 @@ done if [ "$KOKKOS_CXX_STANDARD" == "" ]; then STANDARD_CMD= else - STANDARD_CMD=-DKokkos_CXX_STANDARD=${KOKKOS_CXX_STANDARD} + STANDARD_CMD=-DCMAKE_CXX_STANDARD=${KOKKOS_CXX_STANDARD} fi if [ "$COMPILER" == "" ]; then diff --git a/packages/kokkos-kernels/cmake/Dependencies.cmake b/packages/kokkos-kernels/cmake/Dependencies.cmake index 66990dd1264d..0aa97b1d6ce1 100644 --- a/packages/kokkos-kernels/cmake/Dependencies.cmake +++ b/packages/kokkos-kernels/cmake/Dependencies.cmake @@ -1,5 +1,5 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers KokkosAlgorithms - LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE MAGMA SUPERLU CHOLMOD LAPACKE CBLAS + LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE MAGMA SUPERLU CHOLMOD LAPACKE CBLAS ARMPL TEST_OPTIONAL_TPLS yaml-cpp ) diff --git a/packages/kokkos-kernels/cmake/KokkosKernelsConfig.cmake.in b/packages/kokkos-kernels/cmake/KokkosKernelsConfig.cmake.in index 31d77bda9444..6b95ff91aec9 100644 --- a/packages/kokkos-kernels/cmake/KokkosKernelsConfig.cmake.in +++ b/packages/kokkos-kernels/cmake/KokkosKernelsConfig.cmake.in @@ -12,6 +12,7 @@ find_dependency(Kokkos HINTS @Kokkos_DIR@) SET(Kokkos_ENABLE_OPENMP @Kokkos_ENABLE_OPENMP@) SET(Kokkos_ENABLE_CUDA @Kokkos_ENABLE_CUDA@) 
SET(Kokkos_ENABLE_HIP @Kokkos_ENABLE_HIP@) +SET(Kokkos_ENABLE_SYCL @Kokkos_ENABLE_SYCL@) SET(Kokkos_ENABLE_PTHREAD @Kokkos_ENABLE_PTHREAD@) SET(Kokkos_ENABLE_SERIAL @Kokkos_ENABLE_SERIAL@) diff --git a/packages/kokkos-kernels/cmake/KokkosKernels_config.h.in b/packages/kokkos-kernels/cmake/KokkosKernels_config.h.in index c0a1e98ec665..9326edc47ac9 100644 --- a/packages/kokkos-kernels/cmake/KokkosKernels_config.h.in +++ b/packages/kokkos-kernels/cmake/KokkosKernels_config.h.in @@ -37,6 +37,10 @@ /* Whether to build kernels for execution space Kokkos::Experimental::HIP */ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_HIP #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_HIPSPACE +/* Whether to build kernels for execution space Kokkos::Experimental::SYCL */ +#cmakedefine KOKKOSKERNELS_INST_EXECSPACE_SYCL +#cmakedefine KOKKOSKERNELS_INST_MEMSPACE_SYCLSPACE +#cmakedefine KOKKOSKERNELS_INST_MEMSPACE_SYCLSHAREDSPACE /* Whether to build kernels for execution space Kokkos::OpenMP */ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_OPENMP /* Whether to build kernels for execution space Kokkos::Threads */ @@ -102,11 +106,14 @@ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_LAPACKE /* METIS */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_METIS +/* ARMPL */ +#cmakedefine KOKKOSKERNELS_ENABLE_TPL_ARMPL #cmakedefine KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV -/* if MKL, BLAS is also defined */ -#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) +/* if MKL or ARMPL, BLAS is also defined */ +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) ||\ + defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) #if !defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) #define KOKKOSKERNELS_ENABLE_TPL_BLAS #endif diff --git a/packages/kokkos-kernels/cmake/Modules/FindTPLARMPL.cmake b/packages/kokkos-kernels/cmake/Modules/FindTPLARMPL.cmake new file mode 100644 index 000000000000..62e1e33ea316 --- /dev/null +++ b/packages/kokkos-kernels/cmake/Modules/FindTPLARMPL.cmake @@ -0,0 +1,47 @@ +# Both the armpl_mp and armpl libraries define the same public symbol names. 
+# In order to link against the openmp armpl symbols, instruct cmake to link against armpl_mp. +# In order to link against the default armpl symbols, instruct cmake to link against armpl. +IF(KOKKOSKERNELS_INST_EXECSPACE_OPENMP) + SET(ARMPL_LIB armpl_mp) +ELSE() + SET(ARMPL_LIB armpl) +ENDIF() + +IF (ARMPL_LIBRARY_DIRS AND ARMPL_LIBRARIES) + KOKKOSKERNELS_FIND_IMPORTED(ARMPL INTERFACE LIBRARIES ${ARMPL_LIBRARIES} LIBRARY_PATHS ${ARMPL_LIBRARY_DIRS}) +ELSEIF (ARMPL_LIBRARIES) + KOKKOSKERNELS_FIND_IMPORTED(ARMPL INTERFACE LIBRARIES ${ARMPL_LIBRARIES}) +ELSEIF (ARMPL_LIBRARY_DIRS) + KOKKOSKERNELS_FIND_IMPORTED(ARMPL INTERFACE LIBRARIES amath ${ARMPL_LIB} LIBRARY_PATHS ${ARMPL_LIBRARY_DIRS}) +ELSEIF (DEFINED ENV{ARMPL_DIR}) + SET(ARMPL_ROOT $ENV{ARMPL_DIR}) + KOKKOSKERNELS_FIND_IMPORTED(ARMPL INTERFACE + LIBRARIES + amath + ${ARMPL_LIB} + LIBRARY_PATHS + ${ARMPL_ROOT}/lib + HEADERS + armpl.h + HEADER_PATHS + ${ARMPL_ROOT}/include + ) +ELSE() + FIND_PACKAGE(ARMPL REQUIRED) + KOKKOSKERNELS_CREATE_IMPORTED_TPL(ARMPL INTERFACE LINK_LIBRARIES ${ARMPL_LIBRARIES}) +ENDIF() + +TRY_COMPILE(KOKKOSKERNELS_TRY_COMPILE_ARMPL + ${KOKKOSKERNELS_TOP_BUILD_DIR}/tpl_tests + ${KOKKOSKERNELS_TOP_SOURCE_DIR}/cmake/compile_tests/armpl.cpp + LINK_LIBRARIES -l${ARMPL_LIB} -lgfortran -lamath -lm + OUTPUT_VARIABLE KOKKOSKERNELS_TRY_COMPILE_ARMPL_OUT) +IF(NOT KOKKOSKERNELS_TRY_COMPILE_ARMPL) + MESSAGE(FATAL_ERROR "KOKKOSKERNELS_TRY_COMPILE_ARMPL_OUT=${KOKKOSKERNELS_TRY_COMPILE_ARMPL_OUT}") +ELSE() + # KokkosKernels::ARMPL is an alias to the ARMPL target. + # Let's add in the libgfortran and libm dependencies for users here. 
+ GET_TARGET_PROPERTY(ARMPL_INTERFACE_LINK_LIBRARIES KokkosKernels::ARMPL INTERFACE_LINK_LIBRARIES) + SET(ARMPL_INTERFACE_LINK_LIBRARIES "${ARMPL_INTERFACE_LINK_LIBRARIES};-lgfortran;-lm") + SET_TARGET_PROPERTIES(ARMPL PROPERTIES INTERFACE_LINK_LIBRARIES "${ARMPL_INTERFACE_LINK_LIBRARIES}") +ENDIF() diff --git a/packages/kokkos-kernels/cmake/compile_tests/armpl.cpp b/packages/kokkos-kernels/cmake/compile_tests/armpl.cpp new file mode 100644 index 000000000000..9bb1c4839277 --- /dev/null +++ b/packages/kokkos-kernels/cmake/compile_tests/armpl.cpp @@ -0,0 +1,5 @@ +#include + +int main(void) { + return 0; +} diff --git a/packages/kokkos-kernels/cmake/kokkos_backends.cmake b/packages/kokkos-kernels/cmake/kokkos_backends.cmake index c2f46bb8e304..eb7d8602b7d3 100644 --- a/packages/kokkos-kernels/cmake/kokkos_backends.cmake +++ b/packages/kokkos-kernels/cmake/kokkos_backends.cmake @@ -11,6 +11,7 @@ ENDMACRO(CHECK_KOKKOS_BACKEND) CHECK_KOKKOS_BACKEND(CUDA) CHECK_KOKKOS_BACKEND(HIP) +CHECK_KOKKOS_BACKEND(SYCL) CHECK_KOKKOS_BACKEND(OPENMP) CHECK_KOKKOS_BACKEND(PTHREAD) CHECK_KOKKOS_BACKEND(SERIAL) diff --git a/packages/kokkos-kernels/cmake/kokkoskernels_eti_devices.cmake b/packages/kokkos-kernels/cmake/kokkoskernels_eti_devices.cmake index ede934023cbc..ad7ef15e55a7 100644 --- a/packages/kokkos-kernels/cmake/kokkoskernels_eti_devices.cmake +++ b/packages/kokkos-kernels/cmake/kokkoskernels_eti_devices.cmake @@ -5,12 +5,14 @@ SET(EXEC_SPACES EXECSPACE_CUDA EXECSPACE_HIP + EXECSPACE_SYCL EXECSPACE_OPENMP EXECSPACE_PTHREAD EXECSPACE_SERIAL ) SET(EXECSPACE_CUDA_CPP_TYPE Kokkos::Cuda) SET(EXECSPACE_HIP_CPP_TYPE Kokkos::Experimental::HIP) +SET(EXECSPACE_SYCL_CPP_TYPE Kokkos::Experimental::SYCL) SET(EXECSPACE_OPENMP_CPP_TYPE Kokkos::OpenMP) SET(EXECSPACE_PTHREAD_CPP_TYPE Kokkos::Threads) SET(EXECSPACE_SERIAL_CPP_TYPE Kokkos::Serial) @@ -19,14 +21,18 @@ SET(MEM_SPACES MEMSPACE_CUDASPACE MEMSPACE_CUDAUVMSPACE MEMSPACE_HIPSPACE + MEMSPACE_SYCLSPACE + MEMSPACE_SYCLSHAREDSPACE 
MEMSPACE_HOSTSPACE MEMSPACE_HBWSPACE ) -SET(MEMSPACE_CUDASPACE_CPP_TYPE Kokkos::CudaSpace) -SET(MEMSPACE_CUDAUVMSPACE_CPP_TYPE Kokkos::CudaUVMSpace) -SET(MEMSPACE_HIPSPACE_CPP_TYPE Kokkos::Experimental::HIPSpace) -SET(MEMSPACE_HOSTSPACE_CPP_TYPE Kokkos::HostSpace) -SET(MEMSPACE_HBWSPACE_CPP_TYPE Kokkos::HBWSpace) +SET(MEMSPACE_CUDASPACE_CPP_TYPE Kokkos::CudaSpace) +SET(MEMSPACE_CUDAUVMSPACE_CPP_TYPE Kokkos::CudaUVMSpace) +SET(MEMSPACE_HIPSPACE_CPP_TYPE Kokkos::Experimental::HIPSpace) +SET(MEMSPACE_SYCLSPACE_CPP_TYPE Kokkos::Experimental::SYCLDeviceUSMSpace) +SET(MEMSPACE_SYCLSHAREDSPACE_CPP_TYPE Kokkos::Experimental::SYCLSharedUSMSpace) +SET(MEMSPACE_HOSTSPACE_CPP_TYPE Kokkos::HostSpace) +SET(MEMSPACE_HBWSPACE_CPP_TYPE Kokkos::HBWSpace) IF(KOKKOS_ENABLE_CUDA) KOKKOSKERNELS_ADD_OPTION( @@ -85,6 +91,33 @@ IF(KOKKOS_ENABLE_HIP) ENDIF() +IF(KOKKOS_ENABLE_SYCL) + KOKKOSKERNELS_ADD_OPTION( + INST_EXECSPACE_SYCL + ${KOKKOSKERNELS_INST_EXECSPACE_SYCL_DEFAULT} + BOOL + "Whether to pre instantiate kernels for the execution space Kokkos::Experimental::SYCL. Disabling this when Kokkos_ENABLE_SYCL is enabled may increase build times. Default: ON if Kokkos is SYCL-enabled, OFF otherwise." + ) + KOKKOSKERNELS_ADD_OPTION( + INST_MEMSPACE_SYCLSPACE + ${KOKKOSKERNELS_INST_EXECSPACE_SYCL_DEFAULT} + BOOL + "Whether to pre instantiate kernels for the memory space Kokkos::Experimental::SYCLSpace. Disabling this when Kokkos_ENABLE_SYCL is enabled may increase build times. Default: ON if Kokkos is SYCL-enabled, OFF otherwise." 
+ ) + + IF(KOKKOSKERNELS_INST_EXECSPACE_SYCL AND KOKKOSKERNELS_INST_MEMSPACE_SYCLSPACE) + LIST(APPEND DEVICE_LIST "") + ENDIF() + IF(KOKKOSKERNELS_INST_EXECSPACE_SYCL AND KOKKOSKERNELS_INST_MEMSPACE_SYCLSHAREDSPACE) + LIST(APPEND DEVICE_LIST "") + ENDIF() + + IF( Trilinos_ENABLE_COMPLEX_DOUBLE AND ((NOT DEFINED CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS) OR (NOT CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS)) ) + MESSAGE( WARNING "The CMake option CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS is either undefined or OFF. Please set CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS:BOOL=ON when building with SYCL and complex double enabled.") + ENDIF() + +ENDIF() + KOKKOSKERNELS_ADD_OPTION( INST_MEMSPACE_HOSTSPACE ${KOKKOSKERNELS_ADD_DEFAULT_ETI} @@ -138,6 +171,7 @@ KOKKOSKERNELS_ADD_OPTION( SET(EXECSPACE_CUDA_VALID_MEM_SPACES CUDASPACE CUDAUVMSPACE) SET(EXECSPACE_HIP_VALID_MEM_SPACES HIPSPACE) +SET(EXECSPACE_SYCL_VALID_MEM_SPACES SYCLSPACE SYCLSHAREDSPACE) SET(EXECSPACE_SERIAL_VALID_MEM_SPACES HBWSPACE HOSTSPACE) SET(EXECSPACE_OPENMP_VALID_MEM_SPACES HBWSPACE HOSTSPACE) SET(EXECSPACE_PTHREAD_VALID_MEM_SPACES HBWSPACE HOSTSPACE) diff --git a/packages/kokkos-kernels/cmake/kokkoskernels_features.cmake b/packages/kokkos-kernels/cmake/kokkoskernels_features.cmake index 2212332b7d66..6f4561f664fc 100644 --- a/packages/kokkos-kernels/cmake/kokkoskernels_features.cmake +++ b/packages/kokkos-kernels/cmake/kokkoskernels_features.cmake @@ -24,7 +24,7 @@ KOKKOSKERNELS_FEATURE_DEPENDS_ON_TPLS( # Fortran Complex BLAS # ================================================================== -IF (KOKKOSKERNELS_ENABLE_TPL_BLAS OR KOKKOSKERNELS_ENABLE_TPL_MKL) +IF (KOKKOSKERNELS_ENABLE_TPL_BLAS OR KOKKOSKERNELS_ENABLE_TPL_MKL OR KOKKOSKERNELS_ENABLE_TPL_ARMPL) INCLUDE(CheckHostBlasReturnComplex.cmake) CHECK_HOST_BLAS_RETURN_COMPLEX(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX) ENDIF() diff --git a/packages/kokkos-kernels/cmake/kokkoskernels_tpls.cmake b/packages/kokkos-kernels/cmake/kokkoskernels_tpls.cmake index 
08230dd987f8..2bdcda1e81fb 100644 --- a/packages/kokkos-kernels/cmake/kokkoskernels_tpls.cmake +++ b/packages/kokkos-kernels/cmake/kokkoskernels_tpls.cmake @@ -420,11 +420,12 @@ KOKKOSKERNELS_ADD_TPL_OPTION(MKL OFF "Whether to enable MKL") KOKKOSKERNELS_ADD_TPL_OPTION(MAGMA OFF "Whether to enable MAGMA") KOKKOSKERNELS_ADD_TPL_OPTION(CBLAS OFF "Whether to enable CBLAS") KOKKOSKERNELS_ADD_TPL_OPTION(LAPACKE OFF "Whether to enable LAPACKE") +KOKKOSKERNELS_ADD_TPL_OPTION(ARMPL OFF "Whether to enable ARMPL") # Set F77_BLAS_MANGLE macro based on Fortran-C interface (unless already set # by Trilinos or user) IF ("${F77_BLAS_MANGLE}" STREQUAL "") - IF (KOKKOSKERNELS_ENABLE_TPL_BLAS OR KOKKOSKERNELS_ENABLE_TPL_MKL OR KOKKOSKERNELS_ENABLE_TPL_MAGMA) + IF (KOKKOSKERNELS_ENABLE_TPL_BLAS OR KOKKOSKERNELS_ENABLE_TPL_MKL OR KOKKOSKERNELS_ENABLE_TPL_MAGMA OR KOKKOSKERNELS_ENABLE_TPL_ARMPL) ENABLE_LANGUAGE(C) ENABLE_LANGUAGE(Fortran) INCLUDE(FortranCInterface) @@ -481,6 +482,7 @@ IF (NOT KOKKOSKERNELS_HAS_TRILINOS) KOKKOSKERNELS_IMPORT_TPL(CHOLMOD) KOKKOSKERNELS_IMPORT_TPL(SUPERLU) KOKKOSKERNELS_IMPORT_TPL(METIS) + KOKKOSKERNELS_IMPORT_TPL(ARMPL) ENDIF() #Convert list to newlines (which CMake doesn't always like in cache variables) diff --git a/packages/kokkos-kernels/cmake/kokkoskernels_tribits.cmake b/packages/kokkos-kernels/cmake/kokkoskernels_tribits.cmake index 4eebb97c7b9e..b023d7c4d2eb 100644 --- a/packages/kokkos-kernels/cmake/kokkoskernels_tribits.cmake +++ b/packages/kokkos-kernels/cmake/kokkoskernels_tribits.cmake @@ -149,12 +149,13 @@ IF (IS_ENABLED) IF (KOKKOSKERNELS_HAS_TRILINOS) TRIBITS_ADD_EXECUTABLE(${EXE_NAME} SOURCES ${PARSE_SOURCES} - TESTONLYLIBS ${TESTONLYLIBS}) + TESTONLYLIBS ${PARSE_TESTONLYLIBS}) ELSE() ADD_EXECUTABLE(${EXE_NAME} ${PARSE_SOURCES}) - TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE Kokkos::kokkoskernels) IF (PARSE_TESTONLYLIBS) - TARGET_LINK_LIBRARIES(${EXE_NAME} ${PARSE_TESTONLYLIBS}) + TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE 
Kokkos::kokkoskernels ${PARSE_TESTONLYLIBS}) + ELSE () + TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE Kokkos::kokkoskernels) ENDIF() ENDIF() ELSE() diff --git a/packages/kokkos-kernels/example/buildlib/compileKokkosKernels.sh b/packages/kokkos-kernels/example/buildlib/compileKokkosKernels.sh index 9f5978bb5836..4bf57ce31632 100755 --- a/packages/kokkos-kernels/example/buildlib/compileKokkosKernels.sh +++ b/packages/kokkos-kernels/example/buildlib/compileKokkosKernels.sh @@ -1,13 +1,22 @@ -KOKKOS_PATH=${HOME}/work/kokkos #path to kokkos source -KOKKOSKERNELS_SCALARS='double,"complex"' #the scalar types to instantiate =double,float... -KOKKOSKERNELS_LAYOUTS=LayoutLeft #the layout types to instantiate. -KOKKOSKERNELS_ORDINALS=int,long #ordinal types to instantiate -KOKKOSKERNELS_OFFSETS=int,size_t #offset types to instantiate -KOKKOSKERNELS_PATH=../.. #path to kokkos-kernels top directory. -KOKKOSKERNELS_OPTIONS=eti-only #options for kokkoskernels +#!/bin/bash +# Requires cmake version > 3.12 +# Paths to source +KOKKOS_PATH="${HOME}/Kokkos/kokkos" #path to kokkos source +KOKKOSKERNELS_PATH="../.." #path to kokkos-kernels top directory + +# Compiler - must be passed to kokkos and kokkos-kernels configurations +CXX=${KOKKOS_PATH}/bin/nvcc_wrapper #Options: icpc #g++ #clang++ CXXFLAGS="-Wall -pedantic -Werror -O3 -g -Wshadow -Wsign-compare -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized" -CXX=${KOKKOS_PATH}/bin/nvcc_wrapper #icpc # -KOKKOS_DEVICES=Serial,Cuda,OpenMP #devices Cuda... 
-KOKKOS_ARCHS=Pascal60,Power8 -../../scripts/generate_makefile.bash --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=${KOKKOSKERNELS_SCALARS} --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --kokkos-path=${KOKKOS_PATH} --with-devices=${KOKKOS_DEVICES} --arch=${KOKKOS_ARCHS} --compiler=${CXX} --with-options=${KOKKOSKERNELS_OPTIONS} --cxxflags="${CXXFLAGS}" +# Configure Kokkos (Unit Tests OFF) - Makefile located in kokkos-build +cmake -Bkokkos-build -DCMAKE_CXX_COMPILER=${CXX} -DKokkos_ARCH_PASCAL60=ON -DKokkos_ARCH_POWER8=ON -DKokkos_ENABLE_CUDA=ON -DKokkos_ENABLE_SERIAL=ON -DKokkos_ENABLE_OPENMP=ON -DKokkos_ENABLE_CUDA_LAMBDA=ON -DCMAKE_CXX_FLAGS="${CXXFLAGS}" -DCMAKE_INSTALL_PREFIX="${PWD}/kokkos-install" -DKokkos_ENABLE_TESTS=OFF ${KOKKOS_PATH} + +# Build and Install Kokkos - install lib at ${PWD}/kokkos-install +cmake --build kokkos-build -j 8 --target install + + +# Configure KokkosKernels (Unit Tests OFF) - Makefile located in kokkoskernels-build +cmake -Bkokkoskernels-build -DCMAKE_CXX_COMPILER=${CXX} -DKokkos_ROOT="${PWD}/kokkos-install" -DKokkosKernels_INST_DOUBLE=ON -DKokkosKernels_INST_COMPLEX_DOUBLE=ON -DKokkosKernels_INST_ORDINAL_INT=ON -DKokkosKernels_INST_ORDINAL_INT64_T=ON -DKokkosKernels_INST_OFFSET_INT=ON -DKokkosKernels_INST_OFFSET_SIZE_T=ON -DKokkosKernels_INST_LAYOUTLEFT=ON -DKokkosKernels_ADD_DEFAULT_ETI=ON -DCMAKE_INSTALL_PREFIX="${PWD}/kokkoskernels-install" -DKokkosKernels_ENABLE_TESTS=OFF -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF ${KOKKOSKERNELS_PATH} + +# Build and Install KokkosKernels - install lib at ${PWD}/kokkoskernels-install +cmake --build kokkoskernels-build -j 8 --target install diff --git a/packages/kokkos-kernels/example/buildlib/compileKokkosKernelsSimple.sh b/packages/kokkos-kernels/example/buildlib/compileKokkosKernelsSimple.sh index 20d0a7aef47b..9502235aba1e 100755 --- a/packages/kokkos-kernels/example/buildlib/compileKokkosKernelsSimple.sh +++ 
b/packages/kokkos-kernels/example/buildlib/compileKokkosKernelsSimple.sh @@ -1,13 +1,17 @@ -KOKKOS_PATH=${HOME}/proj/kokkos #path to kokkos source +KOKKOS_PATH="${HOME}/Kokkos/kokkos" #path to kokkos source +KOKKOSKERNELS_PATH="../.." #path to kokkos-kernels top directory. + KOKKOSKERNELS_SCALARS=double #the scalar types to instantiate =double,float... KOKKOSKERNELS_LAYOUTS=LayoutLeft #the layout types to instantiate. KOKKOSKERNELS_ORDINALS=int #ordinal types to instantiate KOKKOSKERNELS_OFFSETS=int #offset types to instantiate -KOKKOSKERNELS_PATH=../.. #path to kokkos-kernels top directory. -CXX=${KOKKOS_PATH}/bin/nvcc_wrapper #icpc # +CXX=${KOKKOS_PATH}/bin/nvcc_wrapper KOKKOSKERNELS_OPTIONS=eti-only #options for kokkoskernels -KOKKOS_DEVICES=Cuda # other devices Cuda,Serial .. +KOKKOS_DEVICES=Cuda KOKKOS_ARCHS=SKX,Volta70 +KOKKOS_CUDA_OPTIONS=enable_lambda CXXFLAGS="-Wall -pedantic -Werror -O3 -g -Wshadow -Wsign-compare -Wtype-limits -Wuninitialized" -../../scripts/generate_makefile.bash --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=${KOKKOSKERNELS_SCALARS} --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --kokkos-path=${KOKKOS_PATH} --with-devices=${KOKKOS_DEVICES} --arch=${KOKKOS_ARCHS} --compiler=${CXX} --with-options=${KOKKOSKERNELS_OPTIONS} --cxxflags="${CXXFLAGS}" +../../cm_generate_makefile.bash --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=${KOKKOSKERNELS_SCALARS} --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --kokkos-path=${KOKKOS_PATH} --with-devices=${KOKKOS_DEVICES} --arch=${KOKKOS_ARCHS} --compiler=${CXX} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} --with-options=${KOKKOSKERNELS_OPTIONS} --cxxflags="${CXXFLAGS}" + +# Call "../../scripts/cm_generate_makefile.bash --help" for options diff --git a/packages/kokkos-kernels/install_test/CMakeLists.txt b/packages/kokkos-kernels/install_test/CMakeLists.txt new file mode 100644 index 000000000000..4be641e87ac9 --- 
/dev/null +++ b/packages/kokkos-kernels/install_test/CMakeLists.txt @@ -0,0 +1,15 @@ +# First copy the CMakeList.txt so we can build the test +configure_file(${PACKAGE_SOURCE_DIR}/install_test/CMakeLists.txt.in ${CMAKE_CURRENT_BINARY_DIR}/source/CMakeLists.txt) + +# Second copy the source files needed to the build area +file(COPY ${PACKAGE_SOURCE_DIR}/perf_test/sparse/KokkosSparse_pcg.hpp DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/source) +file(COPY ${PACKAGE_SOURCE_DIR}/perf_test/sparse/KokkosSparse_block_pcg.cpp DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/source) + +# Third write a configure file that can be invoked to test the library installation +configure_file(${PACKAGE_SOURCE_DIR}/install_test/run_install_test.sh.in ${CMAKE_CURRENT_BINARY_DIR}/run_install_test.sh @ONLY) + +# Fourth create the build directory where the installation of the cg example will take place +file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/build) + +add_test(NAME install_test COMMAND /bin/bash ${CMAKE_CURRENT_BINARY_DIR}/run_install_test.sh) +# KOKKOSKERNELS_ADD_TEST(NAME "install_test" COMMAND "${CMAKE_CURRENT_BINARY_DIR}/run_install_test.sh" COMPONENTS Sparse) diff --git a/packages/kokkos-kernels/install_test/CMakeLists.txt.in b/packages/kokkos-kernels/install_test/CMakeLists.txt.in new file mode 100644 index 000000000000..74605ac73f94 --- /dev/null +++ b/packages/kokkos-kernels/install_test/CMakeLists.txt.in @@ -0,0 +1,31 @@ +cmake_minimum_required(VERSION 3.13) +project(kokkoskernels_install_test CXX) + +include(CTest) + +find_package(KokkosKernels REQUIRED) + +add_executable(kokkoskernels_install_test KokkosSparse_block_pcg.cpp) +target_link_libraries(kokkoskernels_install_test PRIVATE Kokkos::kokkoskernels) + +if(BUILD_TESTING) + + add_test(NAME cg_test_serial COMMAND kokkoskernels_install_test --mtx auto --serial) + + if(KOKKOS_ENABLE_THREADS) + add_test(NAME cg_test_threads COMMAND kokkoskernels_install_test --mtx auto --threads 2) + endif() + + if(KOKKOS_ENABLE_OPENMP) + 
add_test(NAME cg_test_openmp COMMAND kokkoskernels_install_test --mtx auto --openmp 2) + endif() + + if(KOKKOS_ENABLE_CUDA) + add_test(NAME cg_test_cuda COMMAND kokkoskernels_install_test --mtx auto --cuda) + endif() + + if(KOKKOS_ENABLE_HIP) + add_test(NAME cg_test_hip COMMAND kokkoskernels_install_test --mtx auto --hip) + endif() + +endif() diff --git a/packages/kokkos-kernels/install_test/run_install_test.sh.in b/packages/kokkos-kernels/install_test/run_install_test.sh.in new file mode 100755 index 000000000000..a3b0fd6a59af --- /dev/null +++ b/packages/kokkos-kernels/install_test/run_install_test.sh.in @@ -0,0 +1,37 @@ +#!/bin/bash + +KOKKOSKERNELS_INTALL="@CMAKE_BINARY_DIR@" +INSTALL_TEST_SOURCE="@CMAKE_CURRENT_BINARY_DIR@/source" +INSTALL_TEST_BUILD="@CMAKE_CURRENT_BINARY_DIR@/build" + +cd "${INSTALL_TEST_BUILD}" +rm -rf CMake* + +cmake "${INSTALL_TEST_SOURCE}" \ + -D CMAKE_CXX_COMPILER="@CMAKE_CXX_COMPILER@" \ + -D KokkosKernels_ROOT:PATH="@CMAKE_BINARY_DIR@/@CMAKE_INSTALL_LIBDIR@/cmake/KokkosKernels" + +if [ $? -eq 0 ]; then + echo "*** install test: cmake configure SUCCESSFUL ***" +else + echo "*** install test: cmake configure FAILED ***" + exit 1; +fi + +make -j 4 + +if [ $? -eq 0 ]; then + echo "*** install test: build SUCCESSFUL ***" +else + echo "*** install test: build FAILED ***" + exit 1; +fi + +ctest -V -R + +if [ $? -eq 0 ]; then + echo "*** install test: run SUCCESSFUL ***" +else + echo "*** install test: run FAILED ***" + exit 1; +fi diff --git a/packages/kokkos-kernels/perf_test/CMakeLists.txt b/packages/kokkos-kernels/perf_test/CMakeLists.txt index fe3b3c51bab2..08788d648d8d 100644 --- a/packages/kokkos-kernels/perf_test/CMakeLists.txt +++ b/packages/kokkos-kernels/perf_test/CMakeLists.txt @@ -10,6 +10,22 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../test_common) # build correctly with or without MPI, but only run them with a single # MPI process. 
+SET(GTEST_SOURCE_DIR ${PACKAGE_SOURCE_DIR}/tpls/gtest) + +KOKKOSKERNELS_ADD_TEST_LIBRARY( + kokkoskernelsperf_gtest + HEADERS ${GTEST_SOURCE_DIR}/gtest/gtest.h + SOURCES ${GTEST_SOURCE_DIR}/gtest/gtest-all.cc + ) +# Disables pthreads, this is a problem for serial builds in Trilinos & Sierra if it's enabled. +TARGET_COMPILE_DEFINITIONS(kokkoskernelsperf_gtest PUBLIC "-DGTEST_HAS_PTHREAD=0") +TARGET_INCLUDE_DIRECTORIES(kokkoskernelsperf_gtest PUBLIC $) + +#Gtest minimally requires C++11 +TARGET_COMPILE_FEATURES(kokkoskernelsperf_gtest PUBLIC cxx_std_11) + + + ADD_COMPONENT_SUBDIRECTORY(batched) ADD_COMPONENT_SUBDIRECTORY(graph) ADD_COMPONENT_SUBDIRECTORY(sparse) diff --git a/packages/kokkos-kernels/perf_test/batched/README.md b/packages/kokkos-kernels/perf_test/batched/README.md new file mode 100644 index 000000000000..ca5920ae3954 --- /dev/null +++ b/packages/kokkos-kernels/perf_test/batched/README.md @@ -0,0 +1 @@ +Batched BLAS performance tests reside in `perf_test/blas/{blas,blas3}`. 
diff --git a/packages/kokkos-kernels/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp b/packages/kokkos-kernels/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp index 0c0dbe4eac1f..fc7e727123d3 100644 --- a/packages/kokkos-kernels/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp +++ b/packages/kokkos-kernels/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp @@ -608,20 +608,12 @@ namespace KokkosBatched { Kokkos::deep_copy(a, amat); Kokkos::deep_copy(b, bmat); -<<<<<<< HEAD:perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp - Kokkos::fence(); -======= DeviceSpaceType().fence(); ->>>>>>> develop:perf_test/batched/KokkosBatched_Test_Trsm_Cuda.cpp timer.reset(); Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV2", policy, functor_type(a, b)); -<<<<<<< HEAD:perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp - Kokkos::fence(); -======= DeviceSpaceType().fence(); ->>>>>>> develop:perf_test/batched/KokkosBatched_Test_Trsm_Cuda.cpp const double t = timer.seconds(); tmin = std::min(tmin, t); tavg += (iter >= 0)*t; @@ -693,20 +685,12 @@ namespace KokkosBatched { Kokkos::deep_copy(a, amat); Kokkos::deep_copy(b, bmat); -<<<<<<< HEAD:perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp - Kokkos::fence(); -======= DeviceSpaceType().fence(); ->>>>>>> develop:perf_test/batched/KokkosBatched_Test_Trsm_Cuda.cpp timer.reset(); Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV3", policy, functor_type(a, b)); -<<<<<<< HEAD:perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp - Kokkos::fence(); -======= DeviceSpaceType().fence(); ->>>>>>> develop:perf_test/batched/KokkosBatched_Test_Trsm_Cuda.cpp const double t = timer.seconds(); tmin = std::min(tmin, t); tavg += (iter >= 0)*t; diff --git a/packages/kokkos-kernels/perf_test/blas/blas/KokkosBlas_common.hpp b/packages/kokkos-kernels/perf_test/blas/blas/KokkosBlas_common.hpp index a6f9c65d8b67..54e79647bf3a 100644 --- 
a/packages/kokkos-kernels/perf_test/blas/blas/KokkosBlas_common.hpp +++ b/packages/kokkos-kernels/perf_test/blas/blas/KokkosBlas_common.hpp @@ -56,6 +56,7 @@ #define DEFAULT_STEP 3 #define DEFAULT_WARM_UP_N 100 #define DEFAULT_N 100 +#define DEFAULT_K 10 #define DEFAULT_OUT &std::cout #define DEFAULT_BLAS_ROUTINES "trtri," @@ -117,7 +118,7 @@ static std::string test_e_str[TEST_N]{"BLAS", "BATCHED"}; * @var n: Number of columns. */ struct matrix_dim { - int m, n; + int k, m, n; }; typedef struct matrix_dim matrix_dim_t; diff --git a/packages/kokkos-kernels/perf_test/blas/blas/KokkosBlas_perf_test.cpp b/packages/kokkos-kernels/perf_test/blas/blas/KokkosBlas_perf_test.cpp index 46e89d5abb2d..803286f26646 100644 --- a/packages/kokkos-kernels/perf_test/blas/blas/KokkosBlas_perf_test.cpp +++ b/packages/kokkos-kernels/perf_test/blas/blas/KokkosBlas_perf_test.cpp @@ -57,6 +57,7 @@ static struct option long_options[] = { {"matrix_size_step", required_argument, 0, 's'}, {"warm_up_loop", required_argument, 0, 'w'}, {"iter", required_argument, 0, 'i'}, + {"batch_size", required_argument, 0, 'k'}, {"csv", required_argument, 0, 'c'}, {"routines", required_argument, 0, 'r'}, {"trtri_options", required_argument, 0, 'o'}, @@ -135,6 +136,11 @@ static void __print_help_blas_perf_test() { "(default: %d)\n\n", DEFAULT_N); + printf("\t-k, --batch_size=LEN\n"); + printf("\t\tBatch size. Adds third dimension to matrices A and B.\n"); + printf("\t\t\tThe value of LEN as an integer. 
(default: %d)\n", + DEFAULT_K); + printf("\t-c, --csv=/path/to/file.csv\n"); printf("\t\tCsv output file selection.\n"); printf( @@ -166,12 +172,16 @@ int main(int argc, char **argv) { /* set default options */ options.test = DEFAULT_TEST; options.loop = DEFAULT_LOOP; + options.start.a.k = DEFAULT_K; options.start.a.m = DEFAULT_MATRIX_START; options.start.a.n = DEFAULT_MATRIX_START; + options.stop.a.k = DEFAULT_K; options.stop.a.m = DEFAULT_MATRIX_STOP; options.stop.a.n = DEFAULT_MATRIX_STOP; + options.start.b.k = DEFAULT_K; options.start.b.m = DEFAULT_MATRIX_START; options.start.b.n = DEFAULT_MATRIX_START; + options.stop.b.k = DEFAULT_K; options.stop.b.m = DEFAULT_MATRIX_STOP; options.stop.b.n = DEFAULT_MATRIX_STOP; options.step = DEFAULT_STEP; @@ -182,7 +192,7 @@ int main(int argc, char **argv) { options.blas_args.trtri.trtri_args = DEFAULT_TRTRI_ARGS; - while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:c:r:", long_options, + while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:c:r:k:", long_options, &option_idx)) != -1) { switch (ret) { case 'h': __print_help_blas_perf_test(); return 0; @@ -255,6 +265,11 @@ int main(int argc, char **argv) { case 's': options.step = atoi(optarg); break; case 'w': options.warm_up_n = atoi(optarg); break; case 'i': options.n = atoi(optarg); break; + case 'k': + options.start.a.k = options.stop.a.k = + options.start.b.k = options.stop.b.k = + atoi(optarg); + break; case 'c': out_file = optarg; options.out_file = std::string(out_file); diff --git a/packages/kokkos-kernels/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp b/packages/kokkos-kernels/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp index e6b7b825a7ff..d60f15b92bf0 100644 --- a/packages/kokkos-kernels/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp +++ b/packages/kokkos-kernels/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp @@ -78,6 +78,64 @@ void (*do_trtri_invoke[LOOP_N][TEST_N])(options_t) = { /*************************** Test types and defaults 
**************************/ #define DEFAULT_TRTRI_ARGS "UU" + /** + * The KokkosBatched::SerialTrtri implementation performs trmm and scal on subblocks + * of the A matrix. a_m subblocks are selected. + */ +static inline double __trtri_impl_flop_count(double a_m, double a_n) { + double flop_count = 0; + double flops_per_div, flops_per_mul, flops_per_add; + + if (std::is_same::value || + std::is_same::value || + std::is_same::value) { + flops_per_div = 1; + flops_per_mul = 1; + flops_per_add = 1; + } else { + // For complex, we need to count 2 flops for each add and 6 flops for each multiply or divide. + flops_per_div = 6; + flops_per_mul = 6; + flops_per_add = 2; + } + + for (int i = 0; i < a_m; i++) { + flop_count += flops_per_div; // 1 / A[i,j] + flop_count += ((i * (i + 1)) / 2) * (flops_per_mul + flops_per_add); // TRMM FLOPS + flop_count += i * flops_per_mul; // SCAL FLOPS + } + + return flop_count; +} + +// Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf +static inline double __trtri_flop_count(double a_m, double a_n) { + double flops; + double flops_per_mul; + double flops_per_add; + + if (a_m != a_n) { + fprintf(stderr, "%s:%d:ERROR: a_m != a_n.\n", __FILE__, __LINE__); + exit(255); + } + + if (std::is_same::value || + std::is_same::value || + std::is_same::value) { + flops_per_mul = 1; + flops_per_add = 1; + } else { + // For complex, we need to count 2 flops for each add and 6 flops for each multiply. 
+ flops_per_mul = 6; + flops_per_add = 2; + } + + flops = (1./6.*a_n*a_n*a_n + 1./2.*a_n*a_n + 1./3.*a_n) * flops_per_mul + + (1./6.*a_n*a_n*a_n - 1./2.*a_n*a_n + 1./3.*a_n) * flops_per_add; + + return flops; +} + using view_type_3d = Kokkos::View; struct trtri_args { @@ -87,18 +145,25 @@ struct trtri_args { typedef struct trtri_args trtri_args_t; static std::string trtri_csv_header_str = - "algorithm,side-uplo-trans-diag,alpha,loop_type,A_dims,warm_up_n,iter," - "total_time(s),average_time(s)"; + "algorithm,side-uplo-trans-diag,loop_type,A_dims,warm_up_n,iter," + "total_time(s),average_time(s),FLOPS,GFLOP/average_time(s)"; /*************************** Internal helper fns **************************/ static void __trtri_output_csv_row(options_t options, trtri_args_t trtri_args, double time_in_seconds) { + double flops = trtri_args.A.extent(0) * __trtri_flop_count(trtri_args.A.extent(1), trtri_args.A.extent(2)); + double gflops = flops / 1e9; + double average_time = time_in_seconds / options.n; + options.out[0] << test_e_str[options.test] << "," << options.blas_args.trtri.trtri_args << "," - << loop_e_str[options.loop] << "," << trtri_args.A.extent(1) + << loop_e_str[options.loop] << "," << trtri_args.A.extent(0) << "x" << trtri_args.A.extent(1) << "x" << trtri_args.A.extent(2) << "," << options.warm_up_n << "," << options.n << "," << time_in_seconds << "," - << time_in_seconds / options.n << std::endl; + << average_time << "," + << flops << "," + << gflops / average_time + << std::endl; } static void __print_trtri_perf_test_options(options_t options) { @@ -133,19 +198,26 @@ void __do_trtri_serial_blas(options_t options, trtri_args_t trtri_args) { STATUS; - for (uint32_t i = 0; i < warm_up_n; ++i) { - auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < warm_up_n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - 
KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A); + KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A); + } + // Fence after each batch operation + Kokkos::fence(); } timer.reset(); - for (uint32_t i = 0; i < n; ++i) { - auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A); + KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A); + } + // Fence after each batch operation + Kokkos::fence(); } - Kokkos::fence(); __trtri_output_csv_row(options, trtri_args, timer.seconds()); #else std::cerr << std::string(__func__) @@ -164,19 +236,26 @@ void __do_trtri_serial_batched_template(options_t options, Kokkos::Timer timer; using tag = Algo::Trtri::Unblocked; - for (uint32_t i = 0; i < warm_up_n; ++i) { - auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < warm_up_n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - SerialTrtri::invoke(A); + SerialTrtri::invoke(A); + } + // Fence after each batch operation + Kokkos::fence(); } timer.reset(); - for (uint32_t i = 0; i < n; ++i) { - auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - SerialTrtri::invoke(A); + SerialTrtri::invoke(A); + } + // Fence after each batch operation + Kokkos::fence(); } - Kokkos::fence(); __trtri_output_csv_row(options, trtri_args, timer.seconds()); #else std::cerr << std::string(__func__) @@ -241,16 +320,22 @@ void __do_trtri_parallel_blas(options_t options, trtri_args_t trtri_args) { STATUS; - Kokkos::parallel_for("parallelBlasWarmUpLoopTrtri", - 
Kokkos::RangePolicy(0, warm_up_n), - parallel_blas_trtri_functor); - Kokkos::fence(); + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBlasWarmUpLoopTrtri", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_blas_trtri_functor); + // Fence after each batch operation + Kokkos::fence(); + } timer.reset(); - Kokkos::parallel_for("parallelBlasTimedLoopTrtri", - Kokkos::RangePolicy(0, n), - parallel_blas_trtri_functor); - Kokkos::fence(); + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBlasTimedLoopTrtri", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_blas_trtri_functor); + // Fence after each batch operation + Kokkos::fence(); + } __trtri_output_csv_row(options, trtri_args, timer.seconds()); #else std::cerr << std::string(__func__) @@ -287,16 +372,23 @@ void __do_trtri_parallel_batched_template(options_t options, STATUS; - Kokkos::parallel_for("parallelBatchedWarmUpLoopTrtri", - Kokkos::RangePolicy(0, warm_up_n), - parallel_batched_trtri_functor); - Kokkos::fence(); + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBatchedWarmUpLoopTrtri", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_batched_trtri_functor); + // Fence after each batch operation + Kokkos::fence(); + } timer.reset(); - Kokkos::parallel_for("parallelBatchedTimedLoopTrtri", - Kokkos::RangePolicy(0, n), - parallel_batched_trtri_functor); - Kokkos::fence(); + + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBatchedTimedLoopTrtri", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_batched_trtri_functor); + // Fence after each batch operation + Kokkos::fence(); + } __trtri_output_csv_row(options, trtri_args, timer.seconds()); return; @@ -345,7 +437,7 @@ trtri_args_t __do_setup(options_t options, matrix_dims_t dim) { trtri_args.uplo = options.blas_args.trtri.trtri_args.c_str()[0]; trtri_args.diag = options.blas_args.trtri.trtri_args.c_str()[1]; - trtri_args.A = vta("trtri_args.A", 
options.n, dim.a.m, dim.a.n); + trtri_args.A = vta("trtri_args.A", dim.a.k, dim.a.m, dim.a.n); host_A = Kokkos::create_mirror_view(trtri_args.A); Kokkos::fill_random(trtri_args.A, rand_pool, @@ -355,7 +447,7 @@ trtri_args_t __do_setup(options_t options, matrix_dims_t dim) { if (trtri_args.uplo == 'U' || trtri_args.uplo == 'u') { // Make A upper triangular - for (uint32_t k = 0; k < options.n; ++k) { + for (int k = 0; k < dim.a.k; ++k) { auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL()); for (int i = 1; i < dim.a.m; i++) { for (int j = 0; j < i; j++) { @@ -367,7 +459,7 @@ trtri_args_t __do_setup(options_t options, matrix_dims_t dim) { // Make A lower triangular // Kokkos::parallel_for("toLowerLoop", options.n, KOKKOS_LAMBDA (const int& // i) { - for (uint32_t k = 0; k < options.n; ++k) { + for (int k = 0; k < dim.a.k; ++k) { auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL()); for (int i = 0; i < dim.a.m - 1; i++) { for (int j = i + 1; j < dim.a.n; j++) { @@ -378,7 +470,7 @@ trtri_args_t __do_setup(options_t options, matrix_dims_t dim) { } if (trtri_args.diag == 'U' || trtri_args.diag == 'u') { - for (uint32_t k = 0; k < options.n; ++k) { + for (int k = 0; k < dim.a.k; ++k) { auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL()); for (int i = 0; i < min_dim; i++) { A(i, i) = scalar_type(1); @@ -408,8 +500,8 @@ void __do_loop_and_invoke(options_t options, for (cur_dims = options.start; cur_dims.a.m <= options.stop.a.m && cur_dims.a.n <= options.stop.a.n && cur_dims.b.m <= options.stop.b.m && cur_dims.b.n <= options.stop.b.n; - cur_dims.a.m *= options.step, cur_dims.a.n *= options.step, - cur_dims.b.m *= options.step, cur_dims.b.n *= options.step) { + cur_dims.a.m += options.step, cur_dims.a.n += options.step, + cur_dims.b.m += options.step, cur_dims.b.n += options.step) { trtri_args = __do_setup( options, cur_dims); fn(options, trtri_args); diff --git a/packages/kokkos-kernels/perf_test/blas/blas3/CMakeLists.txt 
b/packages/kokkos-kernels/perf_test/blas/blas3/CMakeLists.txt index c1e3a117fa60..73c094387c1f 100644 --- a/packages/kokkos-kernels/perf_test/blas/blas3/CMakeLists.txt +++ b/packages/kokkos-kernels/perf_test/blas/blas3/CMakeLists.txt @@ -4,4 +4,5 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) KOKKOSKERNELS_ADD_EXECUTABLE( KokkosBlas3_perf_test SOURCES KokkosBlas3_perf_test.cpp + TESTONLYLIBS kokkoskernelsperf_gtest ) diff --git a/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_common.hpp b/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_common.hpp index 4952a8e606f1..2103d0d57e2e 100644 --- a/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_common.hpp +++ b/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_common.hpp @@ -61,6 +61,9 @@ #define DEFAULT_BLAS_ROUTINES "trmm,gemm," #define DEFAULT_TEAM_SIZE 1 #define DEFAULT_VECTOR_LEN 1 +#define DEFAULT_USE_AUTO 0 +#define DEFAULT_BATCH_SIZE_LAST_DIM 0 +#define DEFAULT_VERIFY 1 /************************ blas routine structure definitions **********/ struct perf_test_trmm_args { @@ -83,6 +86,7 @@ struct blas_args { // ADD MORE BLAS3 ROUTINES HERE int team_size; int vector_len; + bool use_auto, batch_size_last_dim; // ADD MORE COMMON BLAS3 OPTIONS HERE }; typedef struct blas_args blas_args_t; @@ -116,13 +120,19 @@ static std::string loop_e_str[LOOP_N] = {"serial", "parallel"}; /** * @var BLAS: Run the blas routine through the - * KokkosBlas namespace. + * KokkosBlas namespace. * @var BATCHED_SERIAL{_BLOCKED}: Run the serial blas routine through the * KokkosBatched namespace. + * @var BATCHED_SERIAL_SIMD{_BLOCKED}: Run the serial blas routine through the + * KokkosBatched namespace using SIMD views. + * @var BATCHED_SERIAL_COMPACT_MKL: Run the serial blas mkl routine through + * the KokkosBatched namespace. * @var BATCHED_TEAM{_BLOCKED}: Run the team blas routine through the - * KokkosBatched namespace. + * KokkosBatched namespace. 
* @var BATCHED_TEAM_VECTOR{_BLOCKED}: Run the team vector blas routine through - * the KokkosBatched namespace. + * the KokkosBatched namespace. + * @var BATCHED_TEAM_SIMD{_BLOCKED}: Run the team vector blas routine through + * the KokkosBatched namespace using SIMD views. * @var EXPERIMENT: Run the blas routine as a custom * experiment. */ @@ -130,19 +140,26 @@ typedef enum TEST { BLAS, BATCHED_SERIAL, BATCHED_SERIAL_BLOCKED, + BATCHED_SERIAL_SIMD, + BATCHED_SERIAL_SIMD_BLOCKED, + BATCHED_SERIAL_COMPACT_MKL, BATCHED_TEAM, BATCHED_TEAM_BLOCKED, BATCHED_TEAM_VECTOR, BATCHED_TEAM_VECTOR_BLOCKED, + BATCHED_TEAM_SIMD, + BATCHED_TEAM_SIMD_BLOCKED, // ADD MORE TEST TYPES HERE EXPERIMENT, TEST_N } test_e; static std::string test_e_str[TEST_N]{ - "blas", "batched_serial", "batched_serial_blocked", "batched_team", + "blas", "batched_serial", "batched_serial_blocked", "batched_serial_simd", + "batched_serial_simd_blocked", "batched_serial_compact_mkl", "batched_team", "batched_team_blocked", "batched_team_vector", - "batched_team_vector_blocked", + "batched_team_vector_blocked", "batched_team_simd", + "batched_team_simd_blocked", // ADD MORE TEST TYPES HERE "experiment"}; @@ -176,6 +193,8 @@ typedef struct matrix_dims matrix_dims_t; * @var out_file: The file to write csv data to. Defaults to stdout. * @var blas_args: Arguments for each supported blas routine. * @var blas_routines: Selects which supported blas routines to test. + * @var verify: Performs verification of the blas routine for each input + * before timing it. 
*/ struct perf_test_options { test_e test; @@ -189,6 +208,7 @@ struct perf_test_options { std::string out_file; blas_args_t blas_args; std::string blas_routines; + bool verify; }; typedef struct perf_test_options options_t; diff --git a/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index f26fbb7287e1..b9556d1c46f4 100644 --- a/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -56,6 +56,8 @@ //#include "KokkosBatched_Gemm_Team_Impl.hpp" //#include "KokkosBatched_Gemm_TeamVector_Impl.hpp" #include "KokkosBatched_Util.hpp" +#include "gtest/gtest.h" // EXPECT_NEAR +#include "KokkosKernels_TestUtils.hpp" //#define GEMM_PERF_TEST_DEBUG @@ -70,15 +72,27 @@ void do_gemm_serial_batched_blocked(options_t options); // invocation! void do_gemm_serial_batched_parallel(options_t options); void do_gemm_serial_batched_blocked_parallel(options_t options); +void do_gemm_serial_simd_batched_parallel(options_t options); +void do_gemm_serial_simd_batched_blocked_parallel(options_t options); +void do_gemm_serial_batched_compact_mkl_parallel(options_t options); void do_gemm_team_batched_parallel(options_t options); void do_gemm_team_batched_blocked_parallel(options_t options); void do_gemm_team_vector_batched_parallel(options_t options); void do_gemm_team_vector_batched_blocked_parallel(options_t options); +void do_gemm_team_simd_batched_parallel(options_t options); +void do_gemm_team_simd_batched_blocked_parallel(options_t options); void do_gemm_experiment_parallel(options_t options); struct SerialTag {}; +struct SerialBatchDim3Tag {}; +struct SerialSimdTag {}; +struct SerialSimdBatchDim3Tag {}; struct TeamTag {}; +struct TeamBatchDim3Tag {}; struct TeamVectorTag {}; +struct TeamVectorBatchDim3Tag {}; +struct TeamSimdTag {}; +struct TeamSimdBatchDim4Tag {}; struct 
LayoutLeftTag {}; struct LayoutRightTag {}; struct SimdCpuTag {}; @@ -90,24 +104,52 @@ void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = { do_gemm_serial_batched, do_gemm_serial_batched_blocked, // Serial NULL, NULL, // Team NULL, NULL, // TeamVector + NULL, NULL, // TeamSimd NULL // Serial Experiment }, { - NULL, // BLAS - do_gemm_serial_batched_parallel, - do_gemm_serial_batched_blocked_parallel, // Serial + NULL, // BLAS + do_gemm_serial_batched_parallel, // Serial + do_gemm_serial_batched_blocked_parallel, + do_gemm_serial_simd_batched_parallel, + do_gemm_serial_simd_batched_blocked_parallel, + do_gemm_serial_batched_compact_mkl_parallel, do_gemm_team_batched_parallel, do_gemm_team_batched_blocked_parallel, // Team do_gemm_team_vector_batched_parallel, NULL, // TeamVector + do_gemm_team_simd_batched_parallel, + do_gemm_team_simd_batched_blocked_parallel, // TeamSimd do_gemm_experiment_parallel // Parallel Experiment }}; /*************************** Test types and defaults **************************/ #define DEFAULT_GEMM_ARGS "NN" #define DEFAULT_GEMM_ALPHA 1.0 +#define DEFAULT_GEMM_BETA 1.0 using view_type_3d = Kokkos::View; +using view_type_4d = + Kokkos::View; +using view_type_5d = + Kokkos::View; + +// Construct the vector type +using memory_space = typename default_device::execution_space::memory_space; +constexpr int simd_vector_size = + KokkosBatched::DefaultVectorLength::value; +constexpr int simd_internal_vector_size = + KokkosBatched::DefaultInternalVectorLength::value; +using vector_type = KokkosBatched::Vector, + simd_vector_size>; +using internal_vector_type = + KokkosBatched::Vector, + simd_internal_vector_size>; +using vector_view_type_3d = + Kokkos::View; +using internal_vector_view_type_4d = + Kokkos::View; struct batched_params { int team_size; @@ -115,39 +157,124 @@ struct batched_params { }; typedef struct batched_params batched_params_t; +/** + * @brief struct gemm_simd_args encapsulates the data types required + * for allocating and 
passing a single matrix to the KokkosBatched gemm + * kernels. To invoke gemm on a batch of matrices, three instances of this + * struct are required, one for each matrix, A, B, and C. + * + * @var vec_3d: 3-rank view type used for allocating the underlying data. + * A reference must be kept to this object to ensure the + * data is not free'd by the C++ runtime. + * @var mat_4d: 4-rank view type used for populating the simd view with + random values. + * @var ivec_4d: 4-rank view type used for passing to math kernels. This + * view type is used for leveraging simd instructions on + * both the host and device. + */ +struct gemm_simd_args { + vector_view_type_3d vec_3d; + view_type_4d mat_4d; + internal_vector_view_type_4d ivec_4d; +}; +typedef struct gemm_simd_args gemm_simd_args_t; + +/** + * @brief struct gemm_args are common arguments passed to + * both gemm implementations in the KokkosBlas and KokkosBatched + * namespaces throughout these performance tests. + * + * @var transA: transpose type for A matrix. + * supported types: 'n' - no transpose, 't' - transpose. + * unsupported types: 'c' - conjugate transpose. + * @var transB: transpose type for B matrix. + * supported types: 'n' - no transpose, 't' - transpose. + * unsupported types: 'c' - conjugate transpose. + * @var alpha: scalar applied to A matrix. + * @var beta: scalar applied to B matrix. + * @var A: 3-rank view type used in all non-simd tests. + * @var B: 3-rank view type used in all non-simd tests. + * @var C: 3-rank view type used in all non-simd tests. + * @var bp: team_size and vector_length for tests that use + * Kokkos::TeamPolicy. + * @var Av: 3-rank and 4-rank vector view types for simd tests. + * @var Bv: 3-rank and 4-rank vector view types for simd tests. + * @var Cv: 3-rank and 4-rank vector view types for simd tests. 
+ */ struct gemm_args { char transA, transB; default_scalar alpha; default_scalar beta; view_type_3d A, B, C; batched_params_t bp; + // Below are matrices for simd tests + gemm_simd_args_t Av, Bv, Cv; + matrix_dims_t dims; }; typedef struct gemm_args gemm_args_t; static std::string gemm_csv_header_str = - "algorithm,transAtransB,alpha,beta,team_size,vector_len,loop_type,A_dims,B_" + "algorithm,vector_type,transAtransB,alpha,beta,team_size,vector_len,loop_type,A_dims,B_" "dims,C_dims,warm_up_n," - "iter,total_time(s),average_time(s)"; + "iter,total_time(s),average_time(s),FLOPS,GFLOP/average_time(s)"; /*************************** Internal helper fns **************************/ +// Flop count formula from lapack working note 41: +// http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf +static inline double __gemm_flop_count(double a_m, double a_n, double b_n) { + if (std::is_same::value || + std::is_same::value || + std::is_same::value) + return 2 * a_m * b_n * a_n; + else + // For complex, we need to count 2 flops for each add and 6 flops for each + // multiply. 
+ return (2 + 6) * a_m * b_n * a_n; +} + +static inline std::string __gemm_output_dim_string(options_t options, + matrix_dim_t dim) { + std::string x = "x"; + std::string ret = std::to_string(dim.m) + x + std::to_string(dim.n); + + if (options.blas_args.batch_size_last_dim) + return ret + x + std::to_string(dim.k); + else + return std::to_string(dim.k) + x + ret; +} + static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, double time_in_seconds, const char *experiment_name = nullptr) { std::string algo_name = test_e_str[options.test]; + std::string ts = std::to_string(gemm_args.bp.team_size); + std::string vlen = std::to_string(gemm_args.bp.vector_len); + std::string vtype = internal_vector_type::label(); if (experiment_name) algo_name = std::string(experiment_name); - - options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << "," - << options.blas_args.gemm.alpha << "," - << options.blas_args.gemm.beta << "," << gemm_args.bp.team_size - << "," << gemm_args.bp.vector_len << "," - << loop_e_str[options.loop] << "," << gemm_args.A.extent(0) - << "x" << gemm_args.A.extent(1) << "x" << gemm_args.A.extent(2) - << "," << gemm_args.B.extent(0) << "x" << gemm_args.B.extent(1) - << "x" << gemm_args.B.extent(2) << "," << gemm_args.C.extent(0) - << "x" << gemm_args.C.extent(1) << "x" << gemm_args.C.extent(2) - << "," << options.warm_up_n << "," << options.n << "," - << time_in_seconds << "," << time_in_seconds / options.n - << std::endl; + if (options.blas_args.use_auto) ts = vlen = "Kokkos::AUTO"; + + double flops; + double gflops; + double average_time = time_in_seconds / options.n; + + if (options.verify) return; + + flops = gemm_args.dims.a.k * __gemm_flop_count(gemm_args.dims.a.m, + gemm_args.dims.a.n, + gemm_args.dims.b.n); + + gflops = flops / 1e9; + + options.out[0] << algo_name << "," << vtype << "," << options.blas_args.gemm.gemm_args << "," + << static_cast(options.blas_args.gemm.alpha) << "," + << 
static_cast(options.blas_args.gemm.beta) << "," + << ts << "," << vlen << "," << loop_e_str[options.loop] << "," + << __gemm_output_dim_string(options, gemm_args.dims.a) << "," + << __gemm_output_dim_string(options, gemm_args.dims.b) << "," + << __gemm_output_dim_string(options, gemm_args.dims.c) << "," + << options.warm_up_n << "," << options.n << "," + << time_in_seconds << "," << time_in_seconds / options.n << "," + << flops << "," << gflops / average_time << std::endl; } static void __print_gemm_perf_test_options(options_t options) { @@ -181,21 +308,30 @@ void __do_gemm_serial_blas(options_t options, gemm_args_t gemm_args) { STATUS; - auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args) { + auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args, + bool batch_size_last_dim) { for (uint32_t i = 0; i < n; ++i) { - auto A = Kokkos::subview(_gemm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(_gemm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); - auto C = Kokkos::subview(_gemm_args.C, i, Kokkos::ALL(), Kokkos::ALL()); - - KokkosBlas::gemm(&_gemm_args.transA, &_gemm_args.transB, _gemm_args.alpha, - A, B, _gemm_args.beta, C); + for (int j = 0; j < _gemm_args.dims.c.k; j++) { + auto A = Kokkos::subview(_gemm_args.A, j, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(_gemm_args.B, j, Kokkos::ALL(), Kokkos::ALL()); + auto C = Kokkos::subview(_gemm_args.C, j, Kokkos::ALL(), Kokkos::ALL()); + if (batch_size_last_dim) { + A = Kokkos::subview(_gemm_args.A, Kokkos::ALL(), Kokkos::ALL(), j); + B = Kokkos::subview(_gemm_args.B, Kokkos::ALL(), Kokkos::ALL(), j); + C = Kokkos::subview(_gemm_args.C, Kokkos::ALL(), Kokkos::ALL(), j); + } + + KokkosBlas::gemm(&_gemm_args.transA, &_gemm_args.transB, + _gemm_args.alpha, A, B, _gemm_args.beta, C); + } } }; - __do_loop(options.warm_up_n, gemm_args); + __do_loop(options.warm_up_n, gemm_args, + options.blas_args.batch_size_last_dim); Kokkos::fence(); timer.reset(); - __do_loop(options.n, gemm_args); + 
__do_loop(options.n, gemm_args, options.blas_args.batch_size_last_dim); Kokkos::fence(); __gemm_output_csv_row(options, gemm_args, timer.seconds()); @@ -213,22 +349,31 @@ void __do_gemm_serial_batched_template(options_t options, #if !defined(KOKKOS_ENABLE_CUDA) Kokkos::Timer timer; - auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args) { + auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args, + bool batch_size_last_dim) { for (uint32_t i = 0; i < n; ++i) { - auto A = Kokkos::subview(_gemm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(_gemm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); - auto C = Kokkos::subview(_gemm_args.C, i, Kokkos::ALL(), Kokkos::ALL()); - - SerialGemm::invoke( - _gemm_args.alpha, A, B, _gemm_args.beta, C); + for (int j = 0; j < _gemm_args.dims.c.k; j++) { + auto A = Kokkos::subview(_gemm_args.A, j, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(_gemm_args.B, j, Kokkos::ALL(), Kokkos::ALL()); + auto C = Kokkos::subview(_gemm_args.C, j, Kokkos::ALL(), Kokkos::ALL()); + if (batch_size_last_dim) { + A = Kokkos::subview(_gemm_args.A, Kokkos::ALL(), Kokkos::ALL(), j); + B = Kokkos::subview(_gemm_args.B, Kokkos::ALL(), Kokkos::ALL(), j); + C = Kokkos::subview(_gemm_args.C, Kokkos::ALL(), Kokkos::ALL(), j); + } + + SerialGemm::invoke( + _gemm_args.alpha, A, B, _gemm_args.beta, C); + } } }; - __do_loop(options.warm_up_n, gemm_args); + __do_loop(options.warm_up_n, gemm_args, + options.blas_args.batch_size_last_dim); Kokkos::fence(); timer.reset(); - __do_loop(options.n, gemm_args); + __do_loop(options.n, gemm_args, options.blas_args.batch_size_last_dim); Kokkos::fence(); __gemm_output_csv_row(options, gemm_args, timer.seconds()); #else @@ -240,8 +385,8 @@ void __do_gemm_serial_batched_template(options_t options, template void __do_gemm_serial_batched(options_t options, gemm_args_t gemm_args) { - char a = gemm_args.transA; - char b = gemm_args.transB; + char a = toupper(gemm_args.transA); + char b = 
toupper(gemm_args.transB); using N = Trans::NoTranspose; using T = Trans::Transpose; // using C = Trans::ConjTranspose; @@ -272,58 +417,92 @@ void __do_gemm_serial_batched(options_t options, gemm_args_t gemm_args) { return; } -#if !defined(KOKKOS_ENABLE_CUDA) -template -struct parallel_blas_gemm { +template +struct parallel_batched_gemm_range_policy { gemm_args_t gemm_args_; - parallel_blas_gemm(gemm_args_t gemm_args) : gemm_args_(gemm_args) {} + parallel_batched_gemm_range_policy(gemm_args_t gemm_args) + : gemm_args_(gemm_args) {} KOKKOS_INLINE_FUNCTION - void operator()(const int &i) const { + void operator()(const SerialTag &, const int &i) const { auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL()); auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL()); auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL()); - KokkosBlas::gemm(&gemm_args_.transA, &gemm_args_.transB, gemm_args_.alpha, - svA, svB, gemm_args_.beta, svC); + KokkosBatched::SerialGemm::invoke( + gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); } -}; -#endif // !KOKKOS_ENABLE_CUDA -template -void __do_gemm_parallel_blas(options_t options, gemm_args_t gemm_args) { -#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) - uint32_t warm_up_n = options.warm_up_n; - uint32_t n = options.n; - Kokkos::Timer timer; - using execution_space = typename device_type::execution_space; - using functor_type = parallel_blas_gemm; - functor_type parallel_blas_gemm_functor(gemm_args); + KOKKOS_INLINE_FUNCTION + void operator()(const SerialBatchDim3Tag &, const int &i) const { + auto svA = Kokkos::subview(gemm_args_.A, Kokkos::ALL(), Kokkos::ALL(), i); + auto svB = Kokkos::subview(gemm_args_.B, Kokkos::ALL(), Kokkos::ALL(), i); + auto svC = Kokkos::subview(gemm_args_.C, Kokkos::ALL(), Kokkos::ALL(), i); - STATUS; + KokkosBatched::SerialGemm::invoke( + gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + } - 
Kokkos::parallel_for("parallelBlasWarmUpLoopGemm", - Kokkos::RangePolicy(0, warm_up_n), - parallel_blas_gemm_functor); - Kokkos::fence(); + KOKKOS_INLINE_FUNCTION + void operator()(const SerialSimdTag &, const int &i) const { + auto svA = + Kokkos::subview(gemm_args_.Av.vec_3d, i, Kokkos::ALL(), Kokkos::ALL()); + auto svB = + Kokkos::subview(gemm_args_.Bv.vec_3d, i, Kokkos::ALL(), Kokkos::ALL()); + auto svC = + Kokkos::subview(gemm_args_.Cv.vec_3d, i, Kokkos::ALL(), Kokkos::ALL()); - timer.reset(); - Kokkos::parallel_for("parallelBlasTimedLoopGemm", - Kokkos::RangePolicy(0, n), - parallel_blas_gemm_functor); - Kokkos::fence(); - __gemm_output_csv_row(options, gemm_args, timer.seconds()); -#else - std::cerr << std::string(__func__) - << " disabled since KOKKOS_ENABLE_CUDA is defined." << std::endl; - __gemm_output_csv_row(options, gemm_args, -1); -#endif // !KOKKOS_ENABLE_CUDA - return; -} + KokkosBatched::SerialGemm::invoke( + gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const SerialSimdBatchDim3Tag &, const int &i) const { + auto svA = + Kokkos::subview(gemm_args_.Av.vec_3d, Kokkos::ALL(), Kokkos::ALL(), i); + auto svB = + Kokkos::subview(gemm_args_.Bv.vec_3d, Kokkos::ALL(), Kokkos::ALL(), i); + auto svC = + Kokkos::subview(gemm_args_.Cv.vec_3d, Kokkos::ALL(), Kokkos::ALL(), i); + + KokkosBatched::SerialGemm::invoke( + gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamTag &, const int &i) const { + Kokkos::abort("TeamTag not supported using RangePolicy."); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamBatchDim3Tag &, const int &i) const { + Kokkos::abort("TeamBatchDim3Tag not supported using RangePolicy."); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamVectorTag &, const int &i) const { + Kokkos::abort("TeamVectorTag not supported using RangePolicy."); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const 
TeamVectorBatchDim3Tag &, const int &i) const { + Kokkos::abort("TeamVectorBatchDim3Tag not supported using RangePolicy."); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamSimdTag &, const int &i) const { + Kokkos::abort("TeamSimdTag not supported using RangePolicy."); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamSimdBatchDim4Tag &, const int &i) const { + Kokkos::abort("TeamSimdBatchDim4Tag not supported using RangePolicy."); + } +}; template + class BlockingType, class AlgoMode = void> struct parallel_batched_gemm { gemm_args_t gemm_args_; @@ -340,6 +519,17 @@ struct parallel_batched_gemm { gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); } + KOKKOS_INLINE_FUNCTION + void operator()(const SerialBatchDim3Tag &, const MemberType &member) const { + auto i = member.league_rank(); + auto svA = Kokkos::subview(gemm_args_.A, Kokkos::ALL(), Kokkos::ALL(), i); + auto svB = Kokkos::subview(gemm_args_.B, Kokkos::ALL(), Kokkos::ALL(), i); + auto svC = Kokkos::subview(gemm_args_.C, Kokkos::ALL(), Kokkos::ALL(), i); + + KokkosBatched::SerialGemm::invoke( + gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + } + KOKKOS_INLINE_FUNCTION void operator()(const TeamTag &, const MemberType &member) const { auto i = member.league_rank(); @@ -352,6 +542,18 @@ struct parallel_batched_gemm { svB, gemm_args_.beta, svC); } + KOKKOS_INLINE_FUNCTION + void operator()(const TeamBatchDim3Tag &, const MemberType &member) const { + auto i = member.league_rank(); + auto svA = Kokkos::subview(gemm_args_.A, Kokkos::ALL(), Kokkos::ALL(), i); + auto svB = Kokkos::subview(gemm_args_.B, Kokkos::ALL(), Kokkos::ALL(), i); + auto svC = Kokkos::subview(gemm_args_.C, Kokkos::ALL(), Kokkos::ALL(), i); + + KokkosBatched::TeamGemm::invoke(member, gemm_args_.alpha, svA, + svB, gemm_args_.beta, svC); + } + KOKKOS_INLINE_FUNCTION void operator()(const TeamVectorTag &, const MemberType &member) const { auto team_idx = member.league_rank(); @@ -368,50 +570,202 @@ struct 
parallel_batched_gemm { svB, gemm_args_.beta, svC); } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamVectorBatchDim3Tag &, + const MemberType &member) const { + auto team_idx = member.league_rank(); + auto svA = + Kokkos::subview(gemm_args_.A, Kokkos::ALL(), Kokkos::ALL(), team_idx); + auto svB = + Kokkos::subview(gemm_args_.B, Kokkos::ALL(), Kokkos::ALL(), team_idx); + auto svC = + Kokkos::subview(gemm_args_.C, Kokkos::ALL(), Kokkos::ALL(), team_idx); + + KokkosBatched::TeamVectorGemm::invoke(member, + gemm_args_.alpha, svA, + svB, gemm_args_.beta, + svC); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamSimdTag &, const MemberType &member) const { + auto i = member.league_rank(); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, gemm_args_.Cv.ivec_4d.extent(3)), + [&](const int &vector_lane) { + auto svA = Kokkos::subview(gemm_args_.Av.ivec_4d, i, Kokkos::ALL(), + Kokkos::ALL(), vector_lane); + auto svB = Kokkos::subview(gemm_args_.Bv.ivec_4d, i, Kokkos::ALL(), + Kokkos::ALL(), vector_lane); + auto svC = Kokkos::subview(gemm_args_.Cv.ivec_4d, i, Kokkos::ALL(), + Kokkos::ALL(), vector_lane); + + KokkosBatched::Gemm::invoke(member, gemm_args_.alpha, + svA, svB, gemm_args_.beta, + svC); + }); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamSimdBatchDim4Tag &, + const MemberType &member) const { + auto i = member.league_rank(); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, gemm_args_.Cv.ivec_4d.extent(0)), + [&](const int &vector_lane) { + auto svA = Kokkos::subview(gemm_args_.Av.ivec_4d, vector_lane, + Kokkos::ALL(), Kokkos::ALL(), i); + auto svB = Kokkos::subview(gemm_args_.Bv.ivec_4d, vector_lane, + Kokkos::ALL(), Kokkos::ALL(), i); + auto svC = Kokkos::subview(gemm_args_.Cv.ivec_4d, vector_lane, + Kokkos::ALL(), Kokkos::ALL(), i); + + KokkosBatched::Gemm::invoke(member, gemm_args_.alpha, + svA, svB, gemm_args_.beta, + svC); + }); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const SerialSimdTag &, const 
MemberType &member) const { + Kokkos::abort("SerialSimdTag not supported using RangePolicy."); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const SerialSimdBatchDim3Tag &, + const MemberType &member) const { + Kokkos::abort("SerialSimdBatchDim3Tag not supported using RangePolicy."); + } }; template -void __do_gemm_parallel_batched_template(options_t options, - gemm_args_t gemm_args) { +void __do_gemm_parallel_batched_template_range_policy(options_t options, + gemm_args_t gemm_args) { using execution_space = typename device_type::execution_space; - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; + using policy_type = Kokkos::RangePolicy; using functor_type = - parallel_batched_gemm; + parallel_batched_gemm_range_policy; uint32_t warm_up_n = options.warm_up_n; uint32_t n = options.n; - auto league_size = options.start.c.k; + auto batch_size = options.start.c.k; Kokkos::Timer timer; STATUS; functor_type parallel_batched_gemm_functor(gemm_args); - auto team_size = gemm_args.bp.team_size; - auto vector_len = gemm_args.bp.vector_len; + + if (std::is_same::value || + std::is_same::value) { + batch_size = options.blas_args.batch_size_last_dim + ? 
gemm_args.Cv.vec_3d.extent(2) + : gemm_args.Cv.vec_3d.extent(0); + } for (uint32_t i = 0; i < warm_up_n; i++) { Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", - policy_type(league_size, team_size, vector_len), + policy_type(0, batch_size), parallel_batched_gemm_functor); + Kokkos::fence(); } - Kokkos::fence(); timer.reset(); for (uint32_t i = 0; i < n; i++) { Kokkos::parallel_for("parallelBatchedTimedLoopGemm", - policy_type(league_size, team_size, vector_len), + policy_type(0, batch_size), parallel_batched_gemm_functor); + Kokkos::fence(); } - Kokkos::fence(); __gemm_output_csv_row(options, gemm_args, timer.seconds()); return; } -template +template +void __do_gemm_parallel_batched_template(options_t options, + gemm_args_t gemm_args) { + using execution_space = typename device_type::execution_space; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + using functor_type = + parallel_batched_gemm; + + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto league_size = options.start.c.k; + auto team_size = gemm_args.bp.team_size; + auto vector_len = gemm_args.bp.vector_len; + Kokkos::Timer timer; + + if (std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value) { + return __do_gemm_parallel_batched_template_range_policy< + TransAType, TransBType, BlockingType, AlgoTag, device_type>(options, + gemm_args); + } + + if (std::is_same::value || + std::is_same::value) { + league_size = options.blas_args.batch_size_last_dim + ? gemm_args.Cv.ivec_4d.extent(3) + : gemm_args.Cv.ivec_4d.extent(0); + vector_len = simd_vector_size / + simd_internal_vector_size; // TODO: use bp.vector_len? 
+ } + + STATUS; + + functor_type parallel_batched_gemm_functor(gemm_args); + + if (options.blas_args.use_auto) { + for (uint32_t i = 0; i < warm_up_n; i++) { + Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", + policy_type(league_size, Kokkos::AUTO, Kokkos::AUTO), + parallel_batched_gemm_functor); + Kokkos::fence(); + } + + timer.reset(); + for (uint32_t i = 0; i < n; i++) { + Kokkos::parallel_for("parallelBatchedTimedLoopGemm", + policy_type(league_size, Kokkos::AUTO, Kokkos::AUTO), + parallel_batched_gemm_functor); + Kokkos::fence(); + } + } else { + for (uint32_t i = 0; i < warm_up_n; i++) { + Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", + policy_type(league_size, team_size, vector_len), + parallel_batched_gemm_functor); + Kokkos::fence(); + } + + timer.reset(); + for (uint32_t i = 0; i < n; i++) { + Kokkos::parallel_for("parallelBatchedTimedLoopGemm", + policy_type(league_size, team_size, vector_len), + parallel_batched_gemm_functor); + Kokkos::fence(); + } + } + + __gemm_output_csv_row(options, gemm_args, timer.seconds()); + + return; +} + +template void __do_gemm_parallel_batched(options_t options, gemm_args_t gemm_args) { char a = gemm_args.transA; char b = gemm_args.transB; @@ -423,19 +777,23 @@ void __do_gemm_parallel_batched(options_t options, gemm_args_t gemm_args) { if (a == 'N' && b == 'N') { __do_gemm_parallel_batched_template(options, gemm_args); + device_type, algo_mode>(options, + gemm_args); } else if (a == 'N' && b == 'T') { __do_gemm_parallel_batched_template(options, gemm_args); + device_type, algo_mode>(options, + gemm_args); //} else if (a == 'N' && b == 'C') { // __do_gemm_parallel_batched_template(options, gemm_args); } else if (a == 'T' && b == 'N') { __do_gemm_parallel_batched_template(options, gemm_args); + device_type, algo_mode>(options, + gemm_args); } else if (a == 'T' && b == 'T') { __do_gemm_parallel_batched_template(options, gemm_args); + device_type, algo_mode>(options, + gemm_args); //} else if (a == 'T' && b == 
'C') { // __do_gemm_parallel_batched_template(options, gemm_args); @@ -796,7 +1154,8 @@ void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) { using scalar_type = typename view_type_3d::value_type; constexpr int vl = KokkosBatched::DefaultVectorLength::value; - using simd_type = KokkosBatched::Vector, vl>; + using simd_type = + KokkosBatched::Vector, simd_vector_size>; using simd_view_type = Kokkos::View; using functor_type = @@ -821,12 +1180,12 @@ void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) { // uint64_t seed = Kokkos::Impl::clock_tic(); // Kokkos::Random_XorShift64_Pool rand_pool(seed); // Kokkos::fill_random(A, rand_pool, - // Kokkos::rand, simd_type>::max()); - // Kokkos::fill_random(B, rand_pool, - // Kokkos::rand, simd_type>::max()); - // Kokkos::fill_random(C, rand_pool, - // Kokkos::rand, simd_type>::max()); - // execution_space::fence(); + // Kokkos::rand, + // simd_type>::max()); Kokkos::fill_random(B, rand_pool, + // Kokkos::rand, + // simd_type>::max()); Kokkos::fill_random(C, rand_pool, + // Kokkos::rand, + // simd_type>::max()); execution_space::fence(); functor_type experiment5_functor(A, B, C, gemm_args); @@ -854,9 +1213,410 @@ void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) { return; } +template +class parallel_batched_gemm_experiment6 { + private: + SimdViewType &A, &B, &C; + gemm_args_t gemm_args; + + public: + parallel_batched_gemm_experiment6(SimdViewType &_A, SimdViewType &_B, + SimdViewType &_C, gemm_args_t _gemm_args) + : A(_A), B(_B), C(_C), gemm_args(_gemm_args) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const MemberType &member) const { + auto i = member.league_rank(); + auto svA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + auto svB = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL()); + auto svC = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL()); + + // Uses two serial for-loops internally + 
KokkosBatched::TeamVectorGemm::invoke(member, gemm_args.alpha, + svA, svB, + gemm_args.beta, svC); + } +}; + +template +void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) { +#if 0 + using execution_space = typename device_type::execution_space; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + + // Construct the vector type + using scalar_type = typename view_type_3d::value_type; + constexpr int vl = + KokkosBatched::DefaultVectorLength::value; + constexpr int il = + KokkosBatched::DefaultInternalVectorLength::value; + using view_type = Kokkos::View; + using vector_view_type = Kokkos::View; + using internal_vector_view_type = Kokkos::View; + using functor_type = + parallel_batched_gemm_experiment6; + + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto k = options.start.c.k; + Kokkos::Timer timer; + auto simd_batch_size = k / vl + (k % vl > 0); + STATUS; + + // Construct matrices + vector_view_type A_vector("A_vector", simd_batch_size, gemm_args.A.extent(0), gemm_args.A.extent(1)); + view_type A((scalar_type *)A_vector.data(), simd_batch_size, gemm_args.A.extent(0), gemm_args.A.extent(1)); + internal_vector_view_type A_vector_internal(A_vector.data(), simd_batch_size, gemm_args.A.extent(0), gemm_args.A.extent(1)); + + vector_view_type B_vector("B_vector", simd_batch_size, gemm_args.B.extent(0), gemm_args.B.extent(1)); + view_type B((scalar_type *)B_vector.data(), simd_batch_size, gemm_args.B.extent(0), gemm_args.B.extent(1)); + internal_vector_view_type B_vector_internal(B_vector.data(), simd_batch_size, gemm_args.B.extent(0), gemm_args.B.extent(1)); + + vector_view_type C_vector("C_vector", simd_batch_size, gemm_args.C.extent(0), gemm_args.C.extent(1)); + view_type C((scalar_type *)C_vector.data(), simd_batch_size, gemm_args.C.extent(0), gemm_args.C.extent(1)); + internal_vector_view_type C_vector_internal(C_vector.data(), simd_batch_size, gemm_args.C.extent(0), 
gemm_args.C.extent(1)); + + uint64_t seed = Kokkos::Impl::clock_tic(); + Kokkos::Random_XorShift64_Pool rand_pool(seed); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, scalar_type>::max()); + Kokkos::fill_random(B, rand_pool, Kokkos::rand, scalar_type>::max()); + Kokkos::fill_random(C, rand_pool, Kokkos::rand, scalar_type>::max()); + Kokkos::fence(); + + functor_type experiment6_functor(A_vector_internal, B_vector_internal, C_vector_internal, gemm_args); + + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBatchedUntimedExperiment6Gemm", + policy_type(simd_batch_size, Kokkos::AUTO, vl/il), experiment6_functor); + Kokkos::fence(); + } + + timer.reset(); + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBatchedTimedExperiment6Gemm", + policy_type(simd_batch_size, Kokkos::AUTO, vl/il), experiment6_functor); + Kokkos::fence(); + } + + __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment6"); +#endif + return; +} + +/** + * Check difference of scalars expected and actual at indexes i,j,k + * @var expected: The expected result. + * @var actual: The actual result. + * @var epsilon: The tolerance to use when comparing. + * @return true if the comparison fails and false if the comparison succeeds. + */ +template +static inline bool __gemm_print_compare_failure(ViewType h_expected, + ViewType h_actual, int i, + int j, int k, double epsilon) { + STATUS; + auto diff = static_cast(Kokkos::Experimental::fabs( + static_cast(h_expected(i, j, k) - h_actual(i, j, k)))); + + if (diff > epsilon) { + printf( + "fabs(expected(%d,%d,%d):%g - actual(%d,%d,%d):%g):%g > epsilon:%g\n", + i, j, k, static_cast(h_expected(i, j, k)), i, j, k, + static_cast(h_actual(i, j, k)), diff, epsilon); + FATAL_ERROR("Comparison failure!"); + return true; + } + return false; +} + +/** + * Compare all values of expected with all values of actual. 
+ * @var expected: the expected results + * @var actual: the actual results + * @return false if expected matches actual within epsilon, otherwise true. + */ +template +static inline bool __gemm_do_compare(view_type_3d expected, + view_type_3d actual) { + double epsilon = Test::epsilon::value * 1e3; + STATUS; + + typename view_type_3d::HostMirror h_expected = + Kokkos::create_mirror_view(expected); + typename view_type_3d::HostMirror h_actual = + Kokkos::create_mirror_view(actual); + + // Copy to host for comparision + Kokkos::deep_copy(h_expected, expected); + Kokkos::deep_copy(h_actual, actual); + Kokkos::fence(); + + if (std::is_same::value) { + for (size_t i = 0; i < h_expected.extent(0); i++) { + for (size_t j = 0; j < h_expected.extent(1); j++) { + for (size_t k = 0; k < h_expected.extent(2); k++) { + if (__gemm_print_compare_failure(h_expected, h_actual, i, j, k, epsilon)) + return true; + } + } + } + } + + if (std::is_same::value) { + for (size_t k = 0; k < h_expected.extent(2); k++) { + for (size_t j = 0; j < h_expected.extent(1); j++) { + for (size_t i = 0; i < h_expected.extent(0); i++) { + if (__gemm_print_compare_failure(h_expected, h_actual, i, j, k, epsilon)) + return true; + } + } + } + } + + return false; +} + +template +static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, + dstViewType dst, + options_t options) { + using dst_scalar_type = typename dstViewType::value_type; + using src_scalar_type = typename view_type_5d::value_type; + size_t remainder, vector_batch_size, simd_batch_size, last_batch; + bool data_layout_same_as_3d_view = false; + typename dstViewType::HostMirror h_dst = + Kokkos::create_mirror_view(dst); + typename view_type_4d::HostMirror h_src = + Kokkos::create_mirror_view(src.mat_4d); + Kokkos::deep_copy(h_src, src.mat_4d); + Kokkos::fence(); + + if (options.blas_args.batch_size_last_dim) { + remainder = dst.extent(2) % simd_internal_vector_size; + vector_batch_size = src.ivec_4d.extent(0); + simd_batch_size 
= src.ivec_4d.extent(3); + last_batch = dst.extent(2); + if (std::is_same::value && remainder == 0) + data_layout_same_as_3d_view = true; + + } else { + remainder = dst.extent(0) % simd_internal_vector_size; + vector_batch_size = src.ivec_4d.extent(3); + simd_batch_size = src.ivec_4d.extent(0); + last_batch = dst.extent(0); + if (std::is_same::value && remainder == 0) + data_layout_same_as_3d_view = true; + } + + // When the batch_size is a multiple of the simd_vector_size and the batch_size + // dimension is nearest to the simd_vector_size dimension, each 2-rank matrix + // lies in the correct location and the data can simply be cast to the 3d view. + if (data_layout_same_as_3d_view) { + // We can just re-cast the data to the 3d view but we'll copy it for verification + memcpy(h_dst.data(), h_src.data(), + sizeof(dst_scalar_type) * dst.extent(0) * dst.extent(1) * + dst.extent(2)); + Kokkos::deep_copy(dst, h_dst); + Kokkos::fence(); + return; + } + + // If the remainder is 0, we have simd_vector_size sub-batches to copy out... + // this is a bad data access pattern but for these perf_tests we will support it. + // If the remainder is non-zero, we have simd_vector_size sub-batches + remainder to + // copy out. + remainder += simd_internal_vector_size; + + // Views needed for slow manual copy + using h_view_type_5d = Kokkos::View; + using h_subview_type_2d = Kokkos::View; + using h_subview_type_3d = Kokkos::View; + using h_subview_type_4d = Kokkos::View; + h_view_type_5d h_src_raw; + h_subview_type_4d h_sv0; + h_subview_type_3d h_sv1; + h_subview_type_2d h_sv2; + + // TODO: Clean everything below this point up... 
+ if (std::is_same::value) + h_src_raw = h_view_type_5d((src_scalar_type *)h_src.data(), src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3), simd_internal_vector_size); + else + h_src_raw = h_view_type_5d((src_scalar_type *)h_src.data(), + simd_internal_vector_size, src.ivec_4d.extent(0), + src.ivec_4d.extent(1), src.ivec_4d.extent(2), + src.ivec_4d.extent(3)); + + // The below loops copies each corresponding 2-rank matrix within the simd + // view back to the 3-rank view. + for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; + simd_internal_vec_idx++) { + if (std::is_same::value) + h_sv0 = Kokkos::subview(h_src_raw, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), simd_internal_vec_idx); + else + h_sv0 = Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + + for (size_t vector_batch_idx = 0; + vector_batch_idx < vector_batch_size; vector_batch_idx++) { + if (options.blas_args.batch_size_last_dim) + h_sv1 = Kokkos::subview(h_sv0, vector_batch_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + else + h_sv1 = Kokkos::subview(h_sv0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), vector_batch_idx); + for (size_t simd_batch_size_idx = 0; + simd_batch_size_idx < simd_batch_size; + simd_batch_size_idx++) { + if (options.blas_args.batch_size_last_dim) + h_sv2 = Kokkos::subview(h_sv1, Kokkos::ALL(), Kokkos::ALL(), simd_batch_size_idx); + else + h_sv2 = Kokkos::subview(h_sv1, simd_batch_size_idx, Kokkos::ALL(), Kokkos::ALL()); + for (size_t m = 0; m < src.ivec_4d.extent(1); m++) { + for (size_t n = 0; n < src.ivec_4d.extent(2); n++) { + if (options.blas_args.batch_size_last_dim) + h_dst(m, n, simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx) = h_sv2(m, n); + else + h_dst(simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx, m, n) = h_sv2(m, n); + } + } + if (simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx 
== last_batch - 1) + goto out; + } + } + } +out: + Kokkos::deep_copy(dst, h_dst); + Kokkos::fence(); +} + +/** + * Compare all values of expected with all values of actual. + * @var expected: the expected results + * @var actual: the actual results + * @return false if expected matches actual within epsilon, otherwise true. + */ +template +static inline bool __gemm_do_compare(view_type_3d expected, + gemm_simd_args_t actual, + options_t options) { + decltype(expected) actual_data("actual_data", expected.extent(0), + expected.extent(1), expected.extent(2)); + + STATUS; + + // Copy the simd view to a 3d view for comparision. + // NOTE: The raw results are different when batch_size % simd_vector_size != + // 0. Also note that when batch_size % simd_vector_size != 0, the simd + // operation calculates results that we do not require. So, we end up running + // an extra batch_size % simd_vector_size GEMMs! + __gemm_copy_simd_view_to_3d_view(actual, actual_data, options); + return __gemm_do_compare(expected, actual_data); +} + +template +static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, + void (*fn)(options_t, gemm_args_t)) { + using execution_space = typename DeviceType::execution_space; + // Just create "expected" types using non-simd types. 
+ decltype(gemm_args.C) C_expected; + decltype(gemm_args.A) A_expected; + decltype(gemm_args.B) B_expected; + STATUS; + + if (options.blas_args.batch_size_last_dim) { + C_expected = decltype(C_expected)("C_expected", gemm_args.dims.c.m, + gemm_args.dims.c.n, gemm_args.dims.c.k); + A_expected = decltype(A_expected)("A_expected", gemm_args.dims.a.m, + gemm_args.dims.a.n, gemm_args.dims.a.k); + B_expected = decltype(B_expected)("B_expected", gemm_args.dims.b.m, + gemm_args.dims.b.n, gemm_args.dims.b.k); + } else { + C_expected = decltype(C_expected)("C_expected", gemm_args.dims.c.k, + gemm_args.dims.c.m, gemm_args.dims.c.n); + A_expected = decltype(A_expected)("A_expected", gemm_args.dims.a.k, + gemm_args.dims.a.m, gemm_args.dims.a.n); + B_expected = decltype(B_expected)("B_expected", gemm_args.dims.b.k, + gemm_args.dims.b.m, gemm_args.dims.b.n); + } + + // Initialize "expected" matrices. + if (gemm_args.C.data() != nullptr) { + Kokkos::deep_copy(C_expected, gemm_args.C); + Kokkos::deep_copy(A_expected, gemm_args.A); + Kokkos::deep_copy(B_expected, gemm_args.B); + + Kokkos::fence(); // Ensure that deep_copy has completed + + // Check that initial values match + if (__gemm_do_compare(C_expected, gemm_args.C)) + FATAL_ERROR("Inital values mismatch!"); + } else if (gemm_args.Cv.vec_3d.data() != nullptr) { + __gemm_copy_simd_view_to_3d_view(gemm_args.Cv, + C_expected, options); + __gemm_copy_simd_view_to_3d_view(gemm_args.Av, + A_expected, options); + __gemm_copy_simd_view_to_3d_view(gemm_args.Bv, + B_expected, options); + + // Check that initial values match + if (__gemm_do_compare(C_expected, gemm_args.Cv, + options)) + FATAL_ERROR("Inital values mismatch!"); + } else { + FATAL_ERROR("Input arguments are empty!"); + } + + // Populate "expected" matrices via VanillaGemm + Test::Functor_BatchedVanillaGEMM + vgemm; + vgemm.A_t = toupper(gemm_args.transA) == 'T'; + vgemm.B_t = toupper(gemm_args.transB) == 'T'; + vgemm.A_c = vgemm.B_c = false; + vgemm.batch_size_last_dim = 
options.blas_args.batch_size_last_dim; + vgemm.A = A_expected; + vgemm.B = B_expected; + vgemm.C = C_expected; + vgemm.alpha = gemm_args.alpha; + vgemm.beta = gemm_args.beta; + vgemm.run(); // Compute C_expected + + // Run routine with warm_up_n = 1 and n = 0. + auto warm_up_n_bak = options.warm_up_n; + options.warm_up_n = 1; + auto n_bak = options.n; + options.n = 0; + fn(options, gemm_args); + + Kokkos::fence(); // Redundant fence. + + // Check the result + if (gemm_args.C.data() != nullptr) { + if (__gemm_do_compare(C_expected, gemm_args.C)) + FATAL_ERROR("Result value mismatch!"); + } + + if (gemm_args.Cv.vec_3d.data() != nullptr) { + if (__gemm_do_compare(C_expected, gemm_args.Cv, + options)) + FATAL_ERROR("Result value mismatch!"); + } + + // Run actual timed test. + options.verify = false; // Set verify to false for csv output. + options.warm_up_n = warm_up_n_bak; + options.n = n_bak; + fn(options, gemm_args); + + // Reset verify for next matrix size. + options.verify = true; +} + /*************************** Internal setup fns **************************/ template -gemm_args_t __do_setup(options_t options, matrix_dims_t dim) { +gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { using execution_space = typename device_type::execution_space; gemm_args_t gemm_args; @@ -864,25 +1624,151 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dim) { Kokkos::Random_XorShift64_Pool rand_pool(seed); STATUS; - gemm_args.transA = options.blas_args.gemm.gemm_args.c_str()[0]; - gemm_args.transB = options.blas_args.gemm.gemm_args.c_str()[1]; - gemm_args.A = vta("gemm_args.A", dim.a.k, dim.a.m, dim.a.n); - gemm_args.B = vtb("gemm_args.B", dim.b.k, dim.b.m, dim.b.n); - gemm_args.C = vtc("gemm_args.C", dim.c.k, dim.c.m, dim.c.n); + gemm_args.dims = dims; + gemm_args.transA = options.blas_args.gemm.gemm_args.c_str()[0]; + gemm_args.transB = options.blas_args.gemm.gemm_args.c_str()[1]; + if (options.test == BATCHED_TEAM_SIMD || + options.test == 
BATCHED_TEAM_SIMD_BLOCKED || + options.test == BATCHED_SERIAL_SIMD || + options.test == BATCHED_SERIAL_SIMD_BLOCKED || + options.test == BATCHED_SERIAL_COMPACT_MKL) { + // Calculate the batch size for simd views + auto a_simd_batch_size = + dims.a.k / simd_vector_size + (dims.a.k % simd_vector_size > 0); + auto b_simd_batch_size = + dims.b.k / simd_vector_size + (dims.b.k % simd_vector_size > 0); + auto c_simd_batch_size = + dims.c.k / simd_vector_size + (dims.c.k % simd_vector_size > 0); + + // Reference gemm simd arguments for allocating A, B, and C matrices + gemm_simd_args_t &A = gemm_args.Av, &B = gemm_args.Bv, &C = gemm_args.Cv; + + if (options.blas_args.batch_size_last_dim) { + // Construct simd matrices with batch_size in the last dimension (better + // for LayoutLeft views) + A.vec_3d = vector_view_type_3d("A_vector", dims.a.m, dims.a.n, + a_simd_batch_size); + A.mat_4d = view_type_4d((scalar_type *)A.vec_3d.data(), simd_vector_size, + dims.a.m, dims.a.n, a_simd_batch_size); + A.ivec_4d = internal_vector_view_type_4d( + (internal_vector_type *)A.mat_4d.data(), + simd_vector_size / simd_internal_vector_size, dims.a.m, dims.a.n, + a_simd_batch_size); + + B.vec_3d = vector_view_type_3d("B_vector", dims.b.m, dims.b.n, + b_simd_batch_size); + B.mat_4d = view_type_4d((scalar_type *)B.vec_3d.data(), simd_vector_size, + dims.b.m, dims.b.n, b_simd_batch_size); + B.ivec_4d = internal_vector_view_type_4d( + (internal_vector_type *)B.mat_4d.data(), + simd_vector_size / simd_internal_vector_size, dims.b.m, dims.b.n, + b_simd_batch_size); + + C.vec_3d = vector_view_type_3d("C_vector", dims.c.m, dims.c.n, + c_simd_batch_size); + C.mat_4d = view_type_4d((scalar_type *)C.vec_3d.data(), simd_vector_size, + dims.c.m, dims.c.n, c_simd_batch_size); + C.ivec_4d = internal_vector_view_type_4d( + (internal_vector_type *)C.mat_4d.data(), + simd_vector_size / simd_internal_vector_size, dims.c.m, dims.c.n, + c_simd_batch_size); + + } else { + // Construct simd matrices with 
batch_size in the first dimension (better + // for LayoutRight views) + A.vec_3d = vector_view_type_3d("A_vector", a_simd_batch_size, dims.a.m, + dims.a.n); + A.mat_4d = view_type_4d((scalar_type *)A.vec_3d.data(), a_simd_batch_size, + dims.a.m, dims.a.n, simd_vector_size); + A.ivec_4d = internal_vector_view_type_4d( + (internal_vector_type *)A.mat_4d.data(), a_simd_batch_size, dims.a.m, + dims.a.n, simd_vector_size / simd_internal_vector_size); + + B.vec_3d = vector_view_type_3d("B_vector", b_simd_batch_size, dims.b.m, + dims.b.n); + B.mat_4d = view_type_4d((scalar_type *)B.vec_3d.data(), b_simd_batch_size, + dims.b.m, dims.b.n, simd_vector_size); + B.ivec_4d = internal_vector_view_type_4d( + (internal_vector_type *)B.mat_4d.data(), b_simd_batch_size, dims.b.m, + dims.b.n, simd_vector_size / simd_internal_vector_size); + + C.vec_3d = vector_view_type_3d("C_vector", c_simd_batch_size, dims.c.m, + dims.c.n); + C.mat_4d = view_type_4d((scalar_type *)C.vec_3d.data(), c_simd_batch_size, + dims.c.m, dims.c.n, simd_vector_size); + C.ivec_4d = internal_vector_view_type_4d( + (internal_vector_type *)C.mat_4d.data(), c_simd_batch_size, dims.c.m, + dims.c.n, simd_vector_size / simd_internal_vector_size); + } + + // Use the non-simd 4-rank view type to randomly populate the gemm simd + // arguments + using tmp_view_type_4d = + Kokkos::View; + tmp_view_type_4d tmpA( + "tmpA", gemm_args.Av.mat_4d.extent(0), gemm_args.Av.mat_4d.extent(1), + gemm_args.Av.mat_4d.extent(2), gemm_args.Av.mat_4d.extent(3)); + Kokkos::fill_random(tmpA, rand_pool, + Kokkos::rand, + double>::max()); + tmp_view_type_4d tmpB( + "tmpB", gemm_args.Bv.mat_4d.extent(0), gemm_args.Bv.mat_4d.extent(1), + gemm_args.Bv.mat_4d.extent(2), gemm_args.Bv.mat_4d.extent(3)); + Kokkos::fill_random(tmpB, rand_pool, + Kokkos::rand, + double>::max()); + tmp_view_type_4d tmpC( + "tmpC", gemm_args.Cv.mat_4d.extent(0), gemm_args.Cv.mat_4d.extent(1), + gemm_args.Cv.mat_4d.extent(2), gemm_args.Cv.mat_4d.extent(3)); + 
Kokkos::fill_random(tmpC, rand_pool, + Kokkos::rand, + double>::max()); + Kokkos::fence(); + Kokkos::deep_copy(gemm_args.Av.mat_4d, tmpA); + Kokkos::deep_copy(gemm_args.Bv.mat_4d, tmpB); + Kokkos::deep_copy(gemm_args.Cv.mat_4d, tmpC); + Kokkos::fence(); + } else { + if (options.blas_args.batch_size_last_dim) { + gemm_args.A = vta("gemm_args.A", dims.a.m, dims.a.n, dims.a.k); + gemm_args.B = vtb("gemm_args.B", dims.b.m, dims.b.n, dims.b.k); + gemm_args.C = vtc("gemm_args.C", dims.c.m, dims.c.n, dims.c.k); + } else { + gemm_args.A = vta("gemm_args.A", dims.a.k, dims.a.m, dims.a.n); + gemm_args.B = vtb("gemm_args.B", dims.b.k, dims.b.m, dims.b.n); + gemm_args.C = vtc("gemm_args.C", dims.c.k, dims.c.m, dims.c.n); + } + + using tmp_view_type_3d = + Kokkos::View; + tmp_view_type_3d tmpA("tmpA", gemm_args.A.extent(0), gemm_args.A.extent(1), + gemm_args.A.extent(2)); + Kokkos::fill_random(tmpA, rand_pool, + Kokkos::rand, + double>::max()); + tmp_view_type_3d tmpB("tmpB", gemm_args.B.extent(0), gemm_args.B.extent(1), + gemm_args.B.extent(2)); + Kokkos::fill_random(tmpB, rand_pool, + Kokkos::rand, + double>::max()); + tmp_view_type_3d tmpC("tmpC", gemm_args.C.extent(0), gemm_args.C.extent(1), + gemm_args.C.extent(2)); + Kokkos::fill_random(tmpC, rand_pool, + Kokkos::rand, + double>::max()); + + Kokkos::fence(); + Kokkos::deep_copy(gemm_args.A, tmpA); + Kokkos::deep_copy(gemm_args.B, tmpB); + Kokkos::deep_copy(gemm_args.C, tmpC); + Kokkos::fence(); + } gemm_args.alpha = options.blas_args.gemm.alpha; - gemm_args.alpha = options.blas_args.gemm.beta; + gemm_args.beta = options.blas_args.gemm.beta; gemm_args.bp.team_size = options.blas_args.team_size; gemm_args.bp.vector_len = options.blas_args.vector_len; - Kokkos::fill_random(gemm_args.A, rand_pool, - Kokkos::rand, - scalar_type>::max()); - Kokkos::fill_random(gemm_args.B, rand_pool, - Kokkos::rand, - scalar_type>::max()); - Kokkos::fill_random(gemm_args.C, rand_pool, - Kokkos::rand, - scalar_type>::max()); + Kokkos::fence(); 
// Ensure that fill_random has completed. return gemm_args; } @@ -897,7 +1783,8 @@ void __do_loop_and_invoke(options_t options, __print_gemm_perf_test_options(options); std::cout << "SCALAR:" << typeid(default_scalar).name() << ", LAYOUT:" << typeid(default_layout).name() - << ", DEVICE:" << typeid(default_device).name() << std::endl; + << ", DEVICE:" << typeid(default_device).name() + << ", SPACE:" << typeid(memory_space).name() << std::endl; options.out[0] << gemm_csv_header_str << std::endl; @@ -905,12 +1792,18 @@ void __do_loop_and_invoke(options_t options, cur_dims.a.m <= options.stop.a.m && cur_dims.a.n <= options.stop.a.n && cur_dims.b.m <= options.stop.b.m && cur_dims.b.n <= options.stop.b.n && cur_dims.c.m <= options.stop.c.m && cur_dims.c.n <= options.stop.c.n; - cur_dims.a.m *= options.step, cur_dims.a.n *= options.step, - cur_dims.b.m *= options.step, cur_dims.b.n *= options.step, - cur_dims.c.m *= options.step, cur_dims.c.n *= options.step) { + cur_dims.a.m += options.step, cur_dims.a.n += options.step, + cur_dims.b.m += options.step, cur_dims.b.n += options.step, + cur_dims.c.m += options.step, cur_dims.c.n += options.step) { gemm_args = __do_setup(options, cur_dims); - fn(options, gemm_args); + + if (options.verify) { + __gemm_do_verify( + options, gemm_args, fn); + } else { + fn(options, gemm_args); + } } return; } @@ -944,44 +1837,173 @@ void do_gemm_serial_batched_blocked(options_t options) { void do_gemm_serial_batched_parallel(options_t options) { STATUS; - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); return; } void do_gemm_serial_batched_blocked_parallel(options_t options) { STATUS; - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, + 
__do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +void do_gemm_serial_simd_batched_parallel(options_t options) { + STATUS; + // SerialBatchDim3Tag + // SerialSimdTag + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +void do_gemm_serial_simd_batched_blocked_parallel(options_t options) { + STATUS; + // SerialBatchDim3Tag + // SerialSimdTag + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +void do_gemm_serial_batched_compact_mkl_parallel(options_t options) { + STATUS; +#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && \ + defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ + defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); +#else +#if !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) + std::cerr + << std::string(__func__) + << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL__ is undefined." + << std::endl; +#elif !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) + std::cerr << std::string(__func__) + << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__ is " + "undefined." + << std::endl; +#elif !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) + std::cerr + << std::string(__func__) + << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__ " + "is undefined." 
+ << std::endl; +#endif +#endif return; } void do_gemm_team_batched_parallel(options_t options) { STATUS; - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); return; } void do_gemm_team_batched_blocked_parallel(options_t options) { STATUS; - __do_loop_and_invoke( - options, - __do_gemm_parallel_batched); + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); return; } void do_gemm_team_vector_batched_parallel(options_t options) { STATUS; - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); return; } +void do_gemm_team_simd_batched_parallel(options_t options) { + STATUS; + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +void do_gemm_team_simd_batched_blocked_parallel(options_t options) { + STATUS; + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +// Blocked algo not yet implemented for TeamVectorGemm. 
/* void do_gemm_team_vector_batched_blocked_parallel(options_t options) { STATUS; __do_loop_and_invoke( @@ -1010,6 +2032,9 @@ void do_gemm_experiment_parallel(options_t options) { __do_loop_and_invoke( options, __do_gemm_parallel_experiment5); + __do_loop_and_invoke( + options, __do_gemm_parallel_experiment6); } #endif // KOKKOSBLAS3_GEMM_PERF_TEST_H_ diff --git a/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp index b493c244d884..149cc00fd18b 100644 --- a/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp +++ b/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp @@ -63,17 +63,19 @@ static struct option long_options[] = { {"trmm_options", required_argument, 0, 'o'}, {"trmm_alpha", required_argument, 0, 'a'}, {"gemm_options", required_argument, 0, 'g'}, - {"gemm_alpha", required_argument, 0, 'p'}, + {"gemm_scalars", required_argument, 0, 'p'}, {"team_size", required_argument, 0, 'z'}, {"vector_len", required_argument, 0, 'n'}, {"batch_size", required_argument, 0, 'k'}, + {"batch_size_last_dim", required_argument, 0, 'd'}, + {"verify", required_argument, 0, 'v'}, {0, 0, 0, 0}}; static void __print_help_blas3_perf_test() { printf("Options:\n"); printf("\t-h, --help\n"); - printf("\t\tPrint this help menu.\n\n"); + printf("\t\tPrint this help menu.\n"); printf("\t-t, --test=OPTION\n"); printf("\t\tAlgorithm selection.\n"); @@ -104,10 +106,12 @@ static void __print_help_blas3_perf_test() { "%s)\n", DEFAULT_GEMM_ARGS); - printf("\t-p, --gemm_alpha=SCALAR_VALUE\n"); - printf("\t\tGEMM alpha value.\n"); - printf("\t\t\tThe value of alpha in floating point. (default: %lf)\n", - DEFAULT_GEMM_ALPHA); + printf("\t-p, --gemm_scalars=ALPHA_SCALAR_VALUE,BETA_SCALAR_VALUE\n"); + printf("\t\tGEMM alpha and beta values.\n"); + printf( + "\t\t\tThe value of alpha and beta in floating point. 
(default: " + "%lf,%lf)\n", + DEFAULT_GEMM_ALPHA, DEFAULT_GEMM_BETA); printf("\t-z, --team_size=SIZE\n"); printf("\t\tKokkos team size.\n"); @@ -119,10 +123,27 @@ static void __print_help_blas3_perf_test() { printf("\t\t\tThe value of LEN as an integer. (default: %d)\n", DEFAULT_VECTOR_LEN); + printf("\t-u, --use_auto=AUTO\n"); + printf( + "\t\tWhether to use Kokkos::AUTO for vector_len and team_size " + "(Heirarchical parallelism).\n"); + printf( + "\t\t\tValid values for AUTO are 1 to use Kokkos::AUTO and 0 to use " + "--vector_len and --team_size " + "instead. (default: %d)\n", + DEFAULT_USE_AUTO); + printf("\t-k, --batch_size=LEN\n"); printf("\t\tBatch size. Adds third dimension to matrices A, B, and C.\n"); - printf("\t\t\tThe value of LEN as an integer. (default: %d)\n", - DEFAULT_VECTOR_LEN); + printf("\t\t\tThe value of LEN as an integer. (default: %d)\n", DEFAULT_K); + + printf("\t-d, --batch_size_last_dim=LAST_DIM\n"); + printf("\t\tHow to allocate the batch_size in the matrices.\n"); + printf( + "\t\t\tValid values for LAST_DIM are 1 make the batch_size the last " + "dimension and 0 to make the batch_size " + "the first dimension (default: %d)\n", + DEFAULT_BATCH_SIZE_LAST_DIM); printf("\t-l, --loop_type=OPTION\n"); printf("\t\tLoop selection.\n"); @@ -134,7 +155,7 @@ static void __print_help_blas3_perf_test() { printf("%c[1m", 27); printf("\t\t\t\tparallel:"); printf("%c[0m", 27); - printf(" invoke blas routine in a Kokkos::parallel_for-loop.\n\n"); + printf(" invoke blas routine in a Kokkos::parallel_for-loop.\n"); printf("\t-b, --matrix_size_start=MxN,IxJ,PxQ\n"); printf( @@ -142,7 +163,7 @@ static void __print_help_blas3_perf_test() { "(start)\n"); printf( "\t\t\tValid values for M and N are any non-negative 32-bit integers. 
" - "(default: %dx%d,%dx%d,%dx%d)\n\n", + "(default: %dx%d,%dx%d,%dx%d)\n", DEFAULT_MATRIX_START, DEFAULT_MATRIX_START, DEFAULT_MATRIX_START, DEFAULT_MATRIX_START, DEFAULT_MATRIX_START, DEFAULT_MATRIX_START); @@ -152,7 +173,7 @@ static void __print_help_blas3_perf_test() { "(stop)\n"); printf( "\t\t\tValid dimension values are any non-negative 32-bit integers. " - "(default: %dx%d,%dx%d,%dx%d)\n\n", + "(default: %dx%d,%dx%d,%dx%d)\n", DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP); @@ -160,35 +181,43 @@ static void __print_help_blas3_perf_test() { printf("\t\tMatrix step selection.\n"); printf( "\t\t\tValid value for K is any non-negative 32-bit integer. (default: " - "%d)\n\n", + "%d)\n", DEFAULT_STEP); printf("\t-w, --warm_up_loop=LOOP\n"); printf("\t\tWarm up loop selection. (untimed)\n"); printf( "\t\t\tValid value for LOOP is any non-negative 32-bit integer that's <= " - "ITER. (default: %d)\n\n", + "ITER. (default: %d)\n", DEFAULT_WARM_UP_N); printf("\t-i, --iter=ITER\n"); printf("\t\tIteration selection. (timed)\n"); printf( "\t\t\tValid value for ITER is any non-negative 32-bit integer. " - "(default: %d)\n\n", + "(default: %d)\n", DEFAULT_N); printf("\t-c, --csv=/path/to/file.csv\n"); printf("\t\tCsv output file selection.\n"); printf( "\t\t\tValid value for /path/to/file.csv is any valid file name. " - "(default: stdout)\n\n"); + "(default: stdout)\n"); printf("\t-r, --routines=ROUTINES\n"); printf("\t\tRoutine selection.\n"); printf( "\t\t\tValid value for ROUTINES is one of more valid blas3 routines " - "delimited by a comma. (default: %s)\n\n", + "delimited by a comma. (default: %s)\n", DEFAULT_BLAS_ROUTINES); + + printf("\t-v, --verify=VERIFY\n"); + printf("\t\tVerification selection. (untimed)\n"); + printf( + "\t\t\tValid values for VERIFY are either 0 to skip verification or 1 to " + "verify before timing. 
" + "(default: %d)\n", + DEFAULT_VERIFY); } static void __blas3_perf_test_input_error(char **argv, char short_opt, @@ -211,42 +240,47 @@ int main(int argc, char **argv) { }; /* set default options */ - options.test = DEFAULT_TEST; - options.loop = DEFAULT_LOOP; - options.start.a.k = DEFAULT_K; - options.start.a.m = DEFAULT_MATRIX_START; - options.start.a.n = DEFAULT_MATRIX_START; - options.stop.a.k = DEFAULT_K; - options.stop.a.m = DEFAULT_MATRIX_STOP; - options.stop.a.n = DEFAULT_MATRIX_STOP; - options.start.b.k = DEFAULT_K; - options.start.b.m = DEFAULT_MATRIX_START; - options.start.b.n = DEFAULT_MATRIX_START; - options.stop.b.k = DEFAULT_K; - options.stop.b.m = DEFAULT_MATRIX_STOP; - options.stop.b.n = DEFAULT_MATRIX_STOP; - options.start.c.k = DEFAULT_K; - options.start.c.m = DEFAULT_MATRIX_START; - options.start.c.n = DEFAULT_MATRIX_START; - options.stop.c.k = DEFAULT_K; - options.stop.c.m = DEFAULT_MATRIX_STOP; - options.stop.c.n = DEFAULT_MATRIX_STOP; - options.step = DEFAULT_STEP; - options.warm_up_n = DEFAULT_WARM_UP_N; - options.n = DEFAULT_N; - options.out = DEFAULT_OUT; - options.blas_routines = std::string(DEFAULT_BLAS_ROUTINES); - options.blas_args.team_size = DEFAULT_TEAM_SIZE; - options.blas_args.vector_len = DEFAULT_VECTOR_LEN; + options.test = DEFAULT_TEST; + options.loop = DEFAULT_LOOP; + options.start.a.k = DEFAULT_K; + options.start.a.m = DEFAULT_MATRIX_START; + options.start.a.n = DEFAULT_MATRIX_START; + options.stop.a.k = DEFAULT_K; + options.stop.a.m = DEFAULT_MATRIX_STOP; + options.stop.a.n = DEFAULT_MATRIX_STOP; + options.start.b.k = DEFAULT_K; + options.start.b.m = DEFAULT_MATRIX_START; + options.start.b.n = DEFAULT_MATRIX_START; + options.stop.b.k = DEFAULT_K; + options.stop.b.m = DEFAULT_MATRIX_STOP; + options.stop.b.n = DEFAULT_MATRIX_STOP; + options.start.c.k = DEFAULT_K; + options.start.c.m = DEFAULT_MATRIX_START; + options.start.c.n = DEFAULT_MATRIX_START; + options.stop.c.k = DEFAULT_K; + options.stop.c.m = DEFAULT_MATRIX_STOP; + 
options.stop.c.n = DEFAULT_MATRIX_STOP; + options.step = DEFAULT_STEP; + options.warm_up_n = DEFAULT_WARM_UP_N; + options.n = DEFAULT_N; + options.out = DEFAULT_OUT; + options.blas_routines = std::string(DEFAULT_BLAS_ROUTINES); + options.blas_args.team_size = DEFAULT_TEAM_SIZE; + options.blas_args.vector_len = DEFAULT_VECTOR_LEN; + options.blas_args.use_auto = DEFAULT_USE_AUTO; + options.blas_args.batch_size_last_dim = DEFAULT_BATCH_SIZE_LAST_DIM; + options.verify = DEFAULT_VERIFY; options.blas_args.trmm.trmm_args = DEFAULT_TRMM_ARGS; options.blas_args.trmm.alpha = DEFAULT_TRMM_ALPHA; options.blas_args.gemm.gemm_args = DEFAULT_GEMM_ARGS; options.blas_args.gemm.alpha = DEFAULT_GEMM_ALPHA; + options.blas_args.gemm.beta = DEFAULT_GEMM_BETA; - while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:", - long_options, &option_idx)) != -1) { + while ( + (ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:p:d:v:", + long_options, &option_idx)) != -1) { switch (ret) { case 'h': __print_help_blas3_perf_test(); return 0; case 't': @@ -269,14 +303,19 @@ int main(int argc, char **argv) { break; case 'g': // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4)); - if (strlen(optarg) != 3) { + if (strlen(optarg) != 2) { __blas3_perf_test_input_error(argv, ret, optarg); } options.blas_args.gemm.gemm_args = optarg; break; case 'p': // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4)); - options.blas_args.gemm.alpha = (default_scalar)atof(optarg); + double alpha, beta; + if (sscanf(optarg, "%lf,%lf", &alpha, &beta) != 2) + __blas3_perf_test_input_error(argv, ret, optarg); + + options.blas_args.gemm.alpha = static_cast(alpha); + options.blas_args.gemm.beta = static_cast(beta); break; case 'a': // printf("optarg=%s. 
%d\n", optarg, strncasecmp(optarg, "blas", 4)); @@ -361,8 +400,11 @@ int main(int argc, char **argv) { options.stop.a.k = options.stop.b.k = options.stop.c.k = atoi(optarg); break; + case 'd': options.blas_args.batch_size_last_dim = atoi(optarg); break; + case 'v': options.verify = atoi(optarg); break; case 'z': options.blas_args.team_size = atoi(optarg); break; case 'n': options.blas_args.vector_len = atoi(optarg); break; + case 'u': options.blas_args.use_auto = atoi(optarg); break; case 'c': out_file = optarg; options.out_file = std::string(out_file); diff --git a/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index 70f7664679df..de2bbd9ce9ae 100644 --- a/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -72,6 +72,62 @@ void (*do_trmm_invoke[LOOP_N][TEST_N])(options_t) = { #define DEFAULT_TRMM_ARGS "LUNU" #define DEFAULT_TRMM_ALPHA 1.0 +/** + * The KokkosBatched::SerialTrmm implementation performs dot products on + * non-zero elements of the triangular matrices. The flop calculation below + * assumes KokkosBatched::SerialTrmm is being used. Since the dot products + * do a multiply and add we can calculate the flops for any element in the last + * column of the LHS to be 2*columns_LHS, any element in the last-1 column of + * the LHS to be 2*(columns_LHS-1), and so on. 
We do this for every row of the + * LHS giving us this flop count: flops = columns_LHS * (columns_LHS + 1) flops + * = (flops / 2) * 2 flops = flops * rows_LHS + */ +static inline int __trmm_impl_flop_count(char side, int b_m, int b_n, int a_m, + int a_n) { + int flops; + + if (side == 'L' || side == 'l') { + flops = (b_m * (b_m + 1)) * b_n; + } else { + flops = (b_n * (b_n + 1)) * b_m; + } + + if (std::is_same::value || + std::is_same::value || + std::is_same::value) + return flops; + + // Account for 6 additional flops when complex numbers are used. + // Above we have counted 1 flop for each add and 1 flop for each multiply. + // For complex, we need to count 2 flops for each add and 6 flops for each + // multiply. + return flops * 4; +} + +// Flop count formula from lapack working note 41: +// http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf +static inline double __trmm_flop_count(char side, double b_m, double b_n, + double a_m, double a_n) { + double flops; + + if (side == 'L' || side == 'l') { + flops = b_m * b_m * b_n; + } else { + flops = b_n * b_n * b_m; + } + + if (std::is_same::value || + std::is_same::value || + std::is_same::value) + return flops; + + // Account for 6 additional flops when complex numbers are used. + // Above we have counted 1 flop for each add and 1 flop for each multiply. + // For complex, we need to count 2 flops for each add and 6 flops for each + // multiply. 
+ return flops * 4; +} + using view_type_3d = Kokkos::View; struct trmm_args { @@ -83,19 +139,54 @@ typedef struct trmm_args trmm_args_t; static std::string trmm_csv_header_str = "algorithm,side-uplo-trans-diag,alpha,loop_type,A_dims,B_dims,warm_up_n," - "iter,total_time(s),average_time(s)"; + "iter,total_time(s),average_time(s),FLOPS,GFLOP/" + "average_time(s),min_achieved_bandwidth(GB/s),max_achieved_bandwidth(GB/s)"; /*************************** Internal helper fns **************************/ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, double time_in_seconds) { + double flops = trmm_args.A.extent(0) * + __trmm_flop_count(trmm_args.side, trmm_args.B.extent(1), + trmm_args.B.extent(2), trmm_args.A.extent(1), + trmm_args.A.extent(2)); + double gflops = flops / 1e9; + double average_time = time_in_seconds / options.n; + double gbytes_in_matrix = (trmm_args.B.extent(0) * trmm_args.B.extent(1) * + trmm_args.B.extent(2) * sizeof(default_scalar)) / + 1e9; + double min_memory_transactions, max_memory_transactions; + + // Assuming infinite cache size + // We have to read A and B into the cache once and then write + // B back out to main memory once. + min_memory_transactions = 3; + + // Assuming no register or real caching + // We have to go out to memory for every element we read from A and B as well + // as every element we write to B. We use the trmm flops from lapack note 41 + // and multiple by 3/2 to account for the write to B since this flop count is + // for one multiply and one add. + if (trmm_args.side == 'l' || trmm_args.side == 'L') + max_memory_transactions = trmm_args.B.extent(1) * trmm_args.B.extent(1) * + trmm_args.B.extent(2) * (3. / 2.); + else + max_memory_transactions = trmm_args.B.extent(2) * trmm_args.B.extent(2) * + trmm_args.B.extent(1) * (3. 
/ 2.); + options.out[0] << test_e_str[options.test] << "," << options.blas_args.trmm.trmm_args << "," - << options.blas_args.trmm.alpha << "," - << loop_e_str[options.loop] << "," << trmm_args.A.extent(1) - << "x" << trmm_args.A.extent(2) << "," << trmm_args.B.extent(1) + << static_cast(options.blas_args.trmm.alpha) << "," + << loop_e_str[options.loop] << "," << trmm_args.A.extent(0) + << "x" << trmm_args.A.extent(1) << "x" << trmm_args.A.extent(2) + << "," << trmm_args.B.extent(0) << "x" << trmm_args.B.extent(1) << "x" << trmm_args.B.extent(2) << "," << options.warm_up_n << "," << options.n << "," << time_in_seconds << "," - << time_in_seconds / options.n << std::endl; + << average_time << "," << flops << "," << gflops / average_time + << "," + << (gbytes_in_matrix * min_memory_transactions) / average_time + << "," + << (gbytes_in_matrix * max_memory_transactions) / average_time + << std::endl; } static void __print_trmm_perf_test_options(options_t options) { @@ -131,24 +222,30 @@ void __do_trmm_serial_blas(options_t options, trmm_args_t trmm_args) { STATUS; - for (uint32_t i = 0; i < warm_up_n; ++i) { - auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < warm_up_n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); - KokkosBlas::trmm(&trmm_args.side, &trmm_args.uplo, &trmm_args.trans, - &trmm_args.diag, trmm_args.alpha, A, B); + KokkosBlas::trmm(&trmm_args.side, &trmm_args.uplo, &trmm_args.trans, + &trmm_args.diag, trmm_args.alpha, A, B); + } + // Fence after submitting each batch operation + Kokkos::fence(); } - Kokkos::fence(); timer.reset(); - for (uint32_t i = 0; i < n; ++i) { - auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(trmm_args.B, i, 
Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); - KokkosBlas::trmm(&trmm_args.side, &trmm_args.uplo, &trmm_args.trans, - &trmm_args.diag, trmm_args.alpha, A, B); + KokkosBlas::trmm(&trmm_args.side, &trmm_args.uplo, &trmm_args.trans, + &trmm_args.diag, trmm_args.alpha, A, B); + } + // Fence after submitting each batch operation + Kokkos::fence(); } - Kokkos::fence(); __trmm_output_csv_row(options, trmm_args, timer.seconds()); #else std::cerr << std::string(__func__) @@ -167,21 +264,28 @@ void __do_trmm_serial_batched_template(options_t options, Kokkos::Timer timer; using tag = Algo::Trmm::Unblocked; - for (uint32_t i = 0; i < warm_up_n; ++i) { - auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < warm_up_n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); - SerialTrmm::invoke(trmm_args.alpha, A, B); + SerialTrmm::invoke(trmm_args.alpha, A, B); + } + // Fence after submitting each batch operation + Kokkos::fence(); } timer.reset(); - for (uint32_t i = 0; i < n; ++i) { - auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); - SerialTrmm::invoke(trmm_args.alpha, A, B); + SerialTrmm::invoke(trmm_args.alpha, A, B); + } + // Fence after submitting each batch operation + 
Kokkos::fence(); } - Kokkos::fence(); __trmm_output_csv_row(options, trmm_args, timer.seconds()); #else std::cerr << std::string(__func__) @@ -306,6 +410,7 @@ struct parallel_blas_trmm { template void __do_trmm_parallel_blas(options_t options, trmm_args_t trmm_args) { +// TODO: Note why this is disabled on CUDA and HIP #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) uint32_t warm_up_n = options.warm_up_n; uint32_t n = options.n; @@ -316,16 +421,24 @@ void __do_trmm_parallel_blas(options_t options, trmm_args_t trmm_args) { STATUS; - Kokkos::parallel_for("parallelBlasWarmUpLoopTrmm", - Kokkos::RangePolicy(0, warm_up_n), - parallel_blas_trmm_functor); - Kokkos::fence(); + for (uint32_t j = 0; j < warm_up_n; ++j) { + Kokkos::parallel_for( + "parallelBlasWarmUpLoopTrmm", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_blas_trmm_functor); + // Fence after each batch operation + Kokkos::fence(); + } timer.reset(); - Kokkos::parallel_for("parallelBlasTimedLoopTrmm", - Kokkos::RangePolicy(0, n), - parallel_blas_trmm_functor); - Kokkos::fence(); + for (uint32_t j = 0; j < n; ++j) { + Kokkos::parallel_for( + "parallelBlasTimedLoopTrmm", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_blas_trmm_functor); + // Fence after each batch operation + Kokkos::fence(); + } __trmm_output_csv_row(options, trmm_args, timer.seconds()); #else std::cerr << std::string(__func__) @@ -368,16 +481,24 @@ void __do_trmm_parallel_batched_template(options_t options, STATUS; - Kokkos::parallel_for("parallelBatchedWarmUpLoopTrmm", - Kokkos::RangePolicy(0, warm_up_n), - parallel_batched_trmm_functor); - Kokkos::fence(); + for (uint32_t j = 0; j < warm_up_n; ++j) { + Kokkos::parallel_for( + "parallelBatchedWarmUpLoopTrmm", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_batched_trmm_functor); + // Fence after each batch operation + Kokkos::fence(); + } timer.reset(); - Kokkos::parallel_for("parallelBatchedTimedLoopTrmm", - Kokkos::RangePolicy(0, n), - 
parallel_batched_trmm_functor); - Kokkos::fence(); + for (uint32_t j = 0; j < n; ++j) { + Kokkos::parallel_for( + "parallelBatchedTimedLoopTrmm", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_batched_trmm_functor); + // Fence after each batch operation + Kokkos::fence(); + } __trmm_output_csv_row(options, trmm_args, timer.seconds()); return; @@ -498,19 +619,24 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { trmm_args.uplo = options.blas_args.trmm.trmm_args.c_str()[1]; trmm_args.trans = options.blas_args.trmm.trmm_args.c_str()[2]; trmm_args.diag = options.blas_args.trmm.trmm_args.c_str()[3]; - trmm_args.A = vta("trmm_args.A", options.n, dim.a.m, dim.a.n); - trmm_args.B = vtb("trmm_args.B", options.n, dim.b.m, dim.b.n); + trmm_args.A = vta("trmm_args.A", dim.a.k, dim.a.m, dim.a.n); + trmm_args.B = vtb("trmm_args.B", dim.b.k, dim.b.m, dim.b.n); trmm_args.alpha = options.blas_args.trmm.alpha; host_A = Kokkos::create_mirror_view(trmm_args.A); - Kokkos::fill_random(trmm_args.A, rand_pool, - Kokkos::rand, - scalar_type>::max()); - Kokkos::deep_copy(host_A, trmm_args.A); + { + Kokkos::View tmp( + "tmp", trmm_args.A.extent(0), trmm_args.A.extent(1), + trmm_args.A.extent(2)); + Kokkos::fill_random(tmp, rand_pool, + Kokkos::rand, + double>::max()); + Kokkos::deep_copy(host_A, tmp); + } if (trmm_args.uplo == 'U' || trmm_args.uplo == 'u') { // Make A upper triangular - for (uint32_t k = 0; k < options.n; ++k) { + for (int k = 0; k < dim.a.k; ++k) { auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL()); for (int i = 1; i < dim.a.m; i++) { for (int j = 0; j < i; j++) { @@ -522,7 +648,7 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { // Make A lower triangular // Kokkos::parallel_for("toLowerLoop", options.n, KOKKOS_LAMBDA (const int& // i) { - for (uint32_t k = 0; k < options.n; ++k) { + for (int k = 0; k < dim.a.k; ++k) { auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL()); for (int i = 0; i < dim.a.m - 1; 
i++) { for (int j = i + 1; j < dim.a.n; j++) { @@ -533,7 +659,7 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { } if (trmm_args.diag == 'U' || trmm_args.diag == 'u') { - for (uint32_t k = 0; k < options.n; ++k) { + for (int k = 0; k < dim.a.k; ++k) { auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL()); for (int i = 0; i < min_dim; i++) { A(i, i) = scalar_type(1); @@ -542,9 +668,15 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { } Kokkos::deep_copy(trmm_args.A, host_A); - Kokkos::fill_random(trmm_args.B, rand_pool, - Kokkos::rand, - scalar_type>::max()); + { + Kokkos::View tmp( + "tmp", trmm_args.B.extent(0), trmm_args.B.extent(1), + trmm_args.B.extent(2)); + Kokkos::fill_random(tmp, rand_pool, + Kokkos::rand, + double>::max()); + Kokkos::deep_copy(trmm_args.B, tmp); + } return trmm_args; } @@ -566,8 +698,8 @@ void __do_loop_and_invoke(options_t options, for (cur_dims = options.start; cur_dims.a.m <= options.stop.a.m && cur_dims.a.n <= options.stop.a.n && cur_dims.b.m <= options.stop.b.m && cur_dims.b.n <= options.stop.b.n; - cur_dims.a.m *= options.step, cur_dims.a.n *= options.step, - cur_dims.b.m *= options.step, cur_dims.b.n *= options.step) { + cur_dims.a.m += options.step, cur_dims.a.n += options.step, + cur_dims.b.m += options.step, cur_dims.b.n += options.step) { trmm_args = __do_setup( options, cur_dims); diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_block_pcg.cpp b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_block_pcg.cpp index 4f990f98ca3d..f4833cda1785 100644 --- a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_block_pcg.cpp +++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_block_pcg.cpp @@ -60,6 +60,69 @@ unsigned cg_iteration_limit = 10; +template +crsMat_t create_crs_matrix(char *mtx_bin_file) { + + using graph_t = typename crsMat_t::StaticCrsGraphType; + using row_map_view_t = typename graph_t::row_map_type::non_const_type; + using cols_view_t = typename 
graph_t::entries_type::non_const_type; + using values_view_t = typename crsMat_t::values_type::non_const_type; + using myExecSpace = typename crsMat_t::execution_space; + + crsMat_t crsmat; + + printf("matrix file: %s\n", mtx_bin_file); + + if(std::string(mtx_bin_file) == "auto") { + INDEX_TYPE num_rows = 11, num_cols = 11, nnz = 40; + crsmat = KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix(num_rows, num_cols, nnz, 3, 5); + printf("generating test matrix automatically\n"); + printf(" num rows: %d", num_rows); + printf(" num cols: %d", num_cols); + printf(" num non zeros: %d\n", nnz); + } else { + INDEX_TYPE nv = 0, ne = 0; + INDEX_TYPE *xadj, *adj; + SCALAR_TYPE *ew; + + KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); + + row_map_view_t rowmap_view ("rowmap_view", nv+1); + cols_view_t columns_view("colsmap_view", ne); + values_view_t values_view ("values_view", ne); + + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { + typename row_map_view_t::HostMirror hr = Kokkos::create_mirror_view (rowmap_view); + typename cols_view_t::HostMirror hc = Kokkos::create_mirror_view (columns_view); + typename values_view_t::HostMirror hv = Kokkos::create_mirror_view (values_view); + + for (INDEX_TYPE i = 0; i <= nv; ++i){ + hr(i) = xadj[i]; + } + for (INDEX_TYPE i = 0; i < ne; ++i){ + hc(i) = adj[i]; + hv(i) = ew[i]; + } + + Kokkos::deep_copy (rowmap_view , hr); + Kokkos::deep_copy (columns_view , hc); + Kokkos::deep_copy (values_view , hv); + } else { + KokkosKernels::Impl::copy_vector(ne, ew, values_view); + KokkosKernels::Impl::copy_vector(ne, adj, columns_view); + KokkosKernels::Impl::copy_vector(nv+1, xadj, rowmap_view); + } + + graph_t static_graph (columns_view, rowmap_view); + crsmat = crsMat_t("CrsMatrix", nv, values_view, static_graph); + delete [] xadj; + delete [] adj; + delete [] ew; + } + + return crsmat; +} + template scalar_view_t create_x_vector(INDEX_TYPE nv, SCALAR_TYPE max_value = 1.0){ @@ -338,10 +401,11 @@ 
enum { CMD_USE_THREADS = 0 , CMD_USE_CORE_PER_NUMA , CMD_USE_CUDA , CMD_USE_OPENMP + , CMD_USE_SERIAL , CMD_USE_CUDA_DEV , CMD_BIN_MTX , CMD_ERROR - , CMD_COUNT }; + , CMD_COUNT}; int main (int argc, char ** argv){ @@ -355,7 +419,10 @@ int main (int argc, char ** argv){ for ( int i = 1 ; i < argc ; ++i ) { - if ( 0 == strcasecmp( argv[i] , "--threads" ) ) { + if ( 0 == strcasecmp( argv[i] , "--serial" ) ) { + cmdline[ CMD_USE_SERIAL ] = 1 ; + } + else if ( 0 == strcasecmp( argv[i] , "--threads" ) ) { kargs.num_threads = cmdline[ CMD_USE_THREADS ] = atoi( argv[++i] ); } else if ( 0 == strcasecmp( argv[i] , "--openmp" ) ) { @@ -380,14 +447,14 @@ int main (int argc, char ** argv){ std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl ; std::cerr << "OPTIONS\n\t--threads [numThreads]\n\t--openmp [numThreads]\n\t--cuda\n\t--cuda-dev[DeviceIndex]\n\t--mtx[binary_mtx_file]" << std::endl; - return 0; + return 1; } } if (mtx_bin_file == NULL){ - std::cerr << "Provide a mtx binary file" << std::endl ; - std::cerr << "OPTIONS\n\t--threads [numThreads]\n\t--openmp [numThreads]\n\t--cuda\n\t--cuda-dev[DeviceIndex]\n\t--mtx[binary_mtx_file]" << std::endl; - return 0; + std::cerr << "Provide a mtx binary file or specify auto-generation" << std::endl ; + std::cerr << "OPTIONS\n\t--serial\n\t--threads [numThreads]\n\t--openmp [numThreads]\n\t--cuda\n\t--cuda-dev[DeviceIndex]\n\t--mtx[binary_mtx_file|auto]" << std::endl; + return 1; } std::cout << "Running experiments with block size:" << block_size << std::endl; @@ -395,39 +462,37 @@ int main (int argc, char ** argv){ Kokkos::initialize(kargs); -#if defined( KOKKOS_ENABLE_THREADS ) +#if defined( KOKKOS_ENABLE_SERIAL ) - if ( cmdline[ CMD_USE_THREADS ] ) { - INDEX_TYPE nv = 0, ne = 0; - INDEX_TYPE *xadj, *adj; - SCALAR_TYPE *ew; + if ( cmdline[ CMD_USE_SERIAL ] ) { + using myExecSpace = Kokkos::Serial; + Kokkos::Serial::print_configuration(std::cout); + using crsMat_t = typename 
KokkosSparse::CrsMatrix; + crsMat_t crsmat = create_crs_matrix(mtx_bin_file); + INDEX_TYPE nv = crsmat.numRows(); - KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); - Kokkos::Threads::print_configuration(std::cout); - - typedef Kokkos::Threads myExecSpace; - typedef typename KokkosSparse::CrsMatrix crsMat_t; + using values_view_t = typename crsMat_t::values_type::non_const_type; + values_view_t kok_x_original = create_x_vector(((nv /block_size) + 1) * block_size, MAXVAL); + for (INDEX_TYPE i = nv; i < ((nv /block_size) + 1) * block_size; ++i){ + kok_x_original(i) = 0; + } + run_experiment(crsmat, kok_x_original, block_size); + } - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type row_map_view_t; - typedef typename graph_t::entries_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; +#endif - row_map_view_t rowmap_view("rowmap_view", nv+1); - cols_view_t columns_view("colsmap_view", ne); - values_view_t values_view("values_view", ne); +#if defined( KOKKOS_ENABLE_THREADS ) - KokkosKernels::Impl::copy_vector(ne, ew, values_view); - KokkosKernels::Impl::copy_vector(ne, adj, columns_view); - KokkosKernels::Impl::copy_vector(nv+1, xadj, rowmap_view); + if ( cmdline[ CMD_USE_THREADS ] ) { + using myExecSpace = Kokkos::Threads; + Kokkos::Threads::print_configuration(std::cout); - graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); - delete [] xadj; - delete [] adj; - delete [] ew; + using crsMat_t = typename KokkosSparse::CrsMatrix; + crsMat_t crsmat = create_crs_matrix(mtx_bin_file); + INDEX_TYPE nv = crsmat.numRows(); + using values_view_t = typename crsMat_t::values_type::non_const_type; values_view_t kok_x_original = create_x_vector(((nv /block_size) + 1) * block_size, MAXVAL); for (INDEX_TYPE i = nv; i < ((nv /block_size) + 1) * block_size; ++i){ kok_x_original(i) = 
0; @@ -440,47 +505,19 @@ int main (int argc, char ** argv){ #if defined( KOKKOS_ENABLE_OPENMP ) if ( cmdline[ CMD_USE_OPENMP ] ) { - INDEX_TYPE nv = 0, ne = 0; - INDEX_TYPE *xadj, *adj; - SCALAR_TYPE *ew; - - + using myExecSpace = Kokkos::OpenMP; Kokkos::OpenMP::print_configuration(std::cout); - KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); - - - typedef Kokkos::OpenMP myExecSpace; - typedef typename KokkosSparse::CrsMatrix crsMat_t; - - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename crsMat_t::row_map_type::non_const_type row_map_view_t; - typedef typename crsMat_t::index_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - row_map_view_t rowmap_view("rowmap_view", nv+1); - cols_view_t columns_view("colsmap_view", ne); - values_view_t values_view("values_view", ne); - - KokkosKernels::Impl::copy_vector(ne, ew, values_view); - KokkosKernels::Impl::copy_vector(ne, adj, columns_view); - KokkosKernels::Impl::copy_vector(nv+1, xadj, rowmap_view); - - graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); - - //crsMat_t crsmat("CrsMatrix", nv, nv, ne, ew, xadj, adj); - delete [] xadj; - delete [] adj; - delete [] ew; - + using crsMat_t = typename KokkosSparse::CrsMatrix; + crsMat_t crsmat = create_crs_matrix(mtx_bin_file); + INDEX_TYPE nv = crsmat.numRows(); + using values_view_t = typename crsMat_t::values_type::non_const_type; values_view_t kok_x_original = create_x_vector(((nv /block_size) + 1) * block_size, MAXVAL); for (INDEX_TYPE i = nv; i < ((nv /block_size) + 1) * block_size; ++i){ kok_x_original(i) = 0; } run_experiment(crsmat, kok_x_original, block_size); - } #endif @@ -488,57 +525,16 @@ int main (int argc, char ** argv){ #if defined( KOKKOS_ENABLE_CUDA ) if ( cmdline[ CMD_USE_CUDA ] ) { // Use the last device: - INDEX_TYPE nv = 0, ne = 0; - INDEX_TYPE *xadj, *adj; - SCALAR_TYPE *ew; + using 
myExecSpace = Kokkos::Cuda; Kokkos::Cuda::print_configuration(std::cout); - KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); - - - typedef Kokkos::Cuda myExecSpace; - typedef typename KokkosSparse::CrsMatrix crsMat_t; - - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename crsMat_t::row_map_type::non_const_type row_map_view_t; - typedef typename crsMat_t::index_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - row_map_view_t rowmap_view("rowmap_view", nv+1); - cols_view_t columns_view("colsmap_view", ne); - values_view_t values_view("values_view", ne); - - - { - typename row_map_view_t::HostMirror hr = Kokkos::create_mirror_view (rowmap_view); - typename cols_view_t::HostMirror hc = Kokkos::create_mirror_view (columns_view); - typename values_view_t::HostMirror hv = Kokkos::create_mirror_view (values_view); - - for (INDEX_TYPE i = 0; i <= nv; ++i){ - hr(i) = xadj[i]; - } - - for (INDEX_TYPE i = 0; i < ne; ++i){ - hc(i) = adj[i]; - hv(i) = ew[i]; - } - Kokkos::deep_copy (rowmap_view , hr); - Kokkos::deep_copy (columns_view , hc); - Kokkos::deep_copy (values_view , hv); - - - } - graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); - - delete [] xadj; - delete [] adj; - delete [] ew; + using crsMat_t = typename KokkosSparse::CrsMatrix; + crsMat_t crsmat = create_crs_matrix(mtx_bin_file); + INDEX_TYPE nv = crsmat.numRows(); + using values_view_t = typename crsMat_t::values_type::non_const_type; values_view_t kok_x_original = create_x_vector(((nv /block_size) + 1) * block_size, MAXVAL); run_experiment(crsmat, kok_x_original, block_size); - - } #endif diff --git a/packages/kokkos-kernels/perf_test/sparse/spmv/OpenMPSmartStatic_SPMV.hpp b/packages/kokkos-kernels/perf_test/sparse/spmv/OpenMPSmartStatic_SPMV.hpp index 285bd1038f7a..d769d3a4da38 100644 --- 
a/packages/kokkos-kernels/perf_test/sparse/spmv/OpenMPSmartStatic_SPMV.hpp +++ b/packages/kokkos-kernels/perf_test/sparse/spmv/OpenMPSmartStatic_SPMV.hpp @@ -138,7 +138,7 @@ void openmp_smart_static_matvec(AType A, XType x, YType y) { #pragma omp parallel { -#ifdef KOKKOS_COMPILER_INTEL +#if defined(KOKKOS_COMPILER_INTEL) && !defined(__clang__) __assume_aligned(x_ptr, 64); __assume_aligned(y_ptr, 64); #endif diff --git a/packages/kokkos-kernels/scripts/cm_test_all_sandia b/packages/kokkos-kernels/scripts/cm_test_all_sandia index c5f7148125b4..f41cd818bf29 100755 --- a/packages/kokkos-kernels/scripts/cm_test_all_sandia +++ b/packages/kokkos-kernels/scripts/cm_test_all_sandia @@ -445,12 +445,12 @@ elif [ "$MACHINE" = "kokkos-dev" ]; then MODULE_ENVIRONMENT="source /projects/sems/modulefiles/utils/sems-modules-init.sh" eval "$MODULE_ENVIRONMENT" - module load sems-cmake/3.12.2 - BASE_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/" - CUDA9_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/,sems-gcc/6.1.0" - CUDA10_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/,sems-gcc/7.3.0" - CUDA11_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/,sems-gcc/9.2.0" - CLANG7_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/,sems-cuda/9.2" + module load sems-cmake/3.17.1 + BASE_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-/" + CUDA9_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-/,sems-gcc/6.1.0" + CUDA10_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-/,sems-gcc/7.3.0" + CUDA11_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-/,sems-gcc/9.2.0" + CLANG7_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-/,sems-cuda/9.2" SKIP_HWLOC=True if [ -z "$ARCH_FLAG" ]; then @@ -502,16 +502,16 @@ elif [ "$MACHINE" = "white" ]; then SKIP_HWLOC=True export SLURM_TASKS_PER_NODE=32 - BASE_MODULE_LIST="cmake/3.12.3,/" - IBM_MODULE_LIST="cmake/3.12.3,/xl/,gcc/7.2.0" - CUDA_MODULE_LIST="cmake/3.12.3,/,gcc/7.2.0,ibm/xl/16.1.1" - CUDA10_MODULE_LIST="cmake/3.12.3,/,gcc/7.4.0,ibm/xl/16.1.1" + BASE_MODULE_LIST="cmake/3.19.3,/" + 
IBM_MODULE_LIST="cmake/3.19.3,/xl/,gcc/7.2.0" + CUDA_MODULE_LIST="cmake/3.19.3,/,gcc/7.2.0,ibm/xl/16.1.1" + CUDA10_MODULE_LIST="cmake/3.19.3,/,gcc/7.4.0,ibm/xl/16.1.1" - GCC72_MODULE_TPL_LIST="cmake/3.12.3,/,netlib/3.8.0/gcc/7.2.0" - GCC74_MODULE_TPL_LIST="cmake/3.12.3,/,openblas/0.3.4/gcc/7.4.0" - CUDA_MODULE_TPL_LIST="cmake/3.12.3,/,gcc/7.2.0,netlib/3.8.0/gcc/7.2.0" - CUDA10_MODULE_TPL_LIST="cmake/3.12.3,/,gcc/7.4.0,openblas/0.3.4/gcc/7.4.0" - IBM_MODULE_TPL_LIST="cmake/3.12.3,/xl/,gcc/7.2.0,netlib/3.8.0/ibm/xl/16.1.1" + GCC72_MODULE_TPL_LIST="cmake/3.19.3,/,netlib/3.8.0/gcc/7.2.0" + GCC74_MODULE_TPL_LIST="cmake/3.19.3,/,openblas/0.3.4/gcc/7.4.0" + CUDA_MODULE_TPL_LIST="cmake/3.19.3,/,gcc/7.2.0,netlib/3.8.0/gcc/7.2.0" + CUDA10_MODULE_TPL_LIST="cmake/3.19.3,/,gcc/7.4.0,openblas/0.3.4/gcc/7.4.0" + IBM_MODULE_TPL_LIST="cmake/3.19.3,/xl/,gcc/7.2.0,netlib/3.8.0/ibm/xl/16.1.1" # Don't do pthread on white. GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" @@ -534,7 +534,8 @@ elif [ "$MACHINE" = "white" ]; then ) else # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + COMPILERS=("gcc/5.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/6.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/9.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "ibm/16.1.1 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" @@ -555,14 +556,14 @@ elif [ "$MACHINE" = "weaver" ]; then eval "$MODULE_ENVIRONMENT" SKIP_HWLOC=True - BASE_MODULE_LIST="cmake/3.12.3,/" - IBM_MODULE_LIST="cmake/3.12.3,/xl/,gcc/7.2.0" - CUDA_MODULE_LIST="cmake/3.12.3,/,gcc/7.2.0,ibm/xl/16.1.1" - CUDA10_MODULE_LIST="cmake/3.12.3,/,gcc/7.4.0,ibm/xl/16.1.1" + BASE_MODULE_LIST="cmake/3.19.3,/" + IBM_MODULE_LIST="cmake/3.19.3,/xl/,gcc/7.2.0" + CUDA_MODULE_LIST="cmake/3.19.3,/,gcc/7.2.0,ibm/xl/16.1.1" + 
CUDA10_MODULE_LIST="cmake/3.19.3,/,gcc/7.4.0,ibm/xl/16.1.1" - GCC72_MODULE_TPL_LIST="cmake/3.12.3,/,openblas/0.2.20/gcc/7.2.0" - CUDA_MODULE_TPL_LIST="cmake/3.12.3,/,gcc/7.2.0,netlib/3.8.0/gcc/7.2.0" - CUDA10_MODULE_TPL_LIST="cmake/3.12.3,/,gcc/7.2.0,openblas/0.2.20/gcc/7.2.0" + GCC72_MODULE_TPL_LIST="cmake/3.19.3,/,openblas/0.2.20/gcc/7.2.0" + CUDA_MODULE_TPL_LIST="cmake/3.19.3,/,gcc/7.2.0,netlib/3.8.0/gcc/7.2.0" + CUDA10_MODULE_TPL_LIST="cmake/3.19.3,/,gcc/7.2.0,openblas/0.2.20/gcc/7.2.0" # Issues finding CUBLAS with cuda/10.1.243 module at configure # "Could NOT find TPLCUBLAS (missing: CUDA_CUBLAS_LIBRARIES)" # Once resolved add the compiler + modules below to the SPOT_CHECK_TPLS @@ -609,7 +610,7 @@ elif [ "$MACHINE" = "voltrino" ]; then SKIP_HWLOC=True export SLURM_TASKS_PER_NODE=32 - BASE_MODULE_LIST="PrgEnv-intel,craype-mic-knl,cmake/3.16.2,slurm/19.05.5a,/,gcc/9.3.0" + BASE_MODULE_LIST="PrgEnv-intel,craype-mic-knl,cmake/3.16.2,slurm/20.11.4a,/,gcc/9.3.0" # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("intel/17.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" @@ -624,14 +625,13 @@ elif [ "$MACHINE" = "mayer" ]; then SKIP_HWLOC=True export SLURM_TASKS_PER_NODE=96 - BASE_MODULE_LIST="cmake/3.14.5,/" -# ARM_MODULE_LIST="cmake/3.12.2,/" + BASE_MODULE_LIST="cmake/3.17.1,/" ARMCLANG_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Wsign-compare,-Wtype-limits,-Wuninitialized" # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("gnu7/7.2.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "arm/20.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST armclang++ $ARMCLANG_WARNING_FLAGS") + "arm/20.1 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST armclang++ $ARMCLANG_WARNING_FLAGS") if [ -z "$ARCH_FLAG" ]; then ARCH_FLAG="--arch=ARMV8_THUNDERX2" @@ -650,7 +650,7 @@ elif [ "$MACHINE" = "caraway" ]; then HIPCLANG_WARNING_FLAGS="" # Format: (compiler module-list build-list exe-name warning-flag) - 
COMPILERS=("rocm/3.8.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS") + COMPILERS=("rocm/3.10.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS") if [ -z "$ARCH_FLAG" ]; then ARCH_FLAG="--arch=VEGA900" @@ -661,10 +661,14 @@ elif [ "$MACHINE" = "blake" ]; then SKIP_HWLOC=True export SLURM_TASKS_PER_NODE=32 - module load cmake/3.12.3 + module load cmake/3.19.3 - BASE_MODULE_LIST="cmake/3.12.3,/" - BASE_MODULE_LIST_INTEL="cmake/3.12.3,/compilers/" + BASE_MODULE_LIST="cmake/3.19.3,/" + BASE_MODULE_LIST_INTEL="cmake/3.19.3,/compilers/" + BASE_MODULE_LIST_ONEAPI="cmake/3.19.3,/oneAPI/base-toolkit/" + ONEAPI_WARNING_FLAGS="" + + GCC72_MODULE_TPL_LIST="$BASE_MODULE_LIST,openblas/0.2.20/gcc/7.2.0" if [ "$SPOT_CHECK" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) @@ -672,14 +676,14 @@ elif [ "$MACHINE" = "blake" ]; then #"intel/18.1.163 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" #"pgi/18.7.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" COMPILERS=("intel/19.1.144 $BASE_MODULE_LIST_INTEL "OpenMP_Serial" icpc $INTEL_WARNING_FLAGS" - "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST "Pthread_Serial,OpenMP" g++ $GCC_WARNING_FLAGS" ) elif [ "$SPOT_CHECK_TPLS" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) # TODO: Failing toolchains: #"pgi/18.7.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" - COMPILERS=("intel/18.1.163 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + COMPILERS=("intel/18.1.163 $BASE_MODULE_LIST_INTEL "OpenMP,Pthread" icpc $INTEL_WARNING_FLAGS" + "gcc/7.2.0 $GCC72_MODULE_TPL_LIST "OpenMP_Serial" g++ $GCC_WARNING_FLAGS" ) else COMPILERS=("intel/17.4.196 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" @@ -687,12 +691,14 @@ elif [ 
"$MACHINE" = "blake" ]; then "intel/19.1.144 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/19.3.199 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/19.5.281 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/2021.1.1 $BASE_MODULE_LIST_ONEAPI $INTEL_BUILD_LIST icpx $ONEAPI_WARNING_FLAGS" "gcc/5.5.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/6.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/8.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" ) fi @@ -707,21 +713,21 @@ elif [ "$MACHINE" = "apollo" ]; then module load sems-git module load sems-tex - module load sems-cmake/3.12.2 + module load sems-cmake/3.17.1 module load sems-gdb module load binutils SKIP_HWLOC=True - BASE_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/" - CUDA9_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,sems-gcc/5.3.0" - CUDA10_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,sems-gcc/5.3.0" - CUDA101_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,sems-gcc/7.3.0" + BASE_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-/" + CUDA9_MODULE_LIST="sems-env,sems-cmake/3.17.1,/,sems-gcc/5.3.0" + CUDA10_MODULE_LIST="sems-env,sems-cmake/3.17.1,/,sems-gcc/5.3.0" + CUDA101_MODULE_LIST="sems-env,sems-cmake/3.17.1,/,sems-gcc/7.3.0" - CLANG_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,cuda/9.0.69" - NVCC_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,sems-gcc/5.3.0" -# HPX_MODULE_LIST="sems-env,sems-cmake/3.12.2,hpx/1.2.1,sems-gcc/6.1.0,binutils" -# HPX3_MODULE_LIST="sems-env,sems-cmake/3.12.2,compilers/hpx/1.3.0,sems-gcc/6.1.0,binutils" + CLANG_MODULE_LIST="sems-env,sems-cmake/3.17.1,/,cuda/9.0.69" + 
NVCC_MODULE_LIST="sems-env,sems-cmake/3.17.1,/,sems-gcc/5.3.0" +# HPX_MODULE_LIST="sems-env,sems-cmake/3.17.1,hpx/1.2.1,sems-gcc/6.1.0,binutils" +# HPX3_MODULE_LIST="sems-env,sems-cmake/3.17.1,compilers/hpx/1.3.0,sems-gcc/6.1.0,binutils" BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_OpenMP" BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_Pthread" @@ -761,19 +767,19 @@ elif [ "$MACHINE" = "kokkos-dev-2" ]; then module load sems-git module load sems-tex - module load sems-cmake/3.12.2 + module load sems-cmake/3.17.1 module load sems-gdb SKIP_HWLOC=True - BASE_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/" - GCC91_MODULE_LIST="sems-env,sems-cmake/3.12.2,/" - NVCC_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,sems-gcc/7.3.0" - NVCC_SEMSMODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/,sems-gcc/7.3.0" - NVCC11_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,sems-gcc/9.2.0" + BASE_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-/" + GCC91_MODULE_LIST="sems-env,sems-cmake/3.17.1,/" + NVCC_MODULE_LIST="sems-env,sems-cmake/3.17.1,/,sems-gcc/7.3.0" + NVCC_SEMSMODULE_LIST="sems-env,sems-cmake/3.17.1,sems-/,sems-gcc/7.3.0" + NVCC11_MODULE_LIST="sems-env,sems-cmake/3.17.1,/,sems-gcc/9.2.0" - CLANG_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/,sems-gcc/6.1.0" - CLANG8_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,cuda/10.0" + CLANG_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-/,sems-gcc/6.1.0" + CLANG8_MODULE_LIST="sems-env,sems-cmake/3.17.1,/,cuda/10.0" BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_Pthread" BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_OpenMP" @@ -1077,7 +1083,7 @@ setup_env() { if [[ "${SPOT_CHECK_TPLS}" = "True" ]]; then # Some machines will require explicitly setting include dirs and libs - if ([[ "$MACHINE" = white* ]] || [[ "$MACHINE" = weaver* ]]) && [[ "$mod" = openblas* ]]; then + if ([[ "$MACHINE" = white* ]] || [[ "$MACHINE" = weaver* ]] || [[ "$MACHINE" = blake* ]]) && [[ "$mod" = openblas* ]]; then BLAS_LIBRARY_DIRS="${OPENBLAS_ROOT}/lib" LAPACK_LIBRARY_DIRS="${OPENBLAS_ROOT}/lib" # 
BLAS_LIBRARIES="openblas" @@ -1104,8 +1110,8 @@ setup_env() { done if [ -e ${CM_ALL_SCRIPT_PATH}/update_lib.sh ]; then - echo "calling ${CM_ALL_SCRIPT_PATH}/update_lib.sh $MACHINE" - source ${CM_ALL_SCRIPT_PATH}/update_lib.sh $MACHINE + echo "calling ${CM_ALL_SCRIPT_PATH}/update_lib.sh $MACHINE $compiler" + source ${CM_ALL_SCRIPT_PATH}/update_lib.sh $MACHINE $compiler fi return 0 diff --git a/packages/kokkos-kernels/scripts/docker/Dockerfile.hip b/packages/kokkos-kernels/scripts/docker/Dockerfile.hip new file mode 100644 index 000000000000..2db14b100912 --- /dev/null +++ b/packages/kokkos-kernels/scripts/docker/Dockerfile.hip @@ -0,0 +1,28 @@ +ARG BASE=rocm/dev-ubuntu-20.04:3.10 +FROM $BASE + +RUN apt-get update && apt-get install -y \ + git \ + wget \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +ENV PATH=/opt/rocm/bin:$PATH + +ARG CMAKE_VERSION=3.18.5 +ENV CMAKE_DIR=/opt/cmake +RUN CMAKE_KEY=2D2CEF1034921684 && \ + CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \ + CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \ + CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \ + gpg --keyserver pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} && \ + gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \ + grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check && \ + mkdir -p ${CMAKE_DIR} && \ + sh ${CMAKE_SCRIPT} --skip-license --prefix=${CMAKE_DIR} && \ + rm cmake* +ENV PATH=${CMAKE_DIR}/bin:$PATH diff --git a/packages/kokkos-kernels/scripts/update_lib.sh b/packages/kokkos-kernels/scripts/update_lib.sh index 822efa28b893..34ab5dd3c9a0 100755 --- a/packages/kokkos-kernels/scripts/update_lib.sh +++ b/packages/kokkos-kernels/scripts/update_lib.sh @@ -1,30 +1,53 @@ #!/bin/bash -if [ "$1" = blake ]; then - ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" - if [[ 
"${ICPCVER}" = 17.* || "${ICPCVER}" = 18.0.128 ]]; then - module swap gcc/4.9.3 gcc/6.4.0 - module list - fi -fi -if [ "$1" = kokkos-dev ]; then +local machine_input="$1" +local compiler_input="$2" + +check_sems_intel() { ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" if [[ "${ICPCVER}" = 17.* ]]; then module swap sems-gcc/4.9.3 sems-gcc/6.4.0 module list fi -fi -if [ "$1" = kokkos-dev-2 ]; then - ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" - if [[ "${ICPCVER}" = 17.* ]]; then - module swap sems-gcc/4.9.3 sems-gcc/6.4.0 + if [[ "${ICPCVER}" = 19.* ]]; then + # Newer gcc needed for c++ standard beyond c++14 + module swap sems-gcc/6.1.0 sems-gcc/7.2.0 module list fi -fi -if [ "$1" = sems ]; then +} + +check_sems_clang() { + CLANGVER=$(clang --version | grep "clang version" | cut -d " " -f 3) + if [[ "${CLANGVER}" = 9.* ]] || [[ "${CLANGVER}" = 10.* ]]; then + # Newer gcc needed for c++ standard beyond c++14 + module swap sems-gcc/5.3.0 sems-gcc/6.4.0 + module list + fi +} + +check_compiler_modules() { + if [[ "$compiler_input" = clang/* ]]; then + echo " clang compiler - check supporting modules" + check_sems_clang + elif [[ "$compiler_input" = intel/* ]]; then + echo " intel compiler - check supporting modules" + check_sems_intel + fi +} + +if [ "$machine_input" = blake ]; then ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" - if [[ "${ICPCVER}" = 17.* ]]; then - module swap sems-gcc/4.8.4 sems-gcc/6.4.0 + if [[ "${ICPCVER}" = 17.* || "${ICPCVER}" = 18.0.128 ]]; then + module swap gcc/4.9.3 gcc/6.4.0 module list fi fi +if [ "$machine_input" = kokkos-dev ]; then + check_compiler_modules +fi +if [ "$machine_input" = kokkos-dev-2 ]; then + check_compiler_modules +fi +if [ "$machine_input" = sems ] || [ "$machine_input" = sogpu ]; then + check_compiler_modules +fi diff --git a/packages/kokkos-kernels/src/CMakeLists.txt b/packages/kokkos-kernels/src/CMakeLists.txt index 22c17b524781..57b5394107df 100644 --- 
a/packages/kokkos-kernels/src/CMakeLists.txt +++ b/packages/kokkos-kernels/src/CMakeLists.txt @@ -437,4 +437,5 @@ KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC MKL) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC CUBLAS) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC CUSPARSE) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC METIS) +KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC ARMPL) # Not yet here KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC MAGMA) diff --git a/packages/kokkos-kernels/src/Kokkos_ArithTraits.hpp b/packages/kokkos-kernels/src/Kokkos_ArithTraits.hpp index 83c483a3d6b3..f96ffc49c39c 100644 --- a/packages/kokkos-kernels/src/Kokkos_ArithTraits.hpp +++ b/packages/kokkos-kernels/src/Kokkos_ArithTraits.hpp @@ -856,15 +856,19 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION float infinity() { return HUGE_VALF; } static KOKKOS_FORCEINLINE_FUNCTION bool isInf (const float x) { - #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isinf; - #endif +#elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using sycl::isinf +#endif return isinf (x); } static KOKKOS_FORCEINLINE_FUNCTION bool isNan (const float x) { - #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isnan; - #endif +#elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using sycl::isnan +#endif return isnan (x); } static KOKKOS_FORCEINLINE_FUNCTION mag_type abs (const float x) { @@ -1030,18 +1034,52 @@ class ArithTraits > { return std::complex (ArithTraits::infinity (), ArithTraits::infinity ()); } - static bool isInf (const std::complex& x) { - #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST +#ifdef KOKKOS_ENABLE_SYCL + template + static bool isInf(const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isinf; - #endif +#elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using sycl::isinf +#endif return isinf (real (x)) || isinf (imag (x)); } - static bool isNan (const 
std::complex& x) { - #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + template <> + static bool isInf(const std::complex& x) { + Kokkos::abort("isInf not available for std::complex!\n"); + return true; + } +#else + static bool isInf(const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + using std::isinf; +#endif + return isinf (real (x)) || isinf (imag (x)); + } +#endif +#ifdef KOKKOS_ENABLE_SYCL + template + static bool isNan(const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isnan; - #endif +#elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using sycl::isnan +#endif return isnan (real (x)) || isnan (imag (x)); } + template <> + static bool isNan(const std::complex& x) { + Kokkos::abort("isNan not available for std::complex!\n"); + return true; + } +#else + static bool isNan(const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + using std::isnan; +#endif + return isnan (real (x)) || isnan (imag (x)); + } +#endif static mag_type abs (const std::complex& x) { return std::abs (x); } @@ -1213,12 +1251,16 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION bool isInf (const val_type x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isinf; + #elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using sycl::isinf; #endif return isinf (x); } static KOKKOS_FORCEINLINE_FUNCTION bool isNan (const val_type x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isnan; + #elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using sycl::isnan; #endif return isnan (x); } diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_Serial_Internal.hpp index 07ca8933cf81..f46a278e8b8f 100644 --- a/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_Serial_Internal.hpp +++ 
b/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_Serial_Internal.hpp @@ -57,6 +57,7 @@ namespace KokkosBatched { RealType * w, const int wlen) { /// until debugging is done, comment out the code /// testing happens only for TPLs on host. + static_assert(false, "Serial eigendecomposition on device and/or without LAPACK is not implemented yet"); // typedef RealType real_type; // typedef Kokkos::Details::ArithTraits ats; @@ -356,9 +357,12 @@ namespace KokkosBatched { RealType * UR, const int urs0, const int urs1, RealType * w, const int wlen) { #if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) - if (as0 == 1 || as1 == 1) { + //if (as0 == 1 || as1 == 1) { /// column major or row major and it runs on host /// potentially it can run tpls internally + // NOTE BMK: If LAPACK not enabled, this will static_assert. + // If neither stride is unit, will runtime assert. + // Otherwise will succeed using LAPACK. host_invoke(m, A, as0, as1, er, ers, @@ -366,6 +370,7 @@ namespace KokkosBatched { UL, uls0, uls1, UR, urs0, urs1, w, wlen); + /* } else { /// arbitrary strides should be handled by native implementation device_invoke(m, @@ -375,7 +380,9 @@ namespace KokkosBatched { UL, uls0, uls1, UR, urs0, urs1, w, wlen); + throw std::runtime_error("Serial eigendecomposition without unit stride implemented yet."); } + */ #else /// device code runs device_invoke(m, diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp index b63d8477868c..7c4026d1e94b 100644 --- a/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp @@ -76,6 +76,8 @@ namespace KokkosBatched { RealType * UL, const int uls0, const int uls1, RealType * UR, const int urs0, const int urs1, RealType * w, const int wlen) 
{ + static_assert(false, "TeamVector eigendecomposition is not implemented yet."); + /* #if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) if (as0 == 1 || as1 == 1) { /// column major or row major and it runs on host @@ -100,6 +102,7 @@ namespace KokkosBatched { UL, uls0, uls1, UR, urs0, urs1, w, wlen); + throw std::runtime_error("TeamVector eigendecomposition is not implemented yet."); } #else /// device code runs @@ -111,6 +114,7 @@ namespace KokkosBatched { UR, urs0, urs1, w, wlen); #endif +*/ return 0; } }; diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Util.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Util.hpp index 3253b6ce1298..89dd20015053 100644 --- a/packages/kokkos-kernels/src/batched/KokkosBatched_Util.hpp +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Util.hpp @@ -204,7 +204,8 @@ namespace KokkosBatched { std::is_same >::value || std::is_same >::value || std::is_same >::value || - std::is_same >::value, + std::is_same >::value || + std::is_same::value, "KokkosKernels:: Invalid SIMD<> type." 
); using value_type = T; }; @@ -281,6 +282,16 @@ namespace KokkosBatched { template KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if::value,int> ::type mb() { return 2; } +#endif +#if defined(KOKKOS_ENABLE_SYCL) + template + KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if< + std::is_same::value, + int>::type + mb() { + return 2; + } #endif template KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if::value,int> @@ -330,6 +341,16 @@ namespace KokkosBatched { template KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if::value,int> ::type mb() { return 1; } +#endif +#if defined(KOKKOS_ENABLE_SYCL) + template + KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if< + std::is_same::value, + int>::type + mb() { + return 1; + } #endif template KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if::value,int> diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Vector_SIMD.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Vector_SIMD.hpp index a950e5e41f40..d7d3d5808092 100644 --- a/packages/kokkos-kernels/src/batched/KokkosBatched_Vector_SIMD.hpp +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Vector_SIMD.hpp @@ -702,6 +702,9 @@ namespace KokkosBatched { enum : int { vector_length = 8 }; typedef __m512d data_type __attribute__ ((aligned(64))); + inline + static const char* label() { return "AVX512"; } + template friend class Vector; diff --git a/packages/kokkos-kernels/src/blas/impl/KokkosBlas1_nrm1_impl.hpp b/packages/kokkos-kernels/src/blas/impl/KokkosBlas1_nrm1_impl.hpp index 9e1393eb5a15..296c424b3c89 100644 --- a/packages/kokkos-kernels/src/blas/impl/KokkosBlas1_nrm1_impl.hpp +++ b/packages/kokkos-kernels/src/blas/impl/KokkosBlas1_nrm1_impl.hpp @@ -52,10 +52,10 @@ namespace KokkosBlas { namespace Impl { // -// nrm1_squared +// nrm1 // -/// \brief 2-norm (squared) functor for single vectors. +/// \brief 1-norm functor for single vectors. 
/// /// \tparam RV 0-D output View /// \tparam XV 1-D input View @@ -63,12 +63,12 @@ namespace Impl { template struct V_Nrm1_Functor { - typedef typename XV::execution_space execution_space; - typedef SizeType size_type; - typedef typename XV::non_const_value_type xvalue_type; - typedef Kokkos::Details::InnerProductSpaceTraits IPT; - typedef Kokkos::Details::ArithTraits AT; - typedef typename IPT::mag_type value_type; + typedef typename XV::execution_space execution_space; + typedef SizeType size_type; + typedef typename XV::non_const_value_type xvalue_type; + typedef Kokkos::ArithTraits XAT; + typedef typename XAT::mag_type value_type; + typedef Kokkos::ArithTraits MAT; typename XV::const_type m_x; @@ -94,12 +94,13 @@ struct V_Nrm1_Functor KOKKOS_INLINE_FUNCTION void operator() (const size_type& i, value_type& sum) const { - sum += IPT::norm (m_x(i)); + xvalue_type val = m_x(i); + sum += MAT::abs(XAT::real(val)) + MAT::abs(XAT::imag(val)); } KOKKOS_INLINE_FUNCTION void init (value_type& update) const { - update = AT::zero (); + update = MAT::zero (); } KOKKOS_INLINE_FUNCTION void @@ -117,7 +118,7 @@ struct V_Nrm1_Functor } }; -/// \brief Column-wise 2-norm functor for multivectors; works for +/// \brief Column-wise 1-norm functor for multivectors; works for /// any layout, but best performance with LayoutRight. 
/// /// \tparam RV 1-D output View @@ -126,12 +127,12 @@ struct V_Nrm1_Functor template struct MV_Nrm1_Right_FunctorVector { - typedef typename XMV::execution_space execution_space; - typedef SizeType size_type; - typedef typename XMV::non_const_value_type xvalue_type; - typedef Kokkos::Details::InnerProductSpaceTraits IPT; - typedef Kokkos::Details::ArithTraits AT; - typedef typename IPT::mag_type value_type[]; + typedef typename XMV::execution_space execution_space; + typedef SizeType size_type; + typedef typename XMV::non_const_value_type xvalue_type; + typedef Kokkos::ArithTraits XAT; + typedef Kokkos::ArithTraits MAT; + typedef typename XAT::mag_type value_type[]; size_type value_count; typename XMV::const_type m_x; @@ -166,7 +167,8 @@ struct MV_Nrm1_Right_FunctorVector #pragma vector always #endif for (size_type j = 0; j < numVecs; ++j) { - sum[j] += IPT::norm (m_x(i,j)); + xvalue_type val = m_x(i, j); + sum[j] += MAT::abs(XAT::real(val)) + MAT::abs(XAT::imag(val)); } } @@ -181,7 +183,7 @@ struct MV_Nrm1_Right_FunctorVector #pragma vector always #endif for (size_type j = 0; j < numVecs; ++j) { - update[j] = AT::zero (); + update[j] = MAT::zero (); } } diff --git a/packages/kokkos-kernels/src/common/KokkosKernels_ExecSpaceUtils.hpp b/packages/kokkos-kernels/src/common/KokkosKernels_ExecSpaceUtils.hpp index 59bcf487fb84..9e06bc45f2e7 100644 --- a/packages/kokkos-kernels/src/common/KokkosKernels_ExecSpaceUtils.hpp +++ b/packages/kokkos-kernels/src/common/KokkosKernels_ExecSpaceUtils.hpp @@ -53,7 +53,15 @@ namespace KokkosKernels{ namespace Impl{ -enum ExecSpaceType{Exec_SERIAL, Exec_OMP, Exec_PTHREADS, Exec_QTHREADS, Exec_CUDA, Exec_HIP}; +enum ExecSpaceType { + Exec_SERIAL, + Exec_OMP, + Exec_PTHREADS, + Exec_QTHREADS, + Exec_CUDA, + Exec_HIP, + Exec_SYCL +}; template KOKKOS_FORCEINLINE_FUNCTION ExecSpaceType kk_get_exec_space_type(){ ExecSpaceType exec_space = Exec_SERIAL; @@ -87,6 +95,12 @@ KOKKOS_FORCEINLINE_FUNCTION ExecSpaceType kk_get_exec_space_type(){ } 
#endif +#if defined(KOKKOS_ENABLE_SYCL) + if (std::is_same::value) { + exec_space = Exec_SYCL; + } +#endif + #if defined( KOKKOS_ENABLE_QTHREAD) if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ exec_space = Exec_QTHREADS; @@ -115,6 +129,14 @@ constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space +constexpr KOKKOS_INLINE_FUNCTION bool +kk_is_gpu_exec_space() { + return true; +} +#endif + //Host function to determine free and total device memory. //Will throw if execution space doesn't support this. template diff --git a/packages/kokkos-kernels/src/graph/KokkosGraph_Distance1ColorHandle.hpp b/packages/kokkos-kernels/src/graph/KokkosGraph_Distance1ColorHandle.hpp index fc7f40bf1a4a..54a9b6db5b05 100644 --- a/packages/kokkos-kernels/src/graph/KokkosGraph_Distance1ColorHandle.hpp +++ b/packages/kokkos-kernels/src/graph/KokkosGraph_Distance1ColorHandle.hpp @@ -248,6 +248,13 @@ class GraphColoringHandle this->coloring_algorithm_type = COLORING_EB; #ifdef VERBOSE std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_EB\n"; +#endif + } + else if(KokkosKernels::Impl::kk_is_gpu_exec_space()) + { + this->coloring_algorithm_type = COLORING_EB; +#ifdef VERBOSE + std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_EB\n"; #endif } else diff --git a/packages/kokkos-kernels/src/graph/KokkosGraph_Distance2ColorHandle.hpp b/packages/kokkos-kernels/src/graph/KokkosGraph_Distance2ColorHandle.hpp index 265d42d1628c..4dc7dd7fe7d1 100644 --- a/packages/kokkos-kernels/src/graph/KokkosGraph_Distance2ColorHandle.hpp +++ b/packages/kokkos-kernels/src/graph/KokkosGraph_Distance2ColorHandle.hpp @@ -208,14 +208,14 @@ class GraphColorDistance2Handle if(KokkosKernels::Impl::kk_get_exec_space_type() == KokkosKernels::Impl::Exec_SERIAL) { this->coloring_algorithm_type = COLORING_D2_SERIAL; -#ifdef VERBOSE +#ifdef VERBOSE std::cout << "Serial Execution Space, Default Algorithm: COLORING_D2_SERIAL\n"; #endif } else { 
this->coloring_algorithm_type = COLORING_D2_NB_BIT; -#ifdef VERBOSE +#ifdef VERBOSE std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_D2_NB_BIT\n"; #endif } diff --git a/packages/kokkos-kernels/src/graph/KokkosGraph_ExplicitCoarsening.hpp b/packages/kokkos-kernels/src/graph/KokkosGraph_ExplicitCoarsening.hpp index 212cb7c38329..def892a167ba 100644 --- a/packages/kokkos-kernels/src/graph/KokkosGraph_ExplicitCoarsening.hpp +++ b/packages/kokkos-kernels/src/graph/KokkosGraph_ExplicitCoarsening.hpp @@ -80,6 +80,8 @@ void graph_explicit_coarsen( coarse_entries_t mergedEntries; KokkosKernels::Impl::sort_and_merge_graph (coarseRowmap, coarseEntries, mergedRowmap, mergedEntries); + coarseRowmap = mergedRowmap; + coarseEntries = mergedEntries; } } @@ -109,6 +111,8 @@ void graph_explicit_coarsen_with_inverse_map( coarse_entries_t mergedEntries; KokkosKernels::Impl::sort_and_merge_graph (coarseRowmap, coarseEntries, mergedRowmap, mergedEntries); + coarseRowmap = mergedRowmap; + coarseEntries = mergedEntries; } } diff --git a/packages/kokkos-kernels/src/graph/KokkosGraph_MIS2.hpp b/packages/kokkos-kernels/src/graph/KokkosGraph_MIS2.hpp index c578a9727131..b3098870c545 100644 --- a/packages/kokkos-kernels/src/graph/KokkosGraph_MIS2.hpp +++ b/packages/kokkos-kernels/src/graph/KokkosGraph_MIS2.hpp @@ -94,6 +94,7 @@ graph_mis2_coarsen(const rowmap_t& rowmap, const colinds_t& colinds, typename co if(rowmap.extent(0) <= 1) { //there are no vertices to label + numClusters = 0; return labels_t(); } labels_t mis2 = graph_d2_mis(rowmap, colinds, algo); diff --git a/packages/kokkos-kernels/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp b/packages/kokkos-kernels/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp index 110756a3649b..3adda031df3b 100644 --- a/packages/kokkos-kernels/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp +++ b/packages/kokkos-kernels/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp @@ -2058,32 +2058,39 @@ class 
GraphColor_VBD:public GraphColor newFrontierSize_; size_type maxColors_; color_view_type colors_; - - functorDeterministicColoring(const_lno_row_view_t rowPtr, - const_lno_nnz_view_t colInd, - nnz_lno_persistent_work_view_t dependency, - nnz_lno_temp_work_view_t frontier, - Kokkos::View frontierSize, - nnz_lno_temp_work_view_t newFrontier, - Kokkos::View newFrontierSize, - size_type maxColors, - color_view_type colors) - : xadj_(rowPtr), adj_(colInd), dependency_(dependency), frontier_(frontier), - frontierSize_(frontierSize), newFrontier_(newFrontier), newFrontierSize_(newFrontierSize), - maxColors_(maxColors), colors_(colors) {} + Kokkos::View bannedColors_; + + functorDeterministicColoring( + const_lno_row_view_t rowPtr, const_lno_nnz_view_t colInd, + nnz_lno_persistent_work_view_t dependency, + nnz_lno_temp_work_view_t frontier, + Kokkos::View frontierSize, + nnz_lno_temp_work_view_t newFrontier, + Kokkos::View newFrontierSize, + size_type maxColors, color_view_type colors) + : xadj_(rowPtr), + adj_(colInd), + dependency_(dependency), + frontier_(frontier), + frontierSize_(frontierSize), + newFrontier_(newFrontier), + newFrontierSize_(newFrontierSize), + maxColors_(maxColors), + colors_(colors), + bannedColors_("KokkosKernels::bannedColors", frontier.size(), + maxColors_) {} KOKKOS_INLINE_FUNCTION void operator() (const size_type frontierIdx) const { typedef typename std::remove_reference< decltype( newFrontierSize_() ) >::type atomic_incr_type; size_type frontierNode = frontier_(frontierIdx); - int* bannedColors = new int[maxColors_]; for(size_type colorIdx= 0; colorIdx < maxColors_; ++colorIdx) { - bannedColors[colorIdx] = 0; + bannedColors_(frontierIdx, colorIdx) = 0; } // Loop over neighbors, find banned colors, decrement dependency and update newFrontier for(size_type neigh = xadj_(frontierNode); neigh < xadj_(frontierNode + 1); ++neigh) { - bannedColors[colors_(adj_(neigh))] = 1; + bannedColors_(frontierIdx, colors_(adj_(neigh))) = 1; // We want to avoid 
the cost of atomic operations when not needed // so let's check that the node is not already colored, i.e. @@ -2100,12 +2107,11 @@ class GraphColor_VBD:public GraphColor struct GetUnifiedLayout { typedef typename std::conditional< ( (ViewType::rank == 1) && - (std::is_same::value) ) || + (!std::is_same::value) ) || ( (ViewType::rank == 0) ) ,Kokkos::LayoutLeft,typename ViewType::array_layout>::type array_layout; }; diff --git a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp index 217e6f493992..c0e58b19b3ec 100644 --- a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp +++ b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp @@ -72,18 +72,10 @@ struct axpby_tpl_spec_avail< \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif #endif @@ -101,18 +93,10 @@ struct axpby_tpl_spec_avail< \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, 
Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif #endif } diff --git a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp index 135dcc6d1bca..d3b0fabd71fc 100644 --- a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp +++ b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp @@ -72,18 +72,10 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1,1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif #endif @@ -101,18 +93,10 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1,1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif #endif diff --git 
a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp index 8df32f62d80e..182aba311501 100644 --- a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp +++ b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp @@ -70,18 +70,10 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS( unsigned long, double, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS( unsigned long, float, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS( unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS( unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif #endif @@ -104,41 +96,23 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned long, double, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned int, double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned long, float, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned int, float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned long, Kokkos::complex,Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned int, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) 
-#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned int, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOS_ENABLE_CUDA_UVM) -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned long, double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned int, double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned long, float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned int, float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned long, Kokkos::complex,Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned int, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned int, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#endif #endif diff --git a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp index 3ff2cf4703f7..5a44212e6738 100644 --- a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp +++ b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp @@ -70,18 +70,10 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS( 
double, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif #endif @@ -97,18 +89,10 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif #endif diff --git a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp index 3dd558ccd18a..3facb0c245f9 100644 --- a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp +++ b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp @@ -57,7 +57,6 @@ struct nrm2_tpl_spec_avail { namespace KokkosBlas { namespace Impl { - // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // double @@ -70,18 +69,10 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if 
defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif #endif @@ -97,18 +88,10 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif #endif diff --git a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp index 1ebf2e2f40e9..072abff90493 100644 --- a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp +++ b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp @@ -70,45 +70,10 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS( 
Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif - -#endif - -// cuBLAS -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -// double -#define KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_CUBLAS( SCALAR, LAYOUT, MEMSPACE ) \ -template \ -struct nrminf_tpl_spec_avail< \ -Kokkos::View::mag_type, LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ -Kokkos::View, \ - Kokkos::MemoryTraits >, \ -1> { enum : bool { value = true }; }; - -#if defined (KOKKOSKERNELS_INST_DOUBLE) -KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) -KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) -KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) -KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif #endif diff --git a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp index 5f7a102e77dd..b91e81891aba 100644 --- a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp @@ -83,6 +83,7 @@ Kokkos::View, \ typedef Kokkos::View, \ Kokkos::MemoryTraits > XV; \ typedef typename XV::size_type size_type; \ + typedef Kokkos::Details::InnerProductSpaceTraits IPT; \ \ static void nrminf (RV& R, const XV& X) \ { \ @@ -94,7 +95,7 @@ Kokkos::View, \ int N = numElems; \ int one = 1; \ int idx = HostBlas::iamax(N,X.data(),one)-1; \ - R() = X(idx); \ + R() = IPT::norm(X(idx)); \ } else { \ NrmInf::nrminf(R,X); 
\ } \ @@ -116,6 +117,7 @@ Kokkos::View, \ typedef Kokkos::View, \ Kokkos::MemoryTraits > XV; \ typedef typename XV::size_type size_type; \ + typedef Kokkos::Details::InnerProductSpaceTraits IPT; \ \ static void nrminf (RV& R, const XV& X) \ { \ @@ -127,7 +129,7 @@ Kokkos::View, \ int N = numElems; \ int one = 1; \ int idx = HostBlas::iamax(N,X.data(),one)-1; \ - R() = X(idx); \ + R() = IPT::norm(X(idx)); \ } else { \ NrmInf::nrminf(R,X); \ } \ @@ -220,176 +222,4 @@ KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_BLAS( Kokkos::LayoutLeft, Kokkos::HostSpace, f #endif -// cuBLAS -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#include - -namespace KokkosBlas { -namespace Impl { - -#define KOKKOSBLAS1_DNRMINF_TPL_SPEC_DECL_CUBLAS( LAYOUT, MEMSPACE, ETI_SPEC_AVAIL ) \ -template \ -struct NrmInf< \ -Kokkos::View >, \ -Kokkos::View, \ - Kokkos::MemoryTraits >, \ -1,true, ETI_SPEC_AVAIL > { \ - \ - typedef Kokkos::View > RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrminf (RV& R, const XV& X) \ - { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_CUBLAS,double]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { Kokkos::deep_copy (R, 0.0); return; } \ - if (numElems < static_cast (INT_MAX)) { \ - nrminf_print_specialization(); \ - const int N = static_cast (numElems); \ - constexpr int one = 1; \ - int idx; \ - KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasIdamax(s.handle, N, X.data(), one, &idx); \ - Kokkos::deep_copy(R, subview(X,idx-1)); \ - } else { \ - NrmInf::nrminf(R,X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ -}; - -#define KOKKOSBLAS1_SNRMINF_TPL_SPEC_DECL_CUBLAS( LAYOUT, MEMSPACE, ETI_SPEC_AVAIL ) \ -template \ -struct NrmInf< \ -Kokkos::View >, \ -Kokkos::View, \ - Kokkos::MemoryTraits >, \ -1,true, ETI_SPEC_AVAIL > { \ - \ - typedef Kokkos::View > RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > XV; 
\ - typedef typename XV::size_type size_type; \ - \ - static void nrminf (RV& R, const XV& X) \ - { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_CUBLAS,float]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { Kokkos::deep_copy (R, 0.0f);; return; } \ - if (numElems < static_cast (INT_MAX)) { \ - nrminf_print_specialization(); \ - const int N = static_cast (numElems); \ - constexpr int one = 1; \ - int idx; \ - KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasIsamax(s.handle, N, X.data(), one, &idx); \ - Kokkos::deep_copy(R, subview(X,idx-1)); \ - } else { \ - NrmInf::nrminf(R,X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ -}; - -#define KOKKOSBLAS1_ZNRMINF_TPL_SPEC_DECL_CUBLAS( LAYOUT, MEMSPACE, ETI_SPEC_AVAIL ) \ -template \ -struct NrmInf< \ -Kokkos::View >, \ -Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ -1,true, ETI_SPEC_AVAIL > { \ - \ - typedef Kokkos::View > RV; \ - typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits > XV; \ - typedef typename XV::size_type size_type; \ - typedef Kokkos::Details::InnerProductSpaceTraits> IPT; \ - \ - static void nrminf (RV& R, const XV& X) \ - { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { Kokkos::deep_copy (R, 0.0); return; } \ - if (numElems < static_cast (INT_MAX)) { \ - nrminf_print_specialization(); \ - const int N = static_cast (numElems); \ - constexpr int one = 1; \ - int idx; \ - KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasIzamax(s.handle, N, reinterpret_cast(X.data()), one, &idx); \ - Kokkos::complex R_cplx_val {0.0, 0.0}; \ - Kokkos::View, LAYOUT, Kokkos::HostSpace, Kokkos::MemoryTraits > R_cplx (&R_cplx_val); \ - Kokkos::deep_copy(R_cplx, subview(X,idx-1)); \ - R() = IPT::norm(R_cplx()); \ - } else { \ - 
NrmInf::nrminf(R,X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ -}; - -#define KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_CUBLAS( LAYOUT, MEMSPACE, ETI_SPEC_AVAIL ) \ -template \ -struct NrmInf< \ -Kokkos::View >, \ -Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ -1,true, ETI_SPEC_AVAIL > { \ - \ - typedef Kokkos::View > RV; \ - typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits > XV; \ - typedef typename XV::size_type size_type; \ - typedef Kokkos::Details::InnerProductSpaceTraits> IPT; \ - \ - static void nrminf (RV& R, const XV& X) \ - { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { Kokkos::deep_copy (R, 0.0f); return; } \ - if (numElems < static_cast (INT_MAX)) { \ - nrminf_print_specialization(); \ - const int N = static_cast (numElems); \ - constexpr int one = 1; \ - int idx; \ - KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasIcamax(s.handle, N, reinterpret_cast(X.data()), one, &idx); \ - Kokkos::complex R_cplx_val {0.0f, 0.0f}; \ - Kokkos::View, LAYOUT, Kokkos::HostSpace, Kokkos::MemoryTraits > R_cplx (&R_cplx_val); \ - Kokkos::deep_copy(R_cplx, subview(X,idx-1)); \ - R() = IPT::norm(R_cplx()); \ - } else { \ - NrmInf::nrminf(R,X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ -}; - -KOKKOSBLAS1_DNRMINF_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS1_DNRMINF_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false) - -KOKKOSBLAS1_SNRMINF_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS1_SNRMINF_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false) - -KOKKOSBLAS1_ZNRMINF_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZNRMINF_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false) - -KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_CUBLAS( 
Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false) - -} -} - -#endif - #endif diff --git a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp index 2b92355dd9e4..114923cca7db 100644 --- a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp +++ b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp @@ -71,18 +71,10 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif #endif @@ -99,33 +91,15 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOS_ENABLE_CUDA_UVM) -#if defined (KOKKOSKERNELS_INST_DOUBLE) 
KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#endif #endif diff --git a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp index 579d9b81a5c7..d866702f4f74 100644 --- a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp @@ -376,7 +376,6 @@ KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, f KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true) KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -#if defined (KOKKOS_ENABLE_CUDA_UVM) KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) @@ -388,7 +387,6 @@ KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaUVMSpace KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) -#endif } } diff --git a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp index c1fd67f9ea17..5c6d1734dcdb 100644 --- a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp +++ 
b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp @@ -67,39 +67,15 @@ struct gemv_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, 
Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif #endif @@ -117,39 +93,15 @@ struct gemv_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, 
Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -#endif #endif } diff --git a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp index 49f6fe743c80..3b21c0e8a740 100644 --- a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp +++ b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp @@ -67,39 +67,15 @@ struct gemm_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined 
(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif #endif @@ -117,47 +93,23 @@ struct gemm_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, 
Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif #endif } diff --git a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp index bce7cb5f5d27..03e2badcc176 100644 --- a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp +++ b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp @@ -66,39 +66,15 @@ struct 
trmm_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif #endif // KOKKOSKERNELS_ENABLE_TPL_BLAS @@ -114,47 +90,23 @@ struct trmm_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && 
defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined 
(KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif #endif // KOKKOSKERNELS_ENABLE_TPL_CUBLAS } // namespace Impl diff --git a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp index 93808e3eb01a..29a04fb715da 100644 --- a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp +++ b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp @@ -66,39 +66,15 @@ struct trsm_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, 
Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif #endif @@ -114,47 +90,23 @@ struct trsm_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined 
(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif #endif } diff --git a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp index 917a55fec467..e25a9aa3f18a 100644 --- a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp +++ 
b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp @@ -66,22 +66,10 @@ struct gesv_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif /* #if defined (KOKKOSKERNELS_INST_DOUBLE) \ && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) @@ -114,22 +102,10 @@ struct gesv_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA( double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif /* #if defined (KOKKOSKERNELS_INST_DOUBLE) \ diff --git 
a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp index fa651f531f72..4b602bd765d9 100644 --- a/packages/kokkos-kernels/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp +++ b/packages/kokkos-kernels/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp @@ -78,55 +78,31 @@ KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL(SCALAR, LAYOUTA, MEMSPACE) #define KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( SCALAR , LAYOUTA, MEMSPACE ) #endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( double, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( 
Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutRight, Kokkos::HostSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( double, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( double, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutRight, Kokkos::HostSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( float, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( float, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::HostSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::HostSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif } // namespace Impl } // namespace KokkosBlas diff --git a/packages/kokkos-kernels/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp b/packages/kokkos-kernels/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp index abeab8c2149a..a6749be8c8e6 100644 --- a/packages/kokkos-kernels/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp +++ 
b/packages/kokkos-kernels/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp @@ -71,117 +71,22 @@ struct spmv_tpl_spec_avail, int, int, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ - && defined (KOKKOSKERNELS_INST_OFFSET_INT) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ - && defined (KOKKOSKERNELS_INST_OFFSET_INT) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ - && defined (KOKKOSKERNELS_INST_OFFSET_INT) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ - && defined (KOKKOSKERNELS_INST_OFFSET_INT) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ - && defined (KOKKOSKERNELS_INST_OFFSET_INT) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined 
(KOKKOSKERNELS_INST_LAYOUTRIGHT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ - && defined (KOKKOSKERNELS_INST_OFFSET_INT) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ - && defined (KOKKOSKERNELS_INST_OFFSET_INT) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif //CUDA_VERSION by itself cannot determine whether the generic cuSPARSE API is available: //cuSPARSE version 10.1.105 does not have the generic API, but it comes with the same CUDA_VERSION (10010) as 10.1.243 which does. @@ -190,122 +95,52 @@ struct spmv_tpl_spec_avail, int64_t, size_t, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT64_T) \ - && defined (KOKKOSKERNELS_INST_OFFSET_SIZE_T) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, size_t, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT64_T) \ - && defined (KOKKOSKERNELS_INST_OFFSET_SIZE_T) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, size_t, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT64_T) \ - && defined (KOKKOSKERNELS_INST_OFFSET_SIZE_T) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, size_t, Kokkos::LayoutRight, Kokkos::LayoutRight, 
Kokkos::CudaSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT64_T) \ - && defined (KOKKOSKERNELS_INST_OFFSET_SIZE_T) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, size_t, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT64_T) \ - && defined (KOKKOSKERNELS_INST_OFFSET_SIZE_T) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, size_t, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT64_T) \ - && defined (KOKKOSKERNELS_INST_OFFSET_SIZE_T) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, size_t, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT64_T) \ - && defined (KOKKOSKERNELS_INST_OFFSET_SIZE_T) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, size_t, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif #endif // CUSPARSE >= 10.3 (nested, implies >= 9.0) #endif // CUDA/CUSPARSE >= 9.0? 
#endif // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ +template <> \ +struct spmv_tpl_spec_avail, Kokkos::MemoryTraits, const int, \ + const SCALAR*, Kokkos::LayoutLeft, Kokkos::Device, Kokkos::MemoryTraits, \ + SCALAR*, Kokkos::LayoutLeft, Kokkos::Device, Kokkos::MemoryTraits > { \ + enum : bool { value = true }; \ +}; + +#ifdef KOKKOS_ENABLE_SERIAL +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(float, Kokkos::Serial) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(double, Kokkos::Serial) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::Serial) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::Serial) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(float, Kokkos::OpenMP) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(double, Kokkos::OpenMP) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::OpenMP) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::OpenMP) +#endif + +#endif + // Specialization struct which defines whether a specialization exists template + +namespace KokkosSparse +{ +namespace Impl +{ + +#if (__INTEL_MKL__ > 2017) + //MKL 2018 and above: use new interface: sparse_matrix_t and mkl_sparse_?_mv() + + inline void mkl_safe_call(int errcode) + { + if(errcode != SPARSE_STATUS_SUCCESS) + throw std::runtime_error("MKL returned non-success error code"); + } + + inline sparse_operation_t mode_kk_to_mkl(char mode_kk) + { + switch(toupper(mode_kk)) + { + case 'N': + return SPARSE_OPERATION_NON_TRANSPOSE; + case 'T': + return SPARSE_OPERATION_TRANSPOSE; + case 'H': + return SPARSE_OPERATION_CONJUGATE_TRANSPOSE; + default:; + } + throw std::invalid_argument("Invalid mode for MKL (should be one of N, T, H)"); + } + + inline void spmv_mkl(sparse_operation_t op, float alpha, float beta, + int m, int n, const int* Arowptrs, const int* Aentries, const float* Avalues, + const float* x, float* y) + { + sparse_matrix_t A_mkl; 
+ matrix_descr A_descr; + A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; + A_descr.mode = SPARSE_FILL_MODE_FULL; + A_descr.diag = SPARSE_DIAG_NON_UNIT; + mkl_safe_call(mkl_sparse_s_create_csr(&A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), const_cast(Avalues))); + mkl_safe_call(mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y)); + } + + inline void spmv_mkl(sparse_operation_t op, double alpha, double beta, + int m, int n, const int* Arowptrs, const int* Aentries, const double* Avalues, + const double* x, double* y) + { + sparse_matrix_t A_mkl; + matrix_descr A_descr; + A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; + A_descr.mode = SPARSE_FILL_MODE_FULL; + A_descr.diag = SPARSE_DIAG_NON_UNIT; + mkl_safe_call(mkl_sparse_d_create_csr(&A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), const_cast(Avalues))); + mkl_safe_call(mkl_sparse_d_mv(op, alpha, A_mkl, A_descr, x, beta, y)); + } + + inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, Kokkos::complex beta, + int m, int n, const int* Arowptrs, const int* Aentries, const Kokkos::complex* Avalues, + const Kokkos::complex* x, Kokkos::complex* y) + { + sparse_matrix_t A_mkl; + matrix_descr A_descr; + A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; + A_descr.mode = SPARSE_FILL_MODE_FULL; + A_descr.diag = SPARSE_DIAG_NON_UNIT; + mkl_safe_call(mkl_sparse_c_create_csr(&A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), (MKL_Complex8*) Avalues)); + MKL_Complex8& alpha_mkl = reinterpret_cast(alpha); + MKL_Complex8& beta_mkl = reinterpret_cast(beta); + mkl_safe_call(mkl_sparse_c_mv( + op, alpha_mkl, A_mkl, A_descr, + reinterpret_cast(x), beta_mkl, reinterpret_cast(y))); + } + + inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, Kokkos::complex beta, + int m, int n, const int* Arowptrs, const int* Aentries, const 
Kokkos::complex* Avalues, + const Kokkos::complex* x, Kokkos::complex* y) + { + sparse_matrix_t A_mkl; + matrix_descr A_descr; + A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; + A_descr.mode = SPARSE_FILL_MODE_FULL; + A_descr.diag = SPARSE_DIAG_NON_UNIT; + mkl_safe_call(mkl_sparse_z_create_csr(&A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), (MKL_Complex16*) Avalues)); + MKL_Complex16& alpha_mkl = reinterpret_cast(alpha); + MKL_Complex16& beta_mkl = reinterpret_cast(beta); + mkl_safe_call(mkl_sparse_z_mv( + op, alpha_mkl, A_mkl, A_descr, + reinterpret_cast(x), beta_mkl, reinterpret_cast(y))); + } + +#define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ + template<> \ + struct SPMV, Kokkos::MemoryTraits, int const, \ + SCALAR const*, Kokkos::LayoutLeft, Kokkos::Device, Kokkos::MemoryTraits, \ + SCALAR*, Kokkos::LayoutLeft, Kokkos::Device, Kokkos::MemoryTraits, \ + true, COMPILE_LIBRARY> { \ + \ + using device_type = Kokkos::Device; \ + using AMatrix = CrsMatrix, int const>; \ + using XVector = Kokkos::View>; \ + using YVector = Kokkos::View>; \ + using coefficient_type = typename YVector::non_const_value_type; \ + using Controls = KokkosKernels::Experimental::Controls; \ + \ + static void spmv (const Controls&, \ + const char mode[], \ + const coefficient_type& alpha, \ + const AMatrix& A, \ + const XVector& x, \ + const coefficient_type& beta, \ + const YVector& y) { \ + std::string label = "KokkosSparse::spmv[TPL_MKL," + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_mkl(mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), A.numCols(), \ + A.graph.row_map.data(), A.graph.entries.data(), A.values.data(), x.data(), y.data()); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; +#endif + +#if (__INTEL_MKL__ == 2017) + //MKL 2017: use old interface: mkl_?csrmv + inline char mode_kk_to_mkl(char mode_kk) + { + switch(toupper(mode_kk)) + { + case 'N': + return 'N'; 
+ case 'T': + return 'T'; + case 'H': + return 'C'; + default:; + } + throw std::invalid_argument("Invalid mode for MKL (should be one of N, T, H)"); + } + + + //void mkl_scsrmv(const char *transa, const MKL_INT *m, const MKL_INT *k, const float *alpha, const char *matdescra, const float *val, const MKL_INT *indx, const MKL_INT *pn trb, const MKL_INT *pntre, const float *x, const float *beta, float *y); + inline void spmv_mkl(char mode, float alpha, float beta, int m, int n, const int* Arowptrs, const int* Aentries, const float* Avalues, const float* x, float* y) + { + mkl_scsrmv(&mode, &m, &n, &alpha, "G**C", Avalues, Aentries, Arowptrs, Arowptrs + 1, x, &beta, y); + } + + inline void spmv_mkl(char mode, double alpha, double beta, int m, int n, const int* Arowptrs, const int* Aentries, const double* Avalues, const double* x, double* y) + { + mkl_dcsrmv(&mode, &m, &n, &alpha, "G**C", Avalues, Aentries, Arowptrs, Arowptrs + 1, x, &beta, y); + } + + inline void spmv_mkl(char mode, Kokkos::complex alpha, Kokkos::complex beta, int m, int n, const int* Arowptrs, const int* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) + { + const MKL_Complex8* alpha_mkl = reinterpret_cast(&alpha); + const MKL_Complex8* beta_mkl = reinterpret_cast(&beta); + const MKL_Complex8* Avalues_mkl = reinterpret_cast(Avalues); + const MKL_Complex8* x_mkl = reinterpret_cast(x); + MKL_Complex8* y_mkl = reinterpret_cast(y); + mkl_ccsrmv(&mode, &m, &n, alpha_mkl, "G**C", Avalues_mkl, Aentries, Arowptrs, Arowptrs + 1, x_mkl, beta_mkl, y_mkl); + } + + inline void spmv_mkl(char mode, Kokkos::complex alpha, Kokkos::complex beta, int m, int n, const int* Arowptrs, const int* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) + { + const MKL_Complex16* alpha_mkl = reinterpret_cast(&alpha); + const MKL_Complex16* beta_mkl = reinterpret_cast(&beta); + const MKL_Complex16* Avalues_mkl = reinterpret_cast(Avalues); + const 
MKL_Complex16* x_mkl = reinterpret_cast(x); + MKL_Complex16* y_mkl = reinterpret_cast(y); + mkl_zcsrmv(&mode, &m, &n, alpha_mkl, "G**C", Avalues_mkl, Aentries, Arowptrs, Arowptrs + 1, x_mkl, beta_mkl, y_mkl); + } + +#define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ + template<> \ + struct SPMV, Kokkos::MemoryTraits, int const, \ + SCALAR const*, Kokkos::LayoutLeft, Kokkos::Device, Kokkos::MemoryTraits, \ + SCALAR*, Kokkos::LayoutLeft, Kokkos::Device, Kokkos::MemoryTraits, \ + true, COMPILE_LIBRARY> { \ + \ + using device_type = Kokkos::Device; \ + using AMatrix = CrsMatrix, int const>; \ + using XVector = Kokkos::View>; \ + using YVector = Kokkos::View>; \ + using coefficient_type = typename YVector::non_const_value_type; \ + using Controls = KokkosKernels::Experimental::Controls; \ + \ + static void spmv (const Controls&, \ + const char mode[], \ + const coefficient_type& alpha, \ + const AMatrix& A, \ + const XVector& x, \ + const coefficient_type& beta, \ + const YVector& y) { \ + std::string label = "KokkosSparse::spmv[TPL_MKL," + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_mkl(mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), A.numCols(), \ + A.graph.row_map.data(), A.graph.entries.data(), A.values.data(), x.data(), y.data()); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; +#endif + +#ifdef KOKKOS_ENABLE_SERIAL + KOKKOSSPARSE_SPMV_MKL(float, Kokkos::Serial, true) + KOKKOSSPARSE_SPMV_MKL(double, Kokkos::Serial, true) + KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::Serial, true) + KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::Serial, true) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP + KOKKOSSPARSE_SPMV_MKL(float, Kokkos::OpenMP, true) + KOKKOSSPARSE_SPMV_MKL(double, Kokkos::OpenMP, true) + KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, true) + KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, true) +#endif + +#undef KOKKOSSPARSE_SPMV_MKL +} +} +#endif + #endif // 
KOKKOSPARSE_SPMV_TPL_SPEC_DECL_HPP_ diff --git a/packages/kokkos-kernels/src/sparse/KokkosSparse_CrsMatrix.hpp b/packages/kokkos-kernels/src/sparse/KokkosSparse_CrsMatrix.hpp index d866a6360159..d734d9ac3ac5 100644 --- a/packages/kokkos-kernels/src/sparse/KokkosSparse_CrsMatrix.hpp +++ b/packages/kokkos-kernels/src/sparse/KokkosSparse_CrsMatrix.hpp @@ -411,7 +411,7 @@ class CrsMatrix { typedef SizeType size_type; //! Type of a host-memory mirror of the sparse matrix. - typedef CrsMatrix HostMirror; + typedef CrsMatrix HostMirror; //! Type of the graph structure of the sparse matrix. typedef Kokkos::StaticCrsGraph StaticCrsGraphType; //! Type of the graph structure of the sparse matrix - consistent with Kokkos. @@ -473,13 +473,13 @@ class CrsMatrix { {} //! Copy constructor (shallow copy). - template + template KOKKOS_INLINE_FUNCTION - CrsMatrix (const CrsMatrix & B) : + CrsMatrix (const CrsMatrix & B) : graph (B.graph.entries, B.graph.row_map), values (B.values), dev_config (B.dev_config), @@ -494,14 +494,36 @@ class CrsMatrix { //as the constructor of StaticCrsGraph does not allow copy from non const version. } + //! Deep copy constructor (can cross spaces) + template + CrsMatrix (const std::string&, + const CrsMatrix& mat_) + { + typename row_map_type::non_const_type rowmap(Kokkos::ViewAllocateWithoutInitializing("rowmap"), mat_.graph.row_map.extent(0)); + index_type cols(Kokkos::ViewAllocateWithoutInitializing("cols"), mat_.nnz()); + values = values_type(Kokkos::ViewAllocateWithoutInitializing("values"), mat_.nnz()); + Kokkos::deep_copy(rowmap, mat_.graph.row_map); + Kokkos::deep_copy(cols, mat_.graph.entries); + Kokkos::deep_copy(values, mat_.values); + + numCols_ = mat_.numCols(); + graph = StaticCrsGraphType(cols, rowmap); + +#ifdef KOKKOS_USE_CUSPARSE + cusparseCreate (&cusparse_handle); + cusparseCreateMatDescr (&cusparse_descr); +#endif // KOKKOS_USE_CUSPARSE + } + /// \brief Construct with a graph that will be shared. 
/// /// Allocate the values array for subsquent fill. - CrsMatrix (const std::string& arg_label, - const staticcrsgraph_type& arg_graph) : - graph (arg_graph), - values (arg_label, arg_graph.entries.extent(0)), - numCols_ (maximum_entry (arg_graph) + 1) + template + CrsMatrix (const std::string& label, + const Kokkos::StaticCrsGraph& graph_) : + graph (graph_.entries, graph_.row_map), + values (label, graph_.entries.extent(0)), + numCols_ (maximum_entry (graph_) + 1) {} /// \brief Constructor that copies raw arrays of host data in @@ -609,11 +631,12 @@ class CrsMatrix { /// \param rows [in/out] The row map (containing the offsets to the /// data in each row). /// \param cols [in/out] The column indices. - CrsMatrix (const std::string& /* label */, + template + CrsMatrix (const std::string&, const OrdinalType& ncols, const values_type& vals, - const staticcrsgraph_type& graph_) : - graph (graph_), + const Kokkos::StaticCrsGraph& graph_) : + graph (graph_.entries, graph_.row_map), values (vals), numCols_ (ncols) { @@ -888,7 +911,6 @@ ctor_impl (const std::string &label, row_lengths[i] = rows[i + 1] - rows[i]; } - str = label; graph = Kokkos::create_staticcrsgraph (str.append (".graph"), row_lengths); typename values_type::HostMirror h_values = Kokkos::create_mirror_view (values); typename index_type::HostMirror h_entries = Kokkos::create_mirror_view (graph.entries); diff --git a/packages/kokkos-kernels/src/sparse/KokkosSparse_spgemm.hpp b/packages/kokkos-kernels/src/sparse/KokkosSparse_spgemm.hpp index 72b700040130..ef4abfc20b19 100644 --- a/packages/kokkos-kernels/src/sparse/KokkosSparse_spgemm.hpp +++ b/packages/kokkos-kernels/src/sparse/KokkosSparse_spgemm.hpp @@ -53,7 +53,6 @@ namespace KokkosSparse { template void spgemm_symbolic(KernelHandle& kh, const AMatrix& A, const bool Amode, const BMatrix& B, const bool Bmode, CMatrix& C) { - using graph_type = typename CMatrix::staticcrsgraph_type; using row_map_type = typename CMatrix::row_map_type::non_const_type; 
using entries_type = typename CMatrix::index_type::non_const_type; using values_type = typename CMatrix::values_type::non_const_type; @@ -77,8 +76,7 @@ void spgemm_symbolic(KernelHandle& kh, const AMatrix& A, const bool Amode, c_nnz_size); } - graph_type graphC(entriesC, row_mapC); - C = CMatrix("matrix", graphC); + C = CMatrix("C=AB", A.numRows(), B.numCols(), c_nnz_size, valuesC, row_mapC, entriesC); } template diff --git a/packages/kokkos-kernels/src/sparse/KokkosSparse_spmv.hpp b/packages/kokkos-kernels/src/sparse/KokkosSparse_spmv.hpp index ca83cb217bda..aca370e4760c 100644 --- a/packages/kokkos-kernels/src/sparse/KokkosSparse_spmv.hpp +++ b/packages/kokkos-kernels/src/sparse/KokkosSparse_spmv.hpp @@ -157,12 +157,23 @@ spmv (KokkosKernels::Experimental::Controls controls, #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE //cuSPARSE does not support the conjugate mode (C), and cuSPARSE 9 only supports the normal (N) mode. + if(std::is_same::value || + std::is_same::value) + { #if (9000 <= CUDA_VERSION) - useFallback = useFallback || (mode[0] != NoTranspose[0]); + useFallback = useFallback || (mode[0] != NoTranspose[0]); #endif #if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) - useFallback = useFallback || (mode[0] == Conjugate[0]); + useFallback = useFallback || (mode[0] == Conjugate[0]); +#endif + } #endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + if(std::is_same::value) + { + useFallback = useFallback || (mode[0] == Conjugate[0]); + } #endif if(useFallback) diff --git a/packages/kokkos-kernels/src/sparse/KokkosSparse_sptrsv.hpp b/packages/kokkos-kernels/src/sparse/KokkosSparse_sptrsv.hpp index aafd3655905e..2ac041201e5d 100644 --- a/packages/kokkos-kernels/src/sparse/KokkosSparse_sptrsv.hpp +++ b/packages/kokkos-kernels/src/sparse/KokkosSparse_sptrsv.hpp @@ -108,11 +108,17 @@ namespace Experimental { Kokkos::MemoryTraits > Entries_Internal; + #ifdef KK_TRISOLVE_TIMERS + Kokkos::Timer timer_sptrsv; + #endif RowMap_Internal rowmap_i = rowmap; Entries_Internal 
entries_i = entries; KokkosSparse::Impl::SPTRSV_SYMBOLIC::sptrsv_symbolic (&tmp_handle, rowmap_i, entries_i); + #ifdef KK_TRISOLVE_TIMERS + std::cout << " > sptrsv_symbolic time = " << timer_sptrsv.seconds() << std::endl; + #endif } // sptrsv_symbolic template > Values_Internal; + #ifdef KK_TRISOLVE_TIMERS + Kokkos::Timer timer_sptrsv; + #endif auto sptrsv_handle = handle->get_sptrsv_handle(); if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE) { RowMap_Internal rowmap_i = rowmap; @@ -189,7 +198,9 @@ namespace Experimental { else { KokkosSparse::Experimental::sptrsv_symbolic (handle, rowmap, entries); } - + #ifdef KK_TRISOLVE_TIMERS + std::cout << " + sptrsv_symbolic time = " << timer_sptrsv.seconds() << std::endl; + #endif } // sptrsv_symbolic template host_graph_t generate_supernodal_graph(bool col_major, graph_t &graph, int nsuper, const input_size_type *nb) { + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + double time_seconds = 0.0; + Kokkos::Timer timer; + #endif using size_type = typename graph_t::size_type; using cols_view_host_t = typename host_graph_t::entries_type::non_const_type; @@ -476,13 +480,19 @@ generate_supernodal_graph(bool col_major, graph_t &graph, int nsuper, const inpu // count non-empty supernodal blocks row_map_view_host_t hr ("rowmap_view", nsuper+1); integer_view_host_t check ("check", nsuper); + integer_view_host_t idxs ("idxs", nsuper); Kokkos::deep_copy (hr, 0); Kokkos::deep_copy (check, 0); + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + timer.reset (); + #endif int nblocks = 0; for (int s = 0; s < nsuper; s++) { int j1 = nb[s]; int j2 = j1+1; // based on the first row + + size_type nidxs = 0; for (size_type i = row_map_host (j1); i < row_map_host (j2); i++) { int s2 = map (entries_host (i)); // supernodal blocks may not be filled with zeros @@ -493,10 +503,16 @@ generate_supernodal_graph(bool col_major, graph_t &graph, int nsuper, const inpu nblocks ++; // count blocks per row for col_major hr 
(s2+1) ++; + // keep track of non-zero block ids + idxs (nidxs) = s2; + nidxs ++; } } // reset check - Kokkos::deep_copy (check, 0); + //Kokkos::deep_copy (check, 0); + for (size_type i = 0; i < nidxs; i++) { + check (idxs(i)) = 0; + } } cols_view_host_t hc ("colmap_view", nblocks); @@ -506,11 +522,18 @@ generate_supernodal_graph(bool col_major, graph_t &graph, int nsuper, const inpu hr (s+1) += hr (s); } } + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + time_seconds = timer.seconds (); + std::cout << " > Generate Supernodal Graph: count blocks : " << time_seconds << std::endl; + timer.reset (); + #endif nblocks = 0; for (int s = 0; s < nsuper; s++) { int j1 = nb[s]; int j2 = j1+1; // based on the first row + + size_type nidxs = 0; for (size_type i = row_map_host (j1); i < row_map_host (j2); i++) { int s2 = map (entries_host (i)); // supernodal blocks may not be filled with zeros @@ -525,19 +548,25 @@ generate_supernodal_graph(bool col_major, graph_t &graph, int nsuper, const inpu hc (nblocks) = s2; } nblocks ++; + // keep track of non-zero block ids + idxs (nidxs) = s2; + nidxs ++; } } if (!col_major) { hr (s+1) = nblocks; } // reset check - if (!col_major) { + /*if (!col_major) { for (size_type s2 = hr(s); s2 < hr(s+1); s2++) { check (hc(s2)) = 0; } } else { // NOTE: nonzero supernodes in s-th col are not stored Kokkos::deep_copy (check, 0); + }*/ + for (size_type i = 0; i < nidxs; i++) { + check (idxs(i)) = 0; } } // fix hr @@ -547,10 +576,21 @@ generate_supernodal_graph(bool col_major, graph_t &graph, int nsuper, const inpu } hr (0) = 0; } + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + time_seconds = timer.seconds (); + std::cout << " > Generate Supernodal Graph: compress graph : " << time_seconds + << " (col_major = " << col_major << ")" << std::endl; + timer.reset (); + #endif + // sort column ids per row for (int s = 0; s < nsuper; s++) { std::sort(&(hc (hr (s))), &(hc (hr (s+1)))); } + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + time_seconds = timer.seconds (); + 
std::cout << " > Generate Supernodal Graph: sort graph : " << time_seconds << std::endl << std::endl; + #endif host_graph_t static_graph (hc, hr); return static_graph; @@ -1018,17 +1058,32 @@ void sptrsv_supernodal_symbolic( // save the supernodal info in the handles for L/U solves handleL->set_supernodes (nsuper, supercols_view, etree); handleU->set_supernodes (nsuper, supercols_view, etree); + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + time_seconds = tic.seconds (); + std::cout << " Deep-copy graph Time: " << time_seconds << std::endl; + tic.reset (); + #endif if (handleL->get_algorithm () == SPTRSVAlgorithm::SUPERNODAL_DAG || handleL->get_algorithm () == SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG) { // generate supernodal graphs for DAG scheduling auto supL = generate_supernodal_graph (!col_majorL, graphL_host, nsuper, supercols); auto supU = generate_supernodal_graph ( col_majorU, graphU_host, nsuper, supercols); + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + time_seconds = tic.seconds (); + std::cout << " Compute Supernodal Graph Time: " << time_seconds << std::endl; + tic.reset (); + #endif auto dagL = generate_supernodal_dag (nsuper, supL, supU); auto dagU = generate_supernodal_dag (nsuper, supU, supL); handleL->set_supernodal_dag (dagL); handleU->set_supernodal_dag (dagU); + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + time_seconds = tic.seconds (); + std::cout << " Compute DAG Time: " << time_seconds << std::endl; + tic.reset (); + #endif } // =================================================================== diff --git a/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 7b91f95e0992..1061d60d9ac6 100644 --- a/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -318,12 +318,86 @@ spmv_beta_no_transpose (const KokkosKernels::Experimental::Controls& controls, typename 
YVector::const_value_type& beta, const YVector& y) { - typedef typename AMatrix::ordinal_type ordinal_type; + typedef typename AMatrix::non_const_ordinal_type ordinal_type; typedef typename AMatrix::execution_space execution_space; if (A.numRows () <= static_cast (0)) { return; } +#if defined(KOKKOS_ENABLE_SERIAL) + if(std::is_same::value) { + /// serial impl + typedef typename AMatrix::non_const_value_type value_type; + typedef typename AMatrix::non_const_size_type size_type; + + const size_type *__restrict__ row_map_ptr = A.graph.row_map.data(); + const ordinal_type *__restrict__ col_idx_ptr = A.graph.entries.data(); + const value_type *__restrict__ values_ptr = A.values.data(); + + typename YVector::non_const_value_type *__restrict__ y_ptr = y.data(); + typename XVector::const_value_type *__restrict__ x_ptr = x.data(); + + const typename YVector::non_const_value_type zero(0); + const ordinal_type nrow = A.numRows(); + if (alpha == zero) { + if (dobeta == 0) { + /// not working with kkosDev2_CUDA110_GCC92_cpp17/ + ///memset(y_ptr, 0, sizeof(typename YVector::value_type)*nrow); + for (int i=0;i::value) && @@ -418,45 +492,113 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, KokkosBlas::scal (y, beta, y); } - // Assuming that no row contains duplicate entries, NNZPerRow - // cannot be more than the number of columns of the matrix. Thus, - // the appropriate type is ordinal_type. 
- const ordinal_type NNZPerRow = A.nnz () / A.numRows (); +#if defined(KOKKOS_ENABLE_SERIAL) || defined(KOKKOS_ENABLE_OPENMP) || defined(KOKKOS_ENABLE_THREADS) + { + int impl_thread_pool_size(0); +#if defined(KOKKOS_ENABLE_SERIAL) + if (std::is_same::value) + impl_thread_pool_size = 1; +#endif +#if defined(KOKKOS_ENABLE_OPENMP) + if (std::is_same::value) + impl_thread_pool_size = Kokkos::OpenMP::impl_thread_pool_size(); +#endif +#if defined(KOKKOS_ENABLE_THREADS) + if (std::is_same::value) + impl_thread_pool_size = Kokkos::Threads::impl_thread_pool_size(); +#endif - int vector_length = 1; - bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); - int max_vector_length = 1; + if (impl_thread_pool_size == 1) { + /// serial impl + typedef typename AMatrix::non_const_value_type value_type; + typedef Kokkos::Details::ArithTraits ATV; + const size_type *__restrict__ row_map_ptr = A.graph.row_map.data(); + const ordinal_type *__restrict__ col_idx_ptr = A.graph.entries.data(); + const value_type *__restrict__ values_ptr = A.values.data(); + + typename YVector::value_type *__restrict__ y_ptr = y.data(); + typename XVector::value_type *__restrict__ x_ptr = x.data(); + + const typename YVector::non_const_value_type zero(0); + const ordinal_type nrow = A.numRows(); + if (alpha == zero) { + /// do nothing + } else { + for (int i=0;i(); + int max_vector_length = 1; #ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) - max_vector_length = 32; + if(std::is_same::value) + max_vector_length = 32; #endif #ifdef KOKKOS_ENABLE_HIP - if(std::is_same::value) - max_vector_length = 64; + if(std::is_same::value) + max_vector_length = 64; #endif - if(use_teams) { - while( (vector_length*2*3 <= NNZPerRow) && (vector_length < max_vector_length) ) - vector_length*=2; - } - - typedef SPMV_Transpose_Functor OpType; - - typename AMatrix::const_ordinal_type nrow = A.numRows(); - - OpType op (alpha, A, x, y); - - if(use_teams) { - const ordinal_type rows_per_thread = RowsPerThread 
(NNZPerRow); - const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); - const ordinal_type rows_per_team = rows_per_thread * team_size; - op.rows_per_team = rows_per_team; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< execution_space > - ( nteams , team_size , vector_length ) , op ); - } - else { - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< execution_space > - ( 0 , nrow ) , op ); + if(use_teams) { + while( (vector_length*2*3 <= NNZPerRow) && (vector_length < max_vector_length) ) + vector_length*=2; + } + + typedef SPMV_Transpose_Functor OpType; + + typename AMatrix::const_ordinal_type nrow = A.numRows(); + + OpType op (alpha, A, x, y); + + if(use_teams) { + const ordinal_type rows_per_thread = RowsPerThread (NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + op.rows_per_team = rows_per_team; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< execution_space > + ( 0 , nrow ) , op ); + } } } diff --git a/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp b/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp index 72c8a969fe87..7ac4936f5194 100644 --- a/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp +++ b/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp @@ -69,7 +69,7 @@ void spmv_raw_openmp_no_transpose(typename YVector::const_value_type& s_a, AMatr typename YVector::const_value_type zero = 
0; #pragma omp parallel { -#ifdef KOKKOS_COMPILER_INTEL +#if defined(KOKKOS_COMPILER_INTEL) && !defined(__clang__) __assume_aligned(x_ptr, 64); __assume_aligned(y_ptr, 64); #endif diff --git a/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 1f8164a3b0d0..0332b82e499d 100644 --- a/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -2656,6 +2656,10 @@ cudaProfilerStop(); size_type node_count = 0; + #ifdef profile_supernodal_etree + Kokkos::Timer sptrsv_timer; + sptrsv_timer.reset(); + #endif for ( size_type lvl = 0; lvl < nlevels; ++lvl ) { { size_type lvl_nodes = hnodes_per_level(lvl); @@ -2716,7 +2720,6 @@ cudaProfilerStart(); thandle.get_algorithm () == SPTRSVAlgorithm::SUPERNODAL_ETREE || thandle.get_algorithm () == SPTRSVAlgorithm::SUPERNODAL_DAG) { - //#define profile_supernodal_etree #ifdef profile_supernodal_etree size_t flops = 0; Kokkos::Timer timer; @@ -2884,6 +2887,13 @@ cudaProfilerStop(); } // scope for if-block } // end for lvl + #ifdef profile_supernodal_etree + Kokkos::fence(); + double sptrsv_time_seconds = sptrsv_timer.seconds (); + std::cout << " + Execution space : " << execution_space::name () << std::endl; + std::cout << " + Memory space : " << memory_space::name () << std::endl; + std::cout << " + SpTrsv(lower) time: " << sptrsv_time_seconds << std::endl << std::endl; + #endif } // end lower_tri_solve @@ -2954,6 +2964,10 @@ cudaProfilerStop(); size_type node_count = 0; // This must stay serial; would be nice to try out Cuda's graph stuff to reduce kernel launch overhead + #ifdef profile_supernodal_etree + Kokkos::Timer sptrsv_timer; + sptrsv_timer.reset(); + #endif for ( size_type lvl = 0; lvl < nlevels; ++lvl ) { size_type lvl_nodes = hnodes_per_level(lvl); @@ -3279,6 +3293,13 @@ cudaProfilerStop(); #endif } // end if } // end 
for lvl + #ifdef profile_supernodal_etree + Kokkos::fence(); + double sptrsv_time_seconds = sptrsv_timer.seconds (); + std::cout << " + SpTrsv(uppper) time: " << sptrsv_time_seconds << std::endl << std::endl; + std::cout <<" + Execution space : " << execution_space::name () << std::endl; + std::cout << " + Memory space : " << memory_space::name () << std::endl; + #endif } // end upper_tri_solve diff --git a/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp b/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp index 19694063f06a..45ebfe9e0086 100644 --- a/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp +++ b/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp @@ -167,6 +167,7 @@ template < class TriSolveHandle, class RowMapType, class EntriesType > void lower_tri_symbolic (TriSolveHandle &thandle, const RowMapType drow_map, const EntriesType dentries) { #ifdef TRISOLVE_SYMB_TIMERS Kokkos::Timer timer_sym_lowertri_total; + Kokkos::Timer timer; #endif using namespace KokkosSparse::Experimental; @@ -397,6 +398,23 @@ void lower_tri_symbolic (TriSolveHandle &thandle, const RowMapType drow_map, con work_offset_host (s) = 0; } } else { + //#define profile_supernodal_etree + #ifdef profile_supernodal_etree + // min, max, tot size of supernodes + signed_integral_t max_nsrow = 0; + signed_integral_t min_nsrow = 0; + signed_integral_t tot_nsrow = 0; + + signed_integral_t max_nscol = 0; + signed_integral_t min_nscol = 0; + signed_integral_t tot_nscol = 0; + + // min, max, tot num of leaves + signed_integral_t max_nleave = 0; + signed_integral_t min_nleave = 0; + signed_integral_t tot_nleave = 0; + #endif + /* initialize the ready tasks with leaves */ const int *parents = thandle.get_etree_parents (); integer_view_host_t check ("check", nsuper); @@ -421,22 +439,6 @@ void lower_tri_symbolic (TriSolveHandle &thandle, const RowMapType drow_map, con size_type num_done = 0; 
size_type level = 0; - //#define profile_supernodal_etree - #ifdef profile_supernodal_etree - // min, max, tot size of supernodes - signed_integral_t max_nsrow = 0; - signed_integral_t min_nsrow = 0; - signed_integral_t tot_nsrow = 0; - - signed_integral_t max_nscol = 0; - signed_integral_t min_nscol = 0; - signed_integral_t tot_nscol = 0; - - // min, max, tot num of leaves - signed_integral_t max_nleave = 0; - signed_integral_t min_nleave = 0; - signed_integral_t tot_nleave = 0; - #endif while (num_done < nsuper) { nodes_per_level (level) = 0; // look for ready-tasks @@ -564,9 +566,15 @@ void lower_tri_symbolic (TriSolveHandle &thandle, const RowMapType drow_map, con std::cout << " * numer of leaves: min = " << min_nleave << "\t max = " << max_nleave << "\t avg = " << tot_nleave/level << std::endl; std::cout << " * level = " << level << std::endl; #endif + #ifdef TRISOLVE_SYMB_TIMERS + std::cout << " + scheduling time = " << timer.seconds() << std::endl; + #endif // Set number of level equal to be the number of supernodal columns thandle.set_num_levels (level); } + #ifdef TRISOLVE_SYMB_TIMERS + timer.reset(); + #endif // workspace size if (thandle.get_algorithm () == SPTRSVAlgorithm::SUPERNODAL_SPMV || thandle.get_algorithm () == SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG) { @@ -590,6 +598,10 @@ void lower_tri_symbolic (TriSolveHandle &thandle, const RowMapType drow_map, con Kokkos::deep_copy (dnodes_per_level, nodes_per_level); Kokkos::deep_copy (dlevel_list, level_list); + #ifdef TRISOLVE_SYMB_TIMERS + std::cout << " + workspace time = " << timer.seconds() << std::endl; + #endif + thandle.set_symbolic_complete(); } #endif @@ -604,6 +616,7 @@ template < class TriSolveHandle, class RowMapType, class EntriesType > void upper_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, const EntriesType dentries ) { #ifdef TRISOLVE_SYMB_TIMERS Kokkos::Timer timer_sym_uppertri_total; + Kokkos::Timer timer; #endif using namespace KokkosSparse::Experimental; @@ -826,6 
+839,21 @@ void upper_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co else { /* schduling from bottom to top (as for L-solve) * * then reverse it for U-solve */ + #ifdef profile_supernodal_etree + // min, max, tot size of supernodes + signed_integral_t max_nsrow = 0; + signed_integral_t min_nsrow = 0; + signed_integral_t tot_nsrow = 0; + + signed_integral_t max_nscol = 0; + signed_integral_t min_nscol = 0; + signed_integral_t tot_nscol = 0; + + // min, max, tot num of leaves + signed_integral_t max_nleave = 0; + signed_integral_t min_nleave = 0; + signed_integral_t tot_nleave = 0; + #endif /* initialize the ready tasks with leaves */ const int *parents = thandle.get_etree_parents (); @@ -860,21 +888,6 @@ void upper_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co size_type num_done = 0; size_type level = 0; - #ifdef profile_supernodal_etree - // min, max, tot size of supernodes - signed_integral_t max_nsrow = 0; - signed_integral_t min_nsrow = 0; - signed_integral_t tot_nsrow = 0; - - signed_integral_t max_nscol = 0; - signed_integral_t min_nscol = 0; - signed_integral_t tot_nscol = 0; - - // min, max, tot num of leaves - signed_integral_t max_nleave = 0; - signed_integral_t min_nleave = 0; - signed_integral_t tot_nleave = 0; - #endif while (num_done < nsuper) { nodes_per_level (level) = 0; // look for ready-tasks @@ -1013,10 +1026,16 @@ void upper_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co diag_kernel_type_by_level (level) = 3; } } + #ifdef TRISOLVE_SYMB_TIMERS + std::cout << " + scheduling time = " << timer.seconds() << std::endl; + #endif // Set number of levels thandle.set_num_levels (num_level); } + #ifdef TRISOLVE_SYMB_TIMERS + timer.reset(); + #endif // workspace size if (thandle.get_algorithm () == SPTRSVAlgorithm::SUPERNODAL_SPMV || thandle.get_algorithm () == SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG) { @@ -1039,6 +1058,9 @@ void upper_tri_symbolic ( TriSolveHandle &thandle, const RowMapType 
drow_map, co Kokkos::deep_copy (dnodes_grouped_by_level, nodes_grouped_by_level); Kokkos::deep_copy (dnodes_per_level, nodes_per_level); Kokkos::deep_copy (dlevel_list, level_list); + #ifdef TRISOLVE_SYMB_TIMERS + std::cout << " + workspace time = " << timer.seconds() << std::endl; + #endif thandle.set_symbolic_complete (); } diff --git a/packages/kokkos-kernels/test_common/KokkosKernels_MatrixConverter.cpp b/packages/kokkos-kernels/test_common/KokkosKernels_MatrixConverter.cpp index 3f3fe11bae97..41fb5ebc2c7f 100644 --- a/packages/kokkos-kernels/test_common/KokkosKernels_MatrixConverter.cpp +++ b/packages/kokkos-kernels/test_common/KokkosKernels_MatrixConverter.cpp @@ -53,8 +53,6 @@ int main (int argc, char* argv[]){ typedef int size_type; typedef int idx; typedef double wt; - - Kokkos::initialize(argc,argv); bool symmetrize = false, remove_diagonal = false, transpose = false; char *in_mtx = NULL, *out_bin = NULL; @@ -92,204 +90,208 @@ int main (int argc, char* argv[]){ exit(1); } - typedef Kokkos::DefaultHostExecutionSpace MyExecSpace; - typedef typename KokkosSparse::CrsMatrix crstmat_t; - typedef typename crstmat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type row_map_view_t; - typedef typename graph_t::entries_type::non_const_type cols_view_t; - typedef typename crstmat_t::values_type::non_const_type values_view_t; + Kokkos::initialize(argc,argv); + { + + typedef Kokkos::DefaultHostExecutionSpace MyExecSpace; + + typedef typename KokkosSparse::CrsMatrix crstmat_t; + typedef typename crstmat_t::StaticCrsGraphType graph_t; + typedef typename graph_t::row_map_type::non_const_type row_map_view_t; + typedef typename graph_t::entries_type::non_const_type cols_view_t; + typedef typename crstmat_t::values_type::non_const_type values_view_t; - typedef typename graph_t::row_map_type::const_type c_row_map_view_t; - typedef typename graph_t::entries_type::const_type c_cols_view_t; - typedef typename 
crstmat_t::values_type::const_type c_values_view_t; + typedef typename graph_t::row_map_type::const_type c_row_map_view_t; + typedef typename graph_t::entries_type::const_type c_cols_view_t; + typedef typename crstmat_t::values_type::const_type c_values_view_t; - crstmat_t a_crsmat = KokkosKernels::Impl::read_kokkos_crst_matrix(in_mtx); + crstmat_t a_crsmat = KokkosKernels::Impl::read_kokkos_crst_matrix(in_mtx); - c_row_map_view_t orm = a_crsmat.graph.row_map; - c_cols_view_t oentries = a_crsmat.graph.entries; - c_values_view_t ovalues = a_crsmat.values; + c_row_map_view_t orm = a_crsmat.graph.row_map; + c_cols_view_t oentries = a_crsmat.graph.entries; + c_values_view_t ovalues = a_crsmat.values; - const size_type *prm = orm.data(); - const idx *pentries = oentries.data(); - const wt *pvals = ovalues.data(); + const size_type *prm = orm.data(); + const idx *pentries = oentries.data(); + const wt *pvals = ovalues.data(); - idx numrows = a_crsmat.numRows(); - //idx numcols = a_crsmat.numCols(); - idx nnz = ovalues.extent(0); - std::cout << "numrows :" << numrows << " nnz:" << nnz << std::endl; - //Kokkos::deep_copy(new_rowmap, a_crsmat.graph.row_map); + idx numrows = a_crsmat.numRows(); + //idx numcols = a_crsmat.numCols(); + idx nnz = ovalues.extent(0); + std::cout << "numrows :" << numrows << " nnz:" << nnz << std::endl; + //Kokkos::deep_copy(new_rowmap, a_crsmat.graph.row_map); - if (remove_diagonal) { - std::vector nrm(numrows + 1, 0); - std::vector nentries(nnz + 1); - std::vector nvals(nnz + 1); + if (remove_diagonal) { + std::vector nrm(numrows + 1, 0); + std::vector nentries(nnz + 1); + std::vector nvals(nnz + 1); - for (idx i = 0; i < numrows; ++i){ + for (idx i = 0; i < numrows; ++i){ - size_type begin = prm[i]; - size_type end = prm[i+1]; - for (size_type j = begin; j < end; ++ j){ - idx col = pentries[j]; - //wt val = pvals[j]; + size_type begin = prm[i]; + size_type end = prm[i+1]; + for (size_type j = begin; j < end; ++ j){ + idx col = pentries[j]; + 
//wt val = pvals[j]; - if (i == col){ - nrm[i] = 1; - break; - } + if (i == col){ + nrm[i] = 1; + break; + } + } } - } - size_type prefix = 0; - for (idx i = 0; i <= numrows; ++i){ - size_type current = nrm[i]; - nrm[i] = prefix; - prefix += current; + size_type prefix = 0; + for (idx i = 0; i <= numrows; ++i){ + size_type current = nrm[i]; + nrm[i] = prefix; + prefix += current; - } + } - for (idx i = 0; i <= numrows; ++i){ - nrm[i] = prm[i] - nrm[i]; - } + for (idx i = 0; i <= numrows; ++i){ + nrm[i] = prm[i] - nrm[i]; + } - for (idx i = 0; i < numrows; ++i){ + for (idx i = 0; i < numrows; ++i){ - size_type begin = prm[i]; - size_type end = prm[i+1]; + size_type begin = prm[i]; + size_type end = prm[i+1]; - size_type obegin = nrm[i]; + size_type obegin = nrm[i]; - for (size_type j = begin; j < end; ++ j){ - idx col = pentries[j]; - wt val = pvals[j]; - if (i != col){ - nentries[obegin] = col; - nvals[obegin++] = val; - } - } - if (obegin != nrm[i+1]){ - std::cout << "i:" << i << " nrm[i+1]:" << nrm[i+1] << " obegin:" << obegin << std::endl; - exit(1); + for (size_type j = begin; j < end; ++ j){ + idx col = pentries[j]; + wt val = pvals[j]; + if (i != col){ + nentries[obegin] = col; + nvals[obegin++] = val; + } + } + if (obegin != nrm[i+1]){ + std::cout << "i:" << i << " nrm[i+1]:" << nrm[i+1] << " obegin:" << obegin << std::endl; + exit(1); + } } - } - row_map_view_t new_rowmap ("new rowmap", numrows + 1); + row_map_view_t new_rowmap ("new rowmap", numrows + 1); - cols_view_t new_entries("new colmap", nrm[numrows]); - values_view_t new_values("new values", nrm[numrows ]); + cols_view_t new_entries("new colmap", nrm[numrows]); + values_view_t new_values("new values", nrm[numrows ]); - for (idx i = 0; i <= numrows; ++i){ - new_rowmap(i) = nrm[i]; - } - - for (size_type i = 0; i < nrm[numrows ]; ++i){ - new_entries(i) = nentries[i]; - new_values(i) = nvals[i]; - } + for (idx i = 0; i <= numrows; ++i){ + new_rowmap(i) = nrm[i]; + } - graph_t 
transpose_graph(new_entries, new_rowmap); - crstmat_t transpose_matrix("transpose", numrows, new_values, transpose_graph); - a_crsmat = transpose_matrix; + for (size_type i = 0; i < nrm[numrows ]; ++i){ + new_entries(i) = nentries[i]; + new_values(i) = nvals[i]; + } + graph_t transpose_graph(new_entries, new_rowmap); + crstmat_t transpose_matrix("transpose", numrows, new_values, transpose_graph); + a_crsmat = transpose_matrix; - orm = a_crsmat.graph.row_map; - oentries = a_crsmat.graph.entries; - ovalues = a_crsmat.values; - prm = orm.data(); - pentries = oentries.data(); - pvals = ovalues.data(); + orm = a_crsmat.graph.row_map; + oentries = a_crsmat.graph.entries; + ovalues = a_crsmat.values; - numrows = a_crsmat.numRows(); - //numcols = a_crsmat.numCols(); - nnz = ovalues.extent(0); - } + prm = orm.data(); + pentries = oentries.data(); + pvals = ovalues.data(); - if (symmetrize) { + numrows = a_crsmat.numRows(); + //numcols = a_crsmat.numCols(); + nnz = ovalues.extent(0); + } - row_map_view_t new_rowmap; - cols_view_t new_entries; + if (symmetrize) { - KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap - - (numrows, orm, oentries, new_rowmap, new_entries); - values_view_t new_values("new_values",new_entries.extent(0)); + row_map_view_t new_rowmap; + cols_view_t new_entries; - cols_view_t out_adj ("out_adj", new_entries.extent(0)); - values_view_t out_vals("out_vals", new_entries.extent(0)); + KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap + + (numrows, orm, oentries, new_rowmap, new_entries); + values_view_t new_values("new_values",new_entries.extent(0)); - KokkosKernels::Impl::kk_sort_graph - (new_rowmap, new_entries, new_values, out_adj, out_vals); - new_entries = out_adj; - new_values = out_vals; + cols_view_t out_adj ("out_adj", new_entries.extent(0)); + values_view_t out_vals("out_vals", new_entries.extent(0)); - graph_t symmetric_graph(new_entries, new_rowmap); - crstmat_t symmetric_marix("transpose", numrows, new_values, symmetric_graph); - 
a_crsmat = symmetric_marix; + KokkosKernels::Impl::kk_sort_graph + (new_rowmap, new_entries, new_values, out_adj, out_vals); + new_entries = out_adj; + new_values = out_vals; - orm = a_crsmat.graph.row_map; - oentries = a_crsmat.graph.entries; - ovalues = a_crsmat.values; + graph_t symmetric_graph(new_entries, new_rowmap); + crstmat_t symmetric_marix("transpose", numrows, new_values, symmetric_graph); + a_crsmat = symmetric_marix; - prm = orm.data(); - pentries = oentries.data(); - pvals = ovalues.data(); + orm = a_crsmat.graph.row_map; + oentries = a_crsmat.graph.entries; + ovalues = a_crsmat.values; - numrows = a_crsmat.numRows(); - //numcols = a_crsmat.numCols(); - nnz = ovalues.extent(0); - } - if (transpose) { - row_map_view_t new_rowmap ("new_rowmap", a_crsmat.numCols() + 1); - cols_view_t new_entries ("new_rowmap", a_crsmat.nnz()); - values_view_t new_values ("new_rowmap", a_crsmat.nnz()); - - KokkosKernels::Impl::transpose_matrix< - c_row_map_view_t, c_cols_view_t, c_values_view_t, - row_map_view_t, cols_view_t, values_view_t, row_map_view_t, MyExecSpace>( - a_crsmat.numRows(), a_crsmat.numCols(), - a_crsmat.graph.row_map, a_crsmat.graph.entries, a_crsmat.values, - new_rowmap, new_entries, new_values); - - std::cout << 1 << std::endl; - cols_view_t out_adj ("out_adj", new_entries.extent(0)); - values_view_t out_vals("out_vals", new_entries.extent(0)); - std::cout << 2 << std::endl; - KokkosKernels::Impl::kk_sort_graph - (new_rowmap, new_entries, new_values, out_adj, out_vals); - new_entries = out_adj; - new_values = out_vals; - std::cout << 3 << std::endl; - MyExecSpace().fence(); - KokkosKernels::Impl::kk_print_1Dview(out_adj); - KokkosKernels::Impl::kk_print_1Dview(out_vals); - - graph_t transpose_graph(new_entries, new_rowmap); - crstmat_t transpose_matrix("transpose", a_crsmat.numRows(), new_values, transpose_graph); - a_crsmat = transpose_matrix; - - orm = a_crsmat.graph.row_map; - oentries = a_crsmat.graph.entries; - ovalues = a_crsmat.values; - - prm 
= orm.data(); - pentries = oentries.data(); - pvals = ovalues.data(); - - numrows = a_crsmat.numRows(); - //numcols = a_crsmat.numCols(); - nnz = ovalues.extent(0); - } + prm = orm.data(); + pentries = oentries.data(); + pvals = ovalues.data(); + numrows = a_crsmat.numRows(); + //numcols = a_crsmat.numCols(); + nnz = ovalues.extent(0); + } + if (transpose) { + row_map_view_t new_rowmap ("new_rowmap", a_crsmat.numCols() + 1); + cols_view_t new_entries ("new_rowmap", a_crsmat.nnz()); + values_view_t new_values ("new_rowmap", a_crsmat.nnz()); + + KokkosKernels::Impl::transpose_matrix< + c_row_map_view_t, c_cols_view_t, c_values_view_t, + row_map_view_t, cols_view_t, values_view_t, row_map_view_t, MyExecSpace>( + a_crsmat.numRows(), a_crsmat.numCols(), + a_crsmat.graph.row_map, a_crsmat.graph.entries, a_crsmat.values, + new_rowmap, new_entries, new_values); + + std::cout << 1 << std::endl; + cols_view_t out_adj ("out_adj", new_entries.extent(0)); + values_view_t out_vals("out_vals", new_entries.extent(0)); + std::cout << 2 << std::endl; + KokkosKernels::Impl::kk_sort_graph + (new_rowmap, new_entries, new_values, out_adj, out_vals); + new_entries = out_adj; + new_values = out_vals; + std::cout << 3 << std::endl; + MyExecSpace().fence(); + KokkosKernels::Impl::kk_print_1Dview(out_adj); + KokkosKernels::Impl::kk_print_1Dview(out_vals); + + graph_t transpose_graph(new_entries, new_rowmap); + crstmat_t transpose_matrix("transpose", a_crsmat.numRows(), new_values, transpose_graph); + a_crsmat = transpose_matrix; + + orm = a_crsmat.graph.row_map; + oentries = a_crsmat.graph.entries; + ovalues = a_crsmat.values; + + prm = orm.data(); + pentries = oentries.data(); + pvals = ovalues.data(); + + numrows = a_crsmat.numRows(); + //numcols = a_crsmat.numCols(); + nnz = ovalues.extent(0); + } - KokkosKernels::Impl::write_kokkos_crst_matrix (a_crsmat, out_bin); + KokkosKernels::Impl::write_kokkos_crst_matrix (a_crsmat, out_bin); + } Kokkos::finalize(); diff --git 
a/packages/kokkos-kernels/test_common/KokkosKernels_TestUtils.hpp b/packages/kokkos-kernels/test_common/KokkosKernels_TestUtils.hpp index bf86768d1690..f3a34ba12316 100644 --- a/packages/kokkos-kernels/test_common/KokkosKernels_TestUtils.hpp +++ b/packages/kokkos-kernels/test_common/KokkosKernels_TestUtils.hpp @@ -46,6 +46,8 @@ #define KOKKOSKERNELS_TEST_UTILS_HPP #include "KokkosKernels_Utils.hpp" +#include "Kokkos_ArithTraits.hpp" + namespace Test { template::value> struct multivector_layout_adapter; @@ -83,16 +85,15 @@ namespace Test { template void EXPECT_NEAR_KK(Scalar1 val1, Scalar2 val2, Scalar3 tol) { typedef Kokkos::Details::ArithTraits AT1; - typedef Kokkos::Details::ArithTraits AT2; typedef Kokkos::Details::ArithTraits AT3; - EXPECT_NEAR(double(AT1::abs(val1)),double(AT2::abs(val2)),double(AT3::abs(tol))); + EXPECT_LE((double) AT1::abs(val1 - val2), (double) AT3::abs(tol)); } template void EXPECT_NEAR_KK_1DVIEW(ViewType1 v1, ViewType2 v2, Scalar tol) { size_t v1_size = v1.extent(0); size_t v2_size = v2.extent(0); - EXPECT_NEAR_KK(v1_size, v2_size, 0); + EXPECT_EQ(v1_size, v2_size); typename ViewType1::HostMirror h_v1 = Kokkos::create_mirror_view(v1); @@ -121,6 +122,8 @@ namespace Test { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; + typedef Kokkos::View SubviewTypeA; + typedef Kokkos::View SubviewTypeB; typedef Kokkos::Details::ArithTraits APT; typedef typename APT::mag_type mag_type; ScalarA alpha; @@ -130,11 +133,19 @@ namespace Test { void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { Kokkos::parallel_for(Kokkos::TeamThreadRange(team,C_rows), [&] (const int& i) { // Give each kokkos thread a vector of A - auto a_vec = A_t ? 
Kokkos::subview(A, Kokkos::ALL(), i) : Kokkos::subview(A, i, Kokkos::ALL()); + SubviewTypeA a_vec; + if(A_t) + a_vec = Kokkos::subview(A, Kokkos::ALL(), i); + else + a_vec = Kokkos::subview(A, i, Kokkos::ALL()); // Have all vector lanes perform the dot product Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,C_cols), [&] (const int& j) { - auto b_vec = B_t ? Kokkos::subview(B, j, Kokkos::ALL()) : Kokkos::subview(B, Kokkos::ALL(), j); + SubviewTypeB b_vec; + if(B_t) + b_vec = Kokkos::subview(B, j, Kokkos::ALL()); + else + b_vec = Kokkos::subview(B, Kokkos::ALL(), j); ScalarC ab = ScalarC(0); for (int k = 0; k < A_cols; k++) { auto a = A_c ? APT::conj(a_vec(k)) : a_vec(k); @@ -149,7 +160,7 @@ namespace Test { // C(i,:,:) = alpha * (A(i,:,:) * B(i,:,:)) + beta * C(i,:,:) template struct Functor_BatchedVanillaGEMM { - bool A_t, B_t, A_c, B_c; + bool A_t, B_t, A_c, B_c, batch_size_last_dim = false; ViewTypeA A; ViewTypeB B; ViewTypeC C; @@ -157,25 +168,35 @@ namespace Test { using ScalarA = typename ViewTypeA::value_type; using ScalarB = typename ViewTypeB::value_type; using ScalarC = typename ViewTypeC::value_type; + using SubviewTypeA = typename Kokkos::View; + using SubviewTypeB = typename Kokkos::View; + using SubviewTypeC = typename Kokkos::View; + ScalarA alpha; ScalarC beta; KOKKOS_INLINE_FUNCTION void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { int i = team.league_rank(); + SubviewTypeA _A; + SubviewTypeB _B; + SubviewTypeC _C; - auto _A = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto _B = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL()); - auto _C = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL()); - using SubviewTypeA = decltype(_A); - using SubviewTypeB = decltype(_B); - using SubviewTypeC = decltype(_C); + if (batch_size_last_dim) { + _A = Kokkos::subview(A, Kokkos::ALL(), Kokkos::ALL(), i); + _B = Kokkos::subview(B, Kokkos::ALL(), Kokkos::ALL(), i); + _C = Kokkos::subview(C, Kokkos::ALL(), Kokkos::ALL(), 
i); + } else { + _A = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + _B = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL()); + _C = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL()); + } struct SharedVanillaGEMM vgemm; vgemm.A_t = A_t; vgemm.B_t = B_t; vgemm.A_c = A_c; vgemm.B_c = B_c; - vgemm.C_rows = C.extent(1); - vgemm.C_cols = C.extent(2); - vgemm.A_cols = A_t?A.extent(1):A.extent(2); + vgemm.C_rows = batch_size_last_dim ? C.extent(0) : C.extent(1); + vgemm.C_cols = batch_size_last_dim ? C.extent(1) : C.extent(2); + vgemm.A_cols = batch_size_last_dim ? (A_t?A.extent(0):A.extent(1)) : (A_t?A.extent(1):A.extent(2)); vgemm.A = _A; vgemm.B = _B; vgemm.C = _C; @@ -188,9 +209,48 @@ namespace Test { void run() { Kokkos::parallel_for( "Test::VanillaGEMM", - Kokkos::TeamPolicy(C.extent(0), Kokkos::AUTO, 16), + Kokkos::TeamPolicy(batch_size_last_dim ? C.extent(2) : C.extent(0), Kokkos::AUTO, 16), *this); } }; + + template + class epsilon { + public: + constexpr static double value = std::numeric_limits::epsilon(); + }; + + // explicit epsilon specializations + #if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT + template<> + class epsilon { + public: + constexpr static double value = 0.0009765625F; + }; + #endif // KOKKOS_HALF_T_IS_FLOAT + + //Get the interval for Kokkos::fill_random + //For real, interval is (-mag, mag) + //For complex, both real and imaginary parts will have interval (-mag, mag) + template + inline void getRandomBounds(double mag, Scalar& start, Scalar& end) + { + start = -mag * Kokkos::ArithTraits::one(); + end = mag * Kokkos::ArithTraits::one(); + } + + template<> + inline void getRandomBounds(double mag, Kokkos::complex& start, Kokkos::complex& end) + { + start = Kokkos::complex(-mag, -mag); + end = Kokkos::complex(mag, mag); + } + + template<> + inline void getRandomBounds(double mag, Kokkos::complex& start, Kokkos::complex& end) + { + start = Kokkos::complex(-mag, -mag); + end = Kokkos::complex(mag, mag); + } } #endif diff 
--git a/packages/kokkos-kernels/test_common/Test_Common_ArithTraits.hpp b/packages/kokkos-kernels/test_common/Test_Common_ArithTraits.hpp index bba54ff6f076..4fab021e66e2 100644 --- a/packages/kokkos-kernels/test_common/Test_Common_ArithTraits.hpp +++ b/packages/kokkos-kernels/test_common/Test_Common_ArithTraits.hpp @@ -1634,7 +1634,8 @@ int runAllArithTraitsHostTests (std::ostream& out, const int verbose) success = success && curSuccess; curSuccess = testArithTraitsOnHost (out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnHost (out, verbose); -#if !defined( KOKKOS_ENABLE_CUDA ) && !defined( KOKKOS_ENABLE_HIP ) +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \ + !defined(KOKKOS_ENABLE_SYCL) // This would spill tons of warnings about host device stuff otherwise success = success && curSuccess; curSuccess = testArithTraitsOnHost (out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnHost, DeviceType> (out, verbose); diff --git a/packages/kokkos-kernels/unit_test/batched/Test_Batched_SerialEigendecomposition.hpp b/packages/kokkos-kernels/unit_test/batched/Test_Batched_SerialEigendecomposition.hpp index 6ee4ebef1818..b63998b75fe5 100644 --- a/packages/kokkos-kernels/unit_test/batched/Test_Batched_SerialEigendecomposition.hpp +++ b/packages/kokkos-kernels/unit_test/batched/Test_Batched_SerialEigendecomposition.hpp @@ -1,5 +1,6 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) +/* #include "gtest/gtest.h" #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" @@ -111,3 +112,4 @@ int test_batched_serial_eigendecomposition() { return 0; } +*/ diff --git a/packages/kokkos-kernels/unit_test/batched/Test_Batched_SerialEigendecomposition_Real.hpp b/packages/kokkos-kernels/unit_test/batched/Test_Batched_SerialEigendecomposition_Real.hpp index 7108f56bbb27..344438e7193e 100644 --- a/packages/kokkos-kernels/unit_test/batched/Test_Batched_SerialEigendecomposition_Real.hpp +++ 
b/packages/kokkos-kernels/unit_test/batched/Test_Batched_SerialEigendecomposition_Real.hpp @@ -1,3 +1,4 @@ +/* #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F( TestCategory, batched_scalar_serial_eigendecomposition_float ) { test_batched_serial_eigendecomposition(); @@ -9,5 +10,5 @@ TEST_F( TestCategory, batched_scalar_serial_eigendecomposition_double ) { test_batched_serial_eigendecomposition(); } #endif - +*/ diff --git a/packages/kokkos-kernels/unit_test/batched/Test_Batched_TeamVectorEigendecomposition.hpp b/packages/kokkos-kernels/unit_test/batched/Test_Batched_TeamVectorEigendecomposition.hpp index 9dd2a6b04818..a02c701acdbe 100644 --- a/packages/kokkos-kernels/unit_test/batched/Test_Batched_TeamVectorEigendecomposition.hpp +++ b/packages/kokkos-kernels/unit_test/batched/Test_Batched_TeamVectorEigendecomposition.hpp @@ -1,5 +1,6 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) +/* #include "gtest/gtest.h" #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" @@ -114,3 +115,4 @@ int test_batched_teamvector_eigendecomposition() { return 0; } +*/ diff --git a/packages/kokkos-kernels/unit_test/batched/Test_Batched_TeamVectorEigendecomposition_Real.hpp b/packages/kokkos-kernels/unit_test/batched/Test_Batched_TeamVectorEigendecomposition_Real.hpp index 14b3c61f4de5..b4646c30271f 100644 --- a/packages/kokkos-kernels/unit_test/batched/Test_Batched_TeamVectorEigendecomposition_Real.hpp +++ b/packages/kokkos-kernels/unit_test/batched/Test_Batched_TeamVectorEigendecomposition_Real.hpp @@ -1,3 +1,4 @@ +/* #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F( TestCategory, batched_scalar_teamvector_eigendecomposition_float ) { test_batched_teamvector_eigendecomposition(); @@ -9,5 +10,4 @@ TEST_F( TestCategory, batched_scalar_teamvector_eigendecomposition_double ) { test_batched_teamvector_eigendecomposition(); } #endif - - +*/ diff --git a/packages/kokkos-kernels/unit_test/blas/Test_Blas1_abs.hpp b/packages/kokkos-kernels/unit_test/blas/Test_Blas1_abs.hpp index 
acdb167d1d9a..d1cb36d3684f 100644 --- a/packages/kokkos-kernels/unit_test/blas/Test_Blas1_abs.hpp +++ b/packages/kokkos-kernels/unit_test/blas/Test_Blas1_abs.hpp @@ -2,7 +2,6 @@ #include #include #include -#include #include namespace Test { @@ -23,7 +22,7 @@ namespace Test { Kokkos::LayoutRight, Kokkos::LayoutLeft>::type,Device> BaseTypeB; - double eps = std::is_same::value?2*1e-5:1e-7; + typename AT::mag_type eps = AT::epsilon()*10; BaseTypeA b_x("X",N); BaseTypeB b_y("Y",N); @@ -42,29 +41,38 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(1)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(1)); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); Kokkos::deep_copy(h_b_x,b_x); Kokkos::deep_copy(h_b_y,b_y); - ScalarA expected_result(0); - for(int i=0;i rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(1)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(1)); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); Kokkos::deep_copy(h_b_x,b_x); - Kokkos::deep_copy(h_b_y,b_y); typename ViewTypeA::const_type c_x = x; - ScalarA* expected_result = new ScalarA[K]; - for(int j=0;j r("Dot::Result",K); + typename AT::mag_type eps = AT::epsilon()*10; + //Test and verify non-const input KokkosBlas::abs(y,x); - KokkosBlas::dot(r,y,y); - for(int k=0;k AT; + typedef Kokkos::ArithTraits MAT; typedef Kokkos::View rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - 
Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); + Kokkos::deep_copy(h_b_a,b_a); typename ViewTypeA::const_type c_a = a; @@ -36,7 +39,13 @@ namespace Test { typename AT::mag_type expected_result = 0; for(int i=0;i::imag is 0 if T is real. + expected_result += MAT::abs(AT::real(h_a(i))) + MAT::abs(AT::imag(h_a(i))); + } typename AT::mag_type nonconst_result = KokkosBlas::asum(a); EXPECT_NEAR_KK( nonconst_result, expected_result, eps*expected_result); diff --git a/packages/kokkos-kernels/unit_test/blas/Test_Blas1_axpby.hpp b/packages/kokkos-kernels/unit_test/blas/Test_Blas1_axpby.hpp index f2bc692d09eb..84943b1bc7a0 100644 --- a/packages/kokkos-kernels/unit_test/blas/Test_Blas1_axpby.hpp +++ b/packages/kokkos-kernels/unit_test/blas/Test_Blas1_axpby.hpp @@ -31,6 +31,7 @@ namespace Test { BaseTypeB b_org_y("Org_Y",N); + auto h_b_org_y = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_y); ViewTypeA x = Kokkos::subview(b_x,Kokkos::ALL(),0); ViewTypeB y = Kokkos::subview(b_y,Kokkos::ALL(),0); typename ViewTypeA::const_type c_x = x; @@ -44,26 +45,38 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); + Kokkos::deep_copy(h_b_org_y, b_org_y); Kokkos::deep_copy(h_b_x,b_x); - Kokkos::deep_copy(h_b_y,b_y); - - ScalarA expected_result = 0; - for(int i=0;i @@ -93,10 +106,19 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(10)); - 
Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); + auto h_b_org_y = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_y); Kokkos::deep_copy(h_b_x,b_x); Kokkos::deep_copy(h_b_y,b_y); @@ -105,36 +127,32 @@ namespace Test { ScalarB b = 5; typename ViewTypeA::const_type c_x = x; - ScalarA* expected_result = new ScalarA[K]; - for(int j=0;j::value?2*1e-5:1e-7; Kokkos::View r("Dot::Result",K); - typedef Kokkos::Details::ArithTraits AT; - KokkosBlas::axpby(a,x,b,y); - KokkosBlas::dot(r,y,y); - for(int k=0;k::value, Kokkos::LayoutRight, Kokkos::LayoutLeft>::type,Device> BaseTypeB; + using MagnitudeA = typename Kokkos::ArithTraits::mag_type; ScalarA a = 3; - double eps = std::is_same::value?2*1e-5:1e-7; + double eps = std::is_same::value?2e-5:1e-7; BaseTypeA b_x("X",N); BaseTypeB b_y("Y",N); BaseTypeB b_org_y("Org_Y",N); - ViewTypeA x = Kokkos::subview(b_x,Kokkos::ALL(),0); ViewTypeB y = Kokkos::subview(b_y,Kokkos::ALL(),0); @@ -43,26 +43,40 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(y,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); + auto h_b_org_y = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_y); Kokkos::deep_copy(h_b_x,b_x); - Kokkos::deep_copy(h_b_y,b_y); - ScalarA expected_result = 0; + KokkosBlas::axpy(a,x,y); + Kokkos::deep_copy(h_b_y, 
b_y); + for(int i=0;i @@ -92,10 +106,19 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); + auto h_b_org_y = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_y); Kokkos::deep_copy(h_b_x,b_x); Kokkos::deep_copy(h_b_y,b_y); @@ -103,33 +126,28 @@ namespace Test { ScalarA a = 3; typename ViewTypeA::const_type c_x = x; - ScalarA* expected_result = new ScalarA[K]; - for(int j=0;j::value?2*1e-5:1e-7; - Kokkos::View r("Dot::Result",K); - KokkosBlas::axpy(a,x,y); - KokkosBlas::dot(r,y,y); - for(int k=0;k rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_b,rand_pool,ScalarB(10)); - - Kokkos::fence(); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_b,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(h_b_a,b_a); Kokkos::deep_copy(h_b_b,b_b); @@ -92,10 +98,16 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_b,rand_pool,ScalarB(10)); - - Kokkos::fence(); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_b,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(h_b_a,b_a); 
Kokkos::deep_copy(h_b_b,b_b); diff --git a/packages/kokkos-kernels/unit_test/blas/Test_Blas1_iamax.hpp b/packages/kokkos-kernels/unit_test/blas/Test_Blas1_iamax.hpp index 166c25c1a8a5..5e989125539d 100644 --- a/packages/kokkos-kernels/unit_test/blas/Test_Blas1_iamax.hpp +++ b/packages/kokkos-kernels/unit_test/blas/Test_Blas1_iamax.hpp @@ -29,9 +29,9 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); @@ -115,9 +115,9 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); diff --git a/packages/kokkos-kernels/unit_test/blas/Test_Blas1_mult.hpp b/packages/kokkos-kernels/unit_test/blas/Test_Blas1_mult.hpp index fcab767dcc20..1f6856a9348a 100644 --- a/packages/kokkos-kernels/unit_test/blas/Test_Blas1_mult.hpp +++ b/packages/kokkos-kernels/unit_test/blas/Test_Blas1_mult.hpp @@ -29,7 +29,7 @@ namespace Test { ScalarA a = 3; ScalarB b = 5; - double eps = std::is_same::value?2*1e-5:1e-7; + double eps = std::is_same::value?1e-4:1e-7; BaseTypeA b_x("X",N); BaseTypeB b_y("Y",N); @@ -53,33 +53,52 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); - Kokkos::fill_random(b_z,rand_pool,ScalarC(10)); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + 
Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } + { + ScalarC randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_z,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_z,b_z); + auto h_b_org_z = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_z); Kokkos::deep_copy(h_b_x,b_x); Kokkos::deep_copy(h_b_y,b_y); - Kokkos::deep_copy(h_b_z,b_z); - ScalarA expected_result = 0; - for(int i=0;i @@ -118,11 +137,24 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); - Kokkos::fill_random(b_z,rand_pool,ScalarC(10)); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } + { + ScalarC randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_z,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_z,b_z); + auto h_b_org_z = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_z); Kokkos::deep_copy(h_b_x,b_x); Kokkos::deep_copy(h_b_y,b_y); @@ -133,33 +165,28 @@ namespace Test { typename ViewTypeA::const_type c_x = x; typename ViewTypeB::const_type c_y = y; - ScalarC* expected_result = new ScalarC[K]; - for(int j=0;j::value?2*1e-5:1e-7; - - Kokkos::View r("Dot::Result",K); + double eps = std::is_same::value?1e-4:1e-7; KokkosBlas::mult(b,z,a,x,y); - KokkosBlas::dot(r,z,z); - for(int k=0;k AT; + typedef Kokkos::ArithTraits AT; + typedef typename AT::mag_type mag_type; + typedef Kokkos::ArithTraits MAT; typedef Kokkos::View rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + 
Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); typename ViewTypeA::const_type c_a = a; - double eps = std::is_same::value?2*1e-5:1e-7; + double eps = (std::is_same::mag_type, float>::value ? 1e-4 : 1e-7); - typename AT::mag_type expected_result = 0; + mag_type expected_result = 0; for(int i=0;i::imag is 0 if T is real. + expected_result += MAT::abs(AT::real(h_a(i))) + MAT::abs(AT::imag(h_a(i))); + } - typename AT::mag_type const_result = KokkosBlas::nrm1(c_a); - EXPECT_NEAR_KK( const_result, expected_result, eps*expected_result); + mag_type nonconst_result = KokkosBlas::nrm1(a); + EXPECT_NEAR_KK( nonconst_result, expected_result, eps * expected_result ); + mag_type const_result = KokkosBlas::nrm1(c_a); + EXPECT_NEAR_KK( const_result, expected_result, eps * expected_result ); } template @@ -53,6 +61,8 @@ namespace Test { typedef typename ViewTypeA::value_type ScalarA; typedef Kokkos::Details::ArithTraits AT; + typedef typename AT::mag_type mag_type; + typedef Kokkos::ArithTraits MAT; typedef multivector_layout_adapter vfA_type; @@ -68,38 +78,36 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); typename ViewTypeA::const_type c_a = a; - typename AT::mag_type* expected_result = new typename AT::mag_type[K]; - for(int j=0;j::mag_type, float>::value ? 
1e-4 : 1e-7); + + Kokkos::View expected_result("Expected Nrm1", K); + for(int k = 0; k < K; k++) + { + expected_result(k) = MAT::zero(); for(int i=0;i::value?2*1e-5:1e-7; - - Kokkos::View r("Dot::Result",K); + Kokkos::View r("Nrm1::Result",K); + Kokkos::View c_r("Nrm1::ConstResult",K); - KokkosBlas::nrm1(r,a); - for(int k=0;k rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(1)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); @@ -69,9 +69,9 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(1)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); diff --git a/packages/kokkos-kernels/unit_test/blas/Test_Blas1_nrm2_squared.hpp b/packages/kokkos-kernels/unit_test/blas/Test_Blas1_nrm2_squared.hpp index ac116b8987e7..aef2e2e95e50 100644 --- a/packages/kokkos-kernels/unit_test/blas/Test_Blas1_nrm2_squared.hpp +++ b/packages/kokkos-kernels/unit_test/blas/Test_Blas1_nrm2_squared.hpp @@ -27,9 +27,9 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(1)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); @@ -68,9 +68,9 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(1)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); diff --git a/packages/kokkos-kernels/unit_test/blas/Test_Blas1_nrminf.hpp 
b/packages/kokkos-kernels/unit_test/blas/Test_Blas1_nrminf.hpp index f328a720b7d1..0893045deefb 100644 --- a/packages/kokkos-kernels/unit_test/blas/Test_Blas1_nrminf.hpp +++ b/packages/kokkos-kernels/unit_test/blas/Test_Blas1_nrminf.hpp @@ -27,9 +27,9 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); @@ -70,9 +70,9 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); @@ -98,13 +98,12 @@ namespace Test { EXPECT_NEAR_KK( nonconst_result, exp_result, eps*exp_result); } - /* KokkosBlas::nrminf(r,c_a); + KokkosBlas::nrminf(r,c_a); for(int k=0;k rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(1)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(1)); - - Kokkos::fence(); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); @@ -99,10 +105,16 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(1)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(1)); - - Kokkos::fence(); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + 
Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); diff --git a/packages/kokkos-kernels/unit_test/blas/Test_Blas1_scal.hpp b/packages/kokkos-kernels/unit_test/blas/Test_Blas1_scal.hpp index f59b8d49ea48..254850f1ae6c 100644 --- a/packages/kokkos-kernels/unit_test/blas/Test_Blas1_scal.hpp +++ b/packages/kokkos-kernels/unit_test/blas/Test_Blas1_scal.hpp @@ -25,13 +25,10 @@ namespace Test { ScalarA a(3); typename AT::mag_type eps = AT::epsilon()*1000; - typename AT::mag_type zero = AT::abs( AT::zero() ); - typename AT::mag_type one = AT::abs( AT::one() ); BaseTypeA b_x("X",N); BaseTypeB b_y("Y",N); BaseTypeB b_org_y("Org_Y",N); - ViewTypeA x = Kokkos::subview(b_x,Kokkos::ALL(),0); ViewTypeB y = Kokkos::subview(b_y,Kokkos::ALL(),0); @@ -46,35 +43,35 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(1)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(1)); - - Kokkos::fence(); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); Kokkos::deep_copy(h_b_x,b_x); Kokkos::deep_copy(h_b_y,b_y); - ScalarA expected_result(0); - for(int i=0;i rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(1)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(1)); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } Kokkos::fence(); Kokkos::deep_copy(b_org_y,b_y); Kokkos::deep_copy(h_b_x,b_x); - Kokkos::deep_copy(h_b_y,b_y); ScalarA a(3.0); typename ViewTypeA::const_type c_x = x; - ScalarA* 
expected_result = new ScalarA[K]; - for(int j=0;j r("Dot::Result",K); KokkosBlas::scal(y,a,x); - KokkosBlas::dot(r,y,y); - for(int k=0;k params("Params",K); for(int j=0; j rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); @@ -51,7 +51,6 @@ namespace Test { void impl_test_sum_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::Details::ArithTraits AT; typedef multivector_layout_adapter vfA_type; @@ -67,9 +66,9 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); @@ -79,7 +78,7 @@ namespace Test { for(int j=0;j::value?2*1e-5:1e-7; diff --git a/packages/kokkos-kernels/unit_test/blas/Test_Blas1_team_dot.hpp b/packages/kokkos-kernels/unit_test/blas/Test_Blas1_team_dot.hpp index 158dcf573341..f3c819da3b14 100644 --- a/packages/kokkos-kernels/unit_test/blas/Test_Blas1_team_dot.hpp +++ b/packages/kokkos-kernels/unit_test/blas/Test_Blas1_team_dot.hpp @@ -46,8 +46,6 @@ namespace Test { Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); Kokkos::fill_random(b_b,rand_pool,ScalarB(10)); - Kokkos::fence(); - Kokkos::deep_copy(h_b_a,b_a); Kokkos::deep_copy(h_b_b,b_b); @@ -150,8 +148,6 @@ namespace Test { Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); Kokkos::fill_random(b_b,rand_pool,ScalarB(10)); - Kokkos::fence(); - Kokkos::deep_copy(h_b_a,b_a); Kokkos::deep_copy(h_b_b,b_b); diff --git a/packages/kokkos-kernels/unit_test/blas/Test_Blas1_team_nrm2.hpp b/packages/kokkos-kernels/unit_test/blas/Test_Blas1_team_nrm2.hpp index 4c654c7eaebb..99147053ed3b 100644 --- 
a/packages/kokkos-kernels/unit_test/blas/Test_Blas1_team_nrm2.hpp +++ b/packages/kokkos-kernels/unit_test/blas/Test_Blas1_team_nrm2.hpp @@ -33,8 +33,6 @@ namespace Test { Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - Kokkos::fence(); - Kokkos::deep_copy(h_b_a,b_a); typename ViewTypeA::const_type c_a = a; diff --git a/packages/kokkos-kernels/unit_test/blas/Test_Blas1_team_scal.hpp b/packages/kokkos-kernels/unit_test/blas/Test_Blas1_team_scal.hpp index 6b33caa2622c..fb6ef4487d1a 100644 --- a/packages/kokkos-kernels/unit_test/blas/Test_Blas1_team_scal.hpp +++ b/packages/kokkos-kernels/unit_test/blas/Test_Blas1_team_scal.hpp @@ -57,8 +57,6 @@ namespace Test { Kokkos::fill_random(b_x,rand_pool,ScalarA(1)); Kokkos::fill_random(b_y,rand_pool,ScalarB(1)); - Kokkos::fence(); - Kokkos::deep_copy(b_org_y,b_y); Kokkos::deep_copy(h_b_x,b_x); @@ -132,8 +130,6 @@ namespace Test { Kokkos::fill_random(b_x,rand_pool,ScalarA(1)); Kokkos::fill_random(b_y,rand_pool,ScalarB(1)); - Kokkos::fence(); - Kokkos::deep_copy(b_org_y,b_y); Kokkos::deep_copy(h_b_x,b_x); diff --git a/packages/kokkos-kernels/unit_test/blas/Test_Blas1_team_update.hpp b/packages/kokkos-kernels/unit_test/blas/Test_Blas1_team_update.hpp index dcc9d1e4868c..5298a6798dc4 100644 --- a/packages/kokkos-kernels/unit_test/blas/Test_Blas1_team_update.hpp +++ b/packages/kokkos-kernels/unit_test/blas/Test_Blas1_team_update.hpp @@ -66,8 +66,6 @@ namespace Test { Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); Kokkos::fill_random(b_z,rand_pool,ScalarC(10)); - Kokkos::fence(); - Kokkos::deep_copy(b_org_z,b_z); Kokkos::deep_copy(h_b_x,b_x); @@ -149,8 +147,6 @@ namespace Test { Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); Kokkos::fill_random(b_z,rand_pool,ScalarC(10)); - Kokkos::fence(); - Kokkos::deep_copy(b_org_z,b_z); Kokkos::deep_copy(h_b_x,b_x); diff --git a/packages/kokkos-kernels/unit_test/blas/Test_Blas1_update.hpp b/packages/kokkos-kernels/unit_test/blas/Test_Blas1_update.hpp index 8bfcdbe5cc9a..0ece3ae74c9d 100644 
--- a/packages/kokkos-kernels/unit_test/blas/Test_Blas1_update.hpp +++ b/packages/kokkos-kernels/unit_test/blas/Test_Blas1_update.hpp @@ -54,35 +54,52 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); - Kokkos::fill_random(b_z,rand_pool,ScalarC(10)); - - Kokkos::fence(); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } + { + ScalarC randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_z,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_z,b_z); + auto h_b_org_z = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_z); + auto h_org_z = Kokkos::subview(h_b_org_z, Kokkos::ALL(), 0); Kokkos::deep_copy(h_b_x,b_x); Kokkos::deep_copy(h_b_y,b_y); Kokkos::deep_copy(h_b_z,b_z); - ScalarA expected_result = 0; - for(int i=0;i @@ -119,13 +136,24 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); - Kokkos::fill_random(b_z,rand_pool,ScalarC(10)); - - Kokkos::fence(); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } + { + ScalarC randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_z,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_z,b_z); + auto h_b_org_z = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_z); Kokkos::deep_copy(h_b_x,b_x); 
Kokkos::deep_copy(h_b_y,b_y); @@ -137,33 +165,28 @@ namespace Test { typename ViewTypeA::const_type c_x = x; typename ViewTypeB::const_type c_y = y; - ScalarC* expected_result = new ScalarC[K]; - for(int j=0;j::value?2*1e-5:1e-7; - Kokkos::View r("Dot::Result",K); - KokkosBlas::update(a,x,b,y,c,z); - KokkosBlas::dot(r,z,z); - for(int k=0;k::value ? 2*1e-5 : 1e-7); + double eps = (std::is_same::mag_type, float>::value ? 1e-3 : 1e-10); int ldx; int ldy; @@ -61,59 +61,80 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarX(10)); - Kokkos::fill_random(b_y,rand_pool,ScalarY(10)); - Kokkos::fill_random(b_A,rand_pool,ScalarA(10)); - - Kokkos::fence(); + { + ScalarX randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarY randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_A,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); + auto h_b_org_y = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_y); + auto h_org_y = Kokkos::subview(h_b_org_y, Kokkos::ALL(), 0); Kokkos::deep_copy(h_b_x,b_x); Kokkos::deep_copy(h_b_y,b_y); Kokkos::deep_copy(h_b_A,b_A); typedef Kokkos::Details::ArithTraits KAT; - ScalarY expected_result = KAT:: zero(); + Kokkos::View expected("expected aAx+by", ldy); if(mode[0] == 'N') { for(int i = 0; i < M; i++) { - ScalarY y_i = KAT::zero (); + ScalarY y_i = beta * h_org_y(i); for(int j = 0; j < N; j++) { - y_i += h_A(i,j) * h_x(j); + y_i += alpha * h_A(i,j) * h_x(j); } - expected_result += (beta * h_y(i) + alpha * y_i) * (beta * h_y(i) + alpha * y_i) ; + expected(i) = y_i; } } else if(mode[0] == 'T') { for(int j = 0; j < N; j++) { - ScalarY y_j = KAT::zero (); + ScalarY y_j = beta * h_org_y(j); for(int 
i = 0; i < M; i++) { - y_j += h_A(i,j) * h_x(i); + y_j += alpha * h_A(i,j) * h_x(i); } - expected_result += (beta * h_y(j) + alpha * y_j) * (beta * h_y(j) + alpha * y_j) ; + expected(j) = y_j; } } else if(mode[0] == 'C') { for(int j = 0; j < N; j++) { - ScalarY y_j = KAT::zero (); + ScalarY y_j = beta * h_org_y(j); for(int i = 0; i < M; i++) { - y_j += KAT::conj (h_A(i,j)) * h_x(i); + y_j += alpha * KAT::conj (h_A(i,j)) * h_x(i); } - expected_result += (beta * h_y(j) + alpha * y_j) * (beta * h_y(j) + alpha * y_j) ; + expected(j) = y_j; } } KokkosBlas::gemv(mode, alpha, A, x, beta, y); - ScalarY nonconst_nonconst_result = KokkosBlas::dot(y, y); - EXPECT_NEAR_KK( nonconst_nonconst_result, expected_result, eps*expected_result); + Kokkos::deep_copy(h_b_y, b_y); + for(int i = 0; i < ldy; i++) + { + EXPECT_NEAR_KK(expected(i), h_y(i), eps * expected(i)); + } Kokkos::deep_copy(b_y, b_org_y); KokkosBlas::gemv(mode, alpha,A ,c_x, beta, y); - ScalarY const_nonconst_result = KokkosBlas::dot(y, y); - EXPECT_NEAR_KK( const_nonconst_result, expected_result, eps*expected_result); + Kokkos::deep_copy(h_b_y, b_y); + for(int i = 0; i < ldy; i++) + { + EXPECT_NEAR_KK(expected(i), h_y(i), eps); + } Kokkos::deep_copy(b_y, b_org_y); KokkosBlas::gemv(mode, alpha, c_A, c_x, beta, y); - ScalarY const_const_result = KokkosBlas::dot(y, y); - EXPECT_NEAR_KK( const_const_result, expected_result, eps*expected_result); + Kokkos::deep_copy(h_b_y, b_y); + for(int i = 0; i < ldy; i++) + { + EXPECT_NEAR_KK(expected(i), h_y(i), eps); + } } } @@ -203,7 +224,7 @@ TEST_F( TestCategory, gemv_complex_double ) { Kokkos::Profiling::popRegion(); Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_conj_complex_double"); - test_gemv,Kokkos::complex,Kokkos::complex,TestExecSpace> ("T"); + test_gemv,Kokkos::complex,Kokkos::complex,TestExecSpace> ("C"); Kokkos::Profiling::popRegion(); } #endif diff --git a/packages/kokkos-kernels/unit_test/blas/Test_Blas2_team_gemv.hpp 
b/packages/kokkos-kernels/unit_test/blas/Test_Blas2_team_gemv.hpp index 124941bfd85e..f8a7f7c1be53 100644 --- a/packages/kokkos-kernels/unit_test/blas/Test_Blas2_team_gemv.hpp +++ b/packages/kokkos-kernels/unit_test/blas/Test_Blas2_team_gemv.hpp @@ -64,8 +64,6 @@ namespace Test { Kokkos::fill_random(b_y,rand_pool,ScalarY(10)); Kokkos::fill_random(b_A,rand_pool,ScalarA(10)); - Kokkos::fence(); - Kokkos::deep_copy(b_org_y,b_y); Kokkos::deep_copy(h_b_x,b_x); diff --git a/packages/kokkos-kernels/unit_test/blas/Test_Blas3_gemm.hpp b/packages/kokkos-kernels/unit_test/blas/Test_Blas3_gemm.hpp index 451b7fedacc9..580de25397e4 100644 --- a/packages/kokkos-kernels/unit_test/blas/Test_Blas3_gemm.hpp +++ b/packages/kokkos-kernels/unit_test/blas/Test_Blas3_gemm.hpp @@ -115,8 +115,6 @@ namespace Test { Kokkos::deep_copy(C2,C); - Kokkos::fence(); - struct VanillaGEMM vgemm; vgemm.A_t = A_t; vgemm.B_t = B_t; vgemm.A_c = A_c; vgemm.B_c = B_c; @@ -130,8 +128,6 @@ namespace Test { KokkosBlas::gemm(TA,TB,alpha,A,B,beta,C); - Kokkos::fence(); - mag_type diff_C = 0; struct DiffGEMM diffgemm; diffgemm.N = N; diff --git a/packages/kokkos-kernels/unit_test/blas/Test_Blas3_trmm.hpp b/packages/kokkos-kernels/unit_test/blas/Test_Blas3_trmm.hpp index 9f72bd5e6321..4c8d154c15a4 100644 --- a/packages/kokkos-kernels/unit_test/blas/Test_Blas3_trmm.hpp +++ b/packages/kokkos-kernels/unit_test/blas/Test_Blas3_trmm.hpp @@ -121,7 +121,6 @@ namespace Test { Kokkos::parallel_for("KokkosBlas::Test::NonUnitDiagTRMM", Kokkos::RangePolicy(0,K), nudtrmm); } Kokkos::fill_random(B, rand_pool, Kokkos::rand, ScalarA>::max()); - Kokkos::fence(); Kokkos::deep_copy(host_A, A); // Make host_A a lower triangle @@ -162,11 +161,9 @@ namespace Test { vgemm.beta = beta; Kokkos::parallel_for("KokkosBlas::Test::VanillaGEMM", Kokkos::TeamPolicy(M,Kokkos::AUTO,16), vgemm); } - Kokkos::fence(); Kokkos::deep_copy(host_B_expected, B_expected); KokkosBlas::trmm(side, uplo, trans, diag, alpha, A, B); - Kokkos::fence(); 
Kokkos::deep_copy(host_B_actual, B); bool test_flag = true; diff --git a/packages/kokkos-kernels/unit_test/blas/Test_Blas3_trsm.hpp b/packages/kokkos-kernels/unit_test/blas/Test_Blas3_trsm.hpp index 8fec44b63737..ca9c40ae7e65 100644 --- a/packages/kokkos-kernels/unit_test/blas/Test_Blas3_trsm.hpp +++ b/packages/kokkos-kernels/unit_test/blas/Test_Blas3_trsm.hpp @@ -127,8 +127,6 @@ namespace Test { ScalarA alpha_trmm = ScalarA(1)/alpha; ScalarA beta = ScalarA(0); - Kokkos::fence(); - if ((uplo[0]=='L')||(uplo[0]=='l')) { for (int i = 0; i < K-1; i++) for (int j = i+1; j < K; j++) diff --git a/packages/kokkos-kernels/unit_test/graph/Test_Graph_mis2.hpp b/packages/kokkos-kernels/unit_test/graph/Test_Graph_mis2.hpp index 30d32fb2dc94..4080a17f80ae 100644 --- a/packages/kokkos-kernels/unit_test/graph/Test_Graph_mis2.hpp +++ b/packages/kokkos-kernels/unit_test/graph/Test_Graph_mis2.hpp @@ -47,6 +47,7 @@ #include #include "KokkosGraph_MIS2.hpp" +#include "KokkosGraph_ExplicitCoarsening.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosKernels_IOUtils.hpp" #include "KokkosKernels_SparseUtils.hpp" @@ -194,9 +195,73 @@ void test_mis2_coarsening(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t //Check that every label is in the range [0, numClusters) for(lno_t i = 0; i < numVerts; i++) EXPECT_TRUE(0 <= labelsHost(i) && labelsHost(i) < numClusters); + //Test explicit coarsening given the labels, with and without compressing the result + rowmap_t coarseRowmapNC, coarseRowmapC; + entries_t coarseEntriesNC, coarseEntriesC; + KokkosGraph::Experimental::graph_explicit_coarsen + (symRowmap, symEntries, labels, numClusters, coarseRowmapNC, coarseEntriesNC, false); + KokkosGraph::Experimental::graph_explicit_coarsen + (symRowmap, symEntries, labels, numClusters, coarseRowmapC, coarseEntriesC, true); + EXPECT_EQ(coarseRowmapC.extent(0), numClusters + 1); + EXPECT_EQ(coarseRowmapNC.extent(0), numClusters + 1); + //Check that coarse graph doesn't have more edges than fine 
graph + EXPECT_LE(coarseEntriesC.extent(0), symEntries.extent(0)); + EXPECT_LE(coarseEntriesNC.extent(0), symEntries.extent(0)); + //Verify compression is working. + auto hostRowmapNC = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarseRowmapNC); + auto hostEntriesNC = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarseEntriesNC); + auto hostRowmapC = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarseRowmapC); + auto hostEntriesC = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarseEntriesC); + for(lno_t i = 0; i < numClusters; i++) + { + //std::set maintains uniqueness as well as ascending order of elements. + //So it should exactly match the entries in the compressed version. + std::set uniqueEntries; + for(size_type j = hostRowmapNC(i); j < hostRowmapNC(i + 1); j++) + { + uniqueEntries.insert(hostEntriesNC(j)); + } + size_type compressedRowLen = hostRowmapC(i + 1) - hostRowmapC(i); + ASSERT_EQ(uniqueEntries.size(), compressedRowLen); + auto it = uniqueEntries.begin(); + for(size_type j = hostRowmapC(i); j < hostRowmapC(i + 1); j++) + { + EXPECT_EQ(*it, hostEntriesC(j)); + it++; + } + } } } +template +void test_mis2_coarsening_zero_rows() +{ + using crsMat = KokkosSparse::CrsMatrix; + using graph_type = typename crsMat::StaticCrsGraphType; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using rowmap_t = typename c_rowmap_t::non_const_type; + using entries_t = typename c_entries_t::non_const_type; + rowmap_t fineRowmap; + entries_t fineEntries; + //note: MIS2 coarsening first calls MIS2 on the fine graph, so this covers the zero-row case for MIS2 alone. 
+ lno_t numClusters; + auto labels = graph_mis2_coarsen(fineRowmap, fineEntries, numClusters, KokkosGraph::MIS2_FAST); + EXPECT_EQ(numClusters, 0); + EXPECT_EQ(labels.extent(0), 0); + //coarsen, should also produce a graph with 0 rows/entries + rowmap_t coarseRowmap; + entries_t coarseEntries; + KokkosGraph::Experimental::graph_explicit_coarsen + (fineRowmap, fineEntries, labels, 0, coarseRowmap, coarseEntries, false); + EXPECT_LE(coarseRowmap.extent(0), 1); + EXPECT_EQ(coarseEntries.extent(0), 0); + KokkosGraph::Experimental::graph_explicit_coarsen + (fineRowmap, fineEntries, labels, 0, coarseRowmap, coarseEntries, true); + EXPECT_LE(coarseRowmap.extent(0), 1); + EXPECT_EQ(coarseEntries.extent(0), 0); +} + #define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, graph##_##graph_mis2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) \ { \ @@ -206,9 +271,11 @@ void test_mis2_coarsening(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t } \ TEST_F(TestCategory, graph##_##graph_mis2_coarsening##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) \ { \ + test_mis2_coarsening(5000, 5000 * 200, 2000, 10); \ test_mis2_coarsening(5000, 5000 * 20, 1000, 10); \ test_mis2_coarsening(50, 50 * 10, 40, 10); \ test_mis2_coarsening(5, 5 * 3, 5, 0); \ + test_mis2_coarsening_zero_rows(); \ } #if defined(KOKKOSKERNELS_INST_DOUBLE) diff --git a/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_CrsMatrix.hpp b/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_CrsMatrix.hpp index 13513fef14e1..85b427d445d6 100644 --- a/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_CrsMatrix.hpp +++ b/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_CrsMatrix.hpp @@ -189,9 +189,46 @@ testCrsMatrix () //printf ("A is %d by %d\n", A.numRows (), A.numCols ()); } +template +void +testCrsMatrixHostMirror () +{ + using namespace Test; + using crs_matrix = KokkosSparse::CrsMatrix; + using crs_matrix_host = typename crs_matrix::HostMirror; + using crs_graph = typename 
crs_matrix::StaticCrsGraphType; + using crs_graph_host = typename crs_graph::HostMirror; + crs_matrix A = makeCrsMatrix(); + typename crs_matrix::values_type::HostMirror valuesHost("values host", A.nnz()); + typename crs_matrix::row_map_type::HostMirror rowmapHost("rowmap host", A.numRows() + 1); + typename crs_matrix::index_type::HostMirror entriesHost("entries host", A.nnz()); + crs_graph_host graphHost(entriesHost, rowmapHost); + //Test the two CrsMatrix constructors that take the StaticCrsGraph + crs_matrix_host Ahost1("Ahost1", graphHost); + crs_matrix_host Ahost2("Ahost2", A.numCols(), valuesHost, graphHost); + //Test deep copy constructor (can copy between any two spaces) + { + crs_matrix Bdev("B device", Ahost1); + crs_matrix_host Bhost("B host", A); + } + //Test the empty (0x0, 0 entries) case - zero-length rowmap. + typename crs_graph::row_map_type::non_const_type zeroRowmap; + typename crs_graph::entries_type zeroEntries; + typename crs_matrix::values_type zeroValues; + crs_matrix zero("ZeroRow", 0, 0, 0, zeroValues, zeroRowmap, zeroEntries); + crs_matrix_host zeroHost("zero1Host", zero); + EXPECT_EQ(zeroHost.numRows(), 0); + EXPECT_EQ(zeroHost.numCols(), 0); + EXPECT_EQ(zeroHost.nnz(), 0); + EXPECT_EQ(zeroHost.graph.row_map.extent(0), 0); +} + #define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( TestCategory, sparse ## _ ## crsmatrix ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ testCrsMatrix (); \ +} \ +TEST_F( TestCategory, sparse ## _ ## crsmatrix_host_mirror ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ + testCrsMatrixHostMirror (); \ } diff --git a/packages/kokkos/.jenkins b/packages/kokkos/.jenkins index 889abe33f830..001171d648e7 100644 --- a/packages/kokkos/.jenkins +++ b/packages/kokkos/.jenkins @@ -5,6 +5,8 @@ pipeline { CCACHE_DIR = '/tmp/ccache' CCACHE_MAXSIZE = '10G' CCACHE_CPP2 = 'true' + BUILD_JOBS = 8 + SITE = 'Jenkins' } stages { stage('Clang-Format') { @@ -28,25 +30,27 @@ pipeline { 
dockerfile { filename 'Dockerfile.sycl' dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=intel/oneapi-basekit:devel-ubuntu18.04' - label 'docker' + label 'nvidia-docker && volta' args '-v /tmp/ccache.kokkos:/tmp/ccache' } } steps { sh 'ccache --zero-stats' - sh '''rm -rf build && mkdir -p build && cd build && \ + sh '''rm -rf build && \ cmake \ - -DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DCMAKE_CXX_COMPILER=clang++ \ - -DCMAKE_CXX_FLAGS="-Werror" \ + -DCMAKE_CXX_FLAGS="-Werror -Wno-unknown-cuda-version -Wno-gnu-zero-variadic-macro-arguments" \ + -DKokkos_ARCH_VOLTA70=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_EXAMPLES=ON \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_SYCL=ON \ + -DKokkos_ENABLE_UNSUPPORTED_ARCHS=ON \ -DCMAKE_CXX_STANDARD=17 \ - .. && \ - make -j8 && ctest --verbose''' + -DBUILD_NAME=${STAGE_NAME} \ + -P cmake/KokkosCI.cmake''' } post { always { @@ -68,11 +72,12 @@ pipeline { OMP_NUM_THREADS = 8 OMP_PLACES = 'threads' OMP_PROC_BIND = 'spread' + LC_ALL = 'C' } steps { sh 'ccache --zero-stats' sh 'echo "/opt/rocm/llvm/lib" > /etc/ld.so.conf.d/llvm.conf && ldconfig' - sh '''rm -rf build && mkdir -p build && cd build && \ + sh '''rm -rf build && \ cmake \ -DCMAKE_BUILD_TYPE=Debug \ -DCMAKE_CXX_COMPILER=hipcc \ @@ -83,8 +88,8 @@ pipeline { -DKokkos_ENABLE_HIP=ON \ -DKokkos_ARCH_VEGA906=ON \ -DKokkos_ENABLE_OPENMP=ON \ - .. 
&& \ - make -j8 && ctest --verbose''' + -DBUILD_NAME=${STAGE_NAME} \ + -P cmake/KokkosCI.cmake''' } post { always { @@ -102,9 +107,12 @@ pipeline { args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' } } + environment { + LC_ALL = 'C' + } steps { sh 'ccache --zero-stats' - sh '''rm -rf build && mkdir -p build && cd build && \ + sh '''rm -rf build && \ cmake \ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_CXX_COMPILER=hipcc \ @@ -114,8 +122,8 @@ pipeline { -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_HIP=ON \ -DKokkos_ARCH_VEGA906=ON \ - .. && \ - make -j8 && ctest --verbose''' + -DBUILD_NAME=${STAGE_NAME} \ + -P cmake/KokkosCI.cmake''' } post { always { @@ -134,19 +142,19 @@ pipeline { } steps { sh 'ccache --zero-stats' - sh '''rm -rf build && mkdir -p build && cd build && \ + sh '''rm -rf build && \ cmake \ - -DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_CXX_COMPILER=clang++ \ - -DCMAKE_CXX_FLAGS="-Wno-unknown-cuda-version -Werror" \ + -DCMAKE_CXX_FLAGS="-Wno-unknown-cuda-version -Werror -Wno-undefined-internal -Wno-pass-failed" \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_TUNING=ON \ -DKokkos_ENABLE_OPENMPTARGET=ON \ -DKokkos_ARCH_VOLTA70=ON \ -DCMAKE_CXX_STANDARD=17 \ - .. && \ - make -j8 && ctest --verbose''' + -DBUILD_NAME=${STAGE_NAME} \ + -P cmake/KokkosCI.cmake''' } post { always { @@ -165,7 +173,7 @@ pipeline { } steps { sh 'ccache --zero-stats' - sh '''rm -rf build && mkdir -p build && cd build && \ + sh '''rm -rf build && \ cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_CLANG_TIDY="clang-tidy;-warnings-as-errors=*" \ @@ -179,8 +187,8 @@ pipeline { -DKokkos_ENABLE_CUDA_LAMBDA=ON \ -DKokkos_ENABLE_TUNING=ON \ -DKokkos_ARCH_VOLTA70=ON \ - .. 
&& \ - make -j8 && ctest --verbose''' + -DBUILD_NAME=${STAGE_NAME} \ + -P cmake/KokkosCI.cmake''' } post { always { @@ -222,7 +230,7 @@ pipeline { dockerfile { filename 'Dockerfile.nvcc' dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.0-devel --build-arg ADDITIONAL_PACKAGES="g++-8 gfortran" --build-arg CMAKE_VERSION=3.17.3' + additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.0-devel --build-arg ADDITIONAL_PACKAGES="g++-8 gfortran clang" --build-arg CMAKE_VERSION=3.17.3' label 'nvidia-docker && volta' args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' } @@ -236,7 +244,7 @@ pipeline { steps { sh 'ccache --zero-stats' sh '''rm -rf install && mkdir -p install && \ - rm -rf build && mkdir -p build && cd build && \ + rm -rf build && \ cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=g++-8 \ @@ -248,11 +256,10 @@ pipeline { -DKokkos_ENABLE_CUDA_LAMBDA=OFF \ -DKokkos_ENABLE_CUDA_UVM=ON \ -DKokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE=ON \ - -DKokkos_ARCH_VOLTA70=ON \ - -DCMAKE_INSTALL_PREFIX=${PWD}/../install \ - .. && \ - make -j8 install && \ - cd .. && \ + -DCMAKE_INSTALL_PREFIX=${PWD}/install \ + -DBUILD_NAME=${STAGE_NAME} \ + -DTARGET=install \ + -P cmake/KokkosCI.cmake && \ rm -rf build-tests && mkdir -p build-tests && cd build-tests && \ export CMAKE_PREFIX_PATH=${PWD}/../install && \ cmake \ @@ -271,7 +278,11 @@ pipeline { -DCMAKE_CXX_FLAGS=-Werror \ -DCMAKE_CXX_STANDARD=17 \ .. && \ - make -j8 && ctest --verbose''' + make -j8 && ctest --verbose && \ + cd ../.. 
&& \ + cmake -B build_cmake_installed_different_compiler/build -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_FLAGS=-Werror -DCMAKE_CXX_STANDARD=17 build_cmake_installed_different_compiler && \ + cmake --build build_cmake_installed_different_compiler/build --target all && \ + cmake --build build_cmake_installed_different_compiler/build --target test''' } post { always { @@ -284,14 +295,14 @@ pipeline { dockerfile { filename 'Dockerfile.nvcc' dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=nvidia/cuda:10.1-devel --build-arg CMAKE_VERSION=3.15.5' + additionalBuildArgs '--build-arg BASE=nvidia/cuda:10.1-devel' label 'nvidia-docker && volta' args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' } } steps { sh 'ccache --zero-stats' - sh '''rm -rf build && mkdir -p build && cd build && \ + sh '''rm -rf build && \ cmake \ -DCMAKE_BUILD_TYPE=Debug \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ @@ -305,9 +316,10 @@ pipeline { -DKokkos_ENABLE_CUDA=ON \ -DKokkos_ENABLE_CUDA_LAMBDA=ON \ -DKokkos_ENABLE_LIBDL=OFF \ - .. && \ - make -j8 && ctest --verbose && \ - cd ../example/build_cmake_in_tree && \ + -DBUILD_NAME=${STAGE_NAME} \ + -DTARGET=install \ + -P cmake/KokkosCI.cmake && \ + cd example/build_cmake_in_tree && \ rm -rf build && mkdir -p build && cd build && \ cmake -DCMAKE_CXX_STANDARD=14 .. && make -j8 && ctest --verbose''' } @@ -330,7 +342,7 @@ pipeline { OMP_PROC_BIND = 'true' } steps { - sh '''rm -rf build && mkdir -p build && cd build && \ + sh '''rm -rf build && \ cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_STANDARD=14 \ @@ -339,8 +351,9 @@ pipeline { -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_OPENMP=ON \ -DKokkos_ENABLE_LIBDL=OFF \ - .. 
&& \ - make -j8 && ctest --verbose && gcc -I$PWD/../core/src/ ../core/unit_test/tools/TestCInterface.c''' + -DBUILD_NAME=${STAGE_NAME} \ + -P cmake/KokkosCI.cmake && \ + gcc -I$PWD/core/src core/unit_test/tools/TestCInterface.c''' } } } diff --git a/packages/kokkos/.travis.yml b/packages/kokkos/.travis.yml index d156e91ee098..04ef01c1602c 100644 --- a/packages/kokkos/.travis.yml +++ b/packages/kokkos/.travis.yml @@ -4,7 +4,6 @@ language: cpp os: - linux - - osx compiler: - gcc @@ -30,7 +29,7 @@ branches: - /^release/ env: - - + - # - BACKEND="OPENMP" - BACKEND="PTHREAD" - CMAKE_BUILD_TYPE=Debug COVERAGE=yes GTEST_FILTER="-*DeathTest*" @@ -42,38 +41,40 @@ env: matrix: exclude: -# Apple GCC is just an alias to AppleClang - - os: osx - compiler: gcc -# Apple Clang doesn't support OpenMP - - os: osx - compiler: clang - env: CMAKE_BUILD_TYPE=Debug BACKEND="OPENMP" COVERAGE=yes GTEST_FILTER="-*DeathTest*" - - os: osx - compiler: clang - env: CMAKE_BUILD_TYPE=Release BACKEND="OPENMP" -# We do this as canary - os: linux compiler: gcc env: CMAKE_BUILD_TYPE=Release BACKEND="OPENMP" +# Install newer CMake. 
The distribution comes with CMake 3.12.4 but we require at least 3.16 +install: + - CMAKE_VERSION=3.17.1 + - CMAKE_DIR=/opt/cmake + - CMAKE_KEY=2D2CEF1034921684 && + CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && + CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && + CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && + wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && + wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && + wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && + #gpg --keyserver pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} && + #gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && + #grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check && + mkdir -p ${CMAKE_DIR} && + sh ${CMAKE_SCRIPT} --skip-license --prefix=${CMAKE_DIR} && + rm cmake* + - PATH=${CMAKE_DIR}/bin:$PATH + - cd ${TRAVIS_BUILD_DIR} + before_script: - - if [[ ${TRAVIS_OS_NAME} == "osx" ]]; then - brew update; - export HOMEBREW_NO_AUTO_UPDATE=1; - brew ls --versions ccache > /dev/null || brew install ccache; - export PATH=/usr/local/opt/ccache/libexec:$PATH; - export CXXFLAGS="${CXXFLAGS} -Wno-unused-command-line-argument"; - if [[ ${BACKEND} == "OPENMP" ]]; then brew install libomp; fi - fi - ccache -z - - if [[ ${COVERAGE} ]]; then export CXX="${CXX} --coverage"; fi + - if [[ ${COVERAGE} ]]; then export CXX="${CXX} --coverage"; export BUILD_NAME_SUFFIX="-Coverage"; fi - if [[ ! ${CMAKE_BUILD_TYPE} ]]; then export CXXFLAGS="${CXXFLAGS} -O2"; fi script: - export OMP_NUM_THREADS=2 - export OMP_PLACES=threads - export OMP_PROC_BIND=spread + - export BUILD_JOBS=2 # LD_LIBRARY_PATH workaround to find clang's libomp: https://github.com/travis-ci/travis-ci/issues/8613 - if [[ ${CC} = clang ]]; then export LD_LIBRARY_PATH=/usr/local/clang/lib${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH; fi # enable ccache for clang on linux and add CCACHE_CPP2 to avoid 'Argument unused during compilation -I...' 
warning @@ -81,17 +82,17 @@ script: ln -s /usr/bin/ccache $HOME/bin/clang++; export CCACHE_CPP2=yes; fi - - mkdir build && - pushd build && - cmake .. + - cmake ${BACKEND:+-DKokkos_ENABLE_${BACKEND}=On} -DCMAKE_CXX_FLAGS="${CXXFLAGS} -Werror" -DCMAKE_CXX_STANDARD=14 -DKokkos_ENABLE_COMPILER_WARNINGS=ON -DKokkos_ENABLE_TESTS=On - ${CMAKE_BUILD_TYPE:+-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}} && - make VERBOSE=1 -j2 && - travis_wait 60 make test CTEST_OUTPUT_ON_FAILURE=1 && + ${CMAKE_BUILD_TYPE:+-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}} + -DBUILD_NAME="${CC}-${BACKEND}${BUILD_NAME_SUFFIX}" + -DSITE=Travis + -P cmake/KokkosCI.cmake && + pushd build && make install DESTDIR=${PWD}/install && rm -rf ${PWD}/install/usr/local && rmdir ${PWD}/install/usr && popd diff --git a/packages/kokkos/CHANGELOG.md b/packages/kokkos/CHANGELOG.md index c759181aa21e..5859fe32c432 100644 --- a/packages/kokkos/CHANGELOG.md +++ b/packages/kokkos/CHANGELOG.md @@ -1,5 +1,161 @@ # Change Log +## [3.4.00](https://github.com/kokkos/kokkos/tree/3.4.00) (2021-04-25) +[Full Changelog](https://github.com/kokkos/kokkos/compare/3.3.01...3.4.00) + +**Features:** +- Implement parallel_scan with ThreadVectorRange and Reducer [\#3861](https://github.com/kokkos/kokkos/pull/3861) +- Implement SYCL Random [\#3849](https://github.com/kokkos/kokkos/pull/3849) +- OpenMPTarget: Adding Implementation for nested reducers [\#3845](https://github.com/kokkos/kokkos/pull/3845) +- Implement UniqueToken for SYCL [\#3833](https://github.com/kokkos/kokkos/pull/3833) +- OpenMPTarget: UniqueToken::Global implementation [\#3823](https://github.com/kokkos/kokkos/pull/3823) +- DualView sync's on ExecutionSpaces [\#3822](https://github.com/kokkos/kokkos/pull/3822) +- SYCL outer TeamPolicy parallel_reduce [\#3818](https://github.com/kokkos/kokkos/pull/3818) +- SYCL TeamPolicy::team_scan [\#3815](https://github.com/kokkos/kokkos/pull/3815) +- SYCL MDRangePolicy parallel_reduce [\#3801](https://github.com/kokkos/kokkos/pull/3801) +- 
Enable use of execution space instances in ScatterView [\#3786](https://github.com/kokkos/kokkos/pull/3786) +- SYCL TeamPolicy nested parallel_reduce [\#3783](https://github.com/kokkos/kokkos/pull/3783) +- OpenMPTarget: MDRange with TagType for parallel_for [\#3781](https://github.com/kokkos/kokkos/pull/3781) +- Adding OpenMPTarget parallel_scan [\#3655](https://github.com/kokkos/kokkos/pull/3655) +- SYCL basic TeamPolicy [\#3654](https://github.com/kokkos/kokkos/pull/3654) +- OpenMPTarget: scratch memory implementation [\#3611](https://github.com/kokkos/kokkos/pull/3611) + +**Implemented enhancements Backends and Archs:** +- SYCL choose a specific GPU [\#3918](https://github.com/kokkos/kokkos/pull/3918) +- [HIP] Lock access to scratch memory when using Teams [\#3916](https://github.com/kokkos/kokkos/pull/3916) +- [HIP] fix multithreaded access to get_next_driver [\#3908](https://github.com/kokkos/kokkos/pull/3908) +- Forward declare HIPHostPinnedSpace and SYCLSharedUSMSpace [\#3902](https://github.com/kokkos/kokkos/pull/3902) +- Let SYCL USMObjectMem use SharedAllocationRecord [\#3898](https://github.com/kokkos/kokkos/pull/3898) +- Implement clock_tic for SYCL [\#3893](https://github.com/kokkos/kokkos/pull/3893) +- Don't use a static variable in HIPInternal::scratch_space [\#3866](https://github.com/kokkos/kokkos/pull/3866) +- Reuse memory for SYCL parallel_reduce [\#3873](https://github.com/kokkos/kokkos/pull/3873) +- Update SYCL compiler in CI [\#3826](https://github.com/kokkos/kokkos/pull/3826) +- Introduce HostSharedPtr to manage m_space_instance for Cuda/HIP/SYCL [\#3824](https://github.com/kokkos/kokkos/pull/3824) +- [HIP] Use shuffle for range reduction [\#3811](https://github.com/kokkos/kokkos/pull/3811) +- OpenMPTarget: Changes to the hierarchical parallelism [\#3808](https://github.com/kokkos/kokkos/pull/3808) +- Remove ExtendedReferenceWrapper for SYCL parallel_reduce 
[\#3802](https://github.com/kokkos/kokkos/pull/3802) +- Eliminate sycl_indirect_launch [\#3777](https://github.com/kokkos/kokkos/pull/3777) +- OpenMPTarget: scratch implementation for parallel_reduce [\#3776](https://github.com/kokkos/kokkos/pull/3776) +- Allow initializing SYCL execution space from sycl::queue and SYCL::impl_static_fence [\#3767](https://github.com/kokkos/kokkos/pull/3767) +- SYCL TeamPolicy scratch memory alternative [\#3763](https://github.com/kokkos/kokkos/pull/3763) +- Alternative implementation for SYCL TeamPolicy [\#3759](https://github.com/kokkos/kokkos/pull/3759) +- Unify handling of synchronous errors in SYCL [\#3754](https://github.com/kokkos/kokkos/pull/3754) +- core/Cuda: Half_t updates for cgsolve [\#3746](https://github.com/kokkos/kokkos/pull/3746) +- Unify HIPParallelLaunch structures [\#3733](https://github.com/kokkos/kokkos/pull/3733) +- Improve performance for SYCL parallel_reduce [\#3732](https://github.com/kokkos/kokkos/pull/3732) +- Use consistent types in Kokkos_OpenMPTarget_Parallel.hpp [\#3703](https://github.com/kokkos/kokkos/pull/3703) +- Implement non-blocking kernel launches for HIP backend [\#3697](https://github.com/kokkos/kokkos/pull/3697) +- Change SYCLInternal::m_queue std::unique_ptr -> std::optional [\#3677](https://github.com/kokkos/kokkos/pull/3677) +- Use alternative SYCL parallel_reduce implementation [\#3671](https://github.com/kokkos/kokkos/pull/3671) +- Use runtime values in KokkosExp_MDRangePolicy.hpp [\#3626](https://github.com/kokkos/kokkos/pull/3626) +- Introduce KOKKOS_PRINTF [\#3615](https://github.com/kokkos/kokkos/pull/3615) +- Clean up AnalyzePolicy [\#3564](https://github.com/kokkos/kokkos/pull/3564) +- Changes for indirect launch of SYCL parallel reduce [\#3511](https://github.com/kokkos/kokkos/pull/3511) + +**Implemented enhancements BuildSystem:** +- Also require C++14 when building gtest [\#3912](https://github.com/kokkos/kokkos/pull/3912) +- Fix compiling SYCL with OpenMP 
[\#3874](https://github.com/kokkos/kokkos/pull/3874) +- Require C++17 for SYCL (at configuration time) [\#3869](https://github.com/kokkos/kokkos/pull/3869) +- Add COMPILE_DEFINITIONS argument to kokkos_create_imported_tpl [\#3862](https://github.com/kokkos/kokkos/pull/3862) +- Do not pass arch flags to the linker with no rdc [\#3846](https://github.com/kokkos/kokkos/pull/3846) +- Try compiling C++14 check with C++14 support and print error message [\#3843](https://github.com/kokkos/kokkos/pull/3843) +- Enable HIP with Cray Clang [\#3842](https://github.com/kokkos/kokkos/pull/3842) +- Add an option to disable header self containment tests [\#3834](https://github.com/kokkos/kokkos/pull/3834) +- CMake check for C++14 [\#3809](https://github.com/kokkos/kokkos/pull/3809) +- Prefer -std=* over --std=* [\#3779](https://github.com/kokkos/kokkos/pull/3779) +- Kokkos launch compiler updates [\#3778](https://github.com/kokkos/kokkos/pull/3778) +- Updated comments and enabled no-op for kokkos_launch_compiler [\#3774](https://github.com/kokkos/kokkos/pull/3774) +- Apple's Clang not correctly recognised [\#3772](https://github.com/kokkos/kokkos/pull/3772) +- kokkos_launch_compiler + CUDA auto-detect arch [\#3770](https://github.com/kokkos/kokkos/pull/3770) +- Add Spack test support for Kokkos [\#3753](https://github.com/kokkos/kokkos/pull/3753) +- Split SYCL tests for aot compilation [\#3741](https://github.com/kokkos/kokkos/pull/3741) +- Use consistent OpenMP flag for IntelClang [\#3735](https://github.com/kokkos/kokkos/pull/3735) +- Add support for -Wno-deprecated-gpu-targets [\#3722](https://github.com/kokkos/kokkos/pull/3722) +- Add configuration to target CUDA compute capability 8.6 [\#3713](https://github.com/kokkos/kokkos/pull/3713) +- Added VERSION and SOVERSION to KOKKOS_INTERNAL_ADD_LIBRARY [\#3706](https://github.com/kokkos/kokkos/pull/3706) +- Add fast-math to known NVCC flags [\#3699](https://github.com/kokkos/kokkos/pull/3699) +- Add MI-100 arch string 
[\#3698](https://github.com/kokkos/kokkos/pull/3698) +- Require CMake >=3.16 [\#3679](https://github.com/kokkos/kokkos/pull/3679) +- KokkosCI.cmake, KokkosCTest.cmake.in, CTestConfig.cmake.in + CI updates [\#2844](https://github.com/kokkos/kokkos/pull/2844) + +**Implemented enhancements Tools:** +- Improve readability of the callback invocation in profiling [\#3860](https://github.com/kokkos/kokkos/pull/3860) +- V1.1 Tools Interface: incremental, action-based [\#3812](https://github.com/kokkos/kokkos/pull/3812) +- Enable launch latency simulations [\#3721](https://github.com/kokkos/kokkos/pull/3721) +- Added metadata callback to tools interface [\#3711](https://github.com/kokkos/kokkos/pull/3711) +- MDRange Tile Size Tuning [\#3688](https://github.com/kokkos/kokkos/pull/3688) +- Added support for command-line args for kokkos-tools [\#3627](https://github.com/kokkos/kokkos/pull/3627) +- Query max tile sizes for an MDRangePolicy, and set tile sizes on an existing policy [\#3481](https://github.com/kokkos/kokkos/pull/3481) + +**Implemented enhancements Other:** +- Try detecting ndevices in get_gpu [\#3921](https://github.com/kokkos/kokkos/pull/3921) +- Use strcmp to compare names() [\#3909](https://github.com/kokkos/kokkos/pull/3909) +- Add execution space arguments for constructor overloads that might allocate a new underlying View [\#3904](https://github.com/kokkos/kokkos/pull/3904) +- Prefix labels in internal use of kokkos_malloc [\#3891](https://github.com/kokkos/kokkos/pull/3891) +- Prefix labels for internal uses of SharedAllocationRecord [\#3890](https://github.com/kokkos/kokkos/pull/3890) +- Add missing hypot math function [\#3880](https://github.com/kokkos/kokkos/pull/3880) +- Unify algorithm unit tests to avoid code duplication [\#3851](https://github.com/kokkos/kokkos/pull/3851) +- DualView.template view() better matches for Devices in UVMSpace cases [\#3857](https://github.com/kokkos/kokkos/pull/3857) +- More extensive disentangling of Policy Traits 
[\#3829](https://github.com/kokkos/kokkos/pull/3829) +- Replaced nanosleep and sched_yield with STL routines [\#3825](https://github.com/kokkos/kokkos/pull/3825) +- Constructing Atomic Subviews [\#3810](https://github.com/kokkos/kokkos/pull/3810) +- Metadata Declaration in Core [\#3729](https://github.com/kokkos/kokkos/pull/3729) +- Allow using tagged final functor in parallel_reduce [\#3714](https://github.com/kokkos/kokkos/pull/3714) +- Major duplicate code removal in SharedAllocationRecord specializations [\#3658](https://github.com/kokkos/kokkos/pull/3658) + +**Fixed bugs:** +- Provide forward declarations in Kokkos_ViewLayoutTiled.hpp for XL [\#3911](https://github.com/kokkos/kokkos/pull/3911) +- Fixup absolute value of floating points in Kokkos complex [\#3882](https://github.com/kokkos/kokkos/pull/3882) +- Address intel 17 ICE [\#3881](https://github.com/kokkos/kokkos/pull/3881) +- Add missing pow(Kokkos::complex) overloads [\#3868](https://github.com/kokkos/kokkos/pull/3868) +- Fix bug {pow, log}(Kokkos::complex) [\#3866](https://github.com/kokkos/kokkos/pull/3866) +- Cleanup writing to output streams in Cuda [\#3859](https://github.com/kokkos/kokkos/pull/3859) +- Fixup cache CUDA fallback execution space instance used by DualView::sync [\#3856](https://github.com/kokkos/kokkos/pull/3856) +- Fix cmake warning with pthread [\#3854](https://github.com/kokkos/kokkos/pull/3854) +- Fix typo FOUND_CUDA_{DRIVVER -> DRIVER} [\#3852](https://github.com/kokkos/kokkos/pull/3852) +- Fix bug in SYCL team_reduce [\#3848](https://github.com/kokkos/kokkos/pull/3848) +- Atrocious bug in MDRange tuning [\#3803](https://github.com/kokkos/kokkos/pull/3803) +- Fix compiling SYCL with Kokkos_ENABLE_TUNING=ON [\#3800](https://github.com/kokkos/kokkos/pull/3800) +- Fixed command line parsing bug [\#3797](https://github.com/kokkos/kokkos/pull/3797) +- Workaround race condition in SYCL parallel_reduce
[\#3782](https://github.com/kokkos/kokkos/pull/3782) +- Fix Atomic{Min,Max} for Kepler30 [\#3780](https://github.com/kokkos/kokkos/pull/3780) +- Fix SYCL typo [\#3755](https://github.com/kokkos/kokkos/pull/3755) +- Fixed Kokkos_install_additional_files macro [\#3752](https://github.com/kokkos/kokkos/pull/3752) +- Fix a typo for Kokkos_ARCH_A64FX [\#3751](https://github.com/kokkos/kokkos/pull/3751) +- OpenMPTarget: fixes and workarounds to work with "Release" build type [\#3748](https://github.com/kokkos/kokkos/pull/3748) +- Fix parsing bug for number of devices command line argument [\#3724](https://github.com/kokkos/kokkos/pull/3724) +- Avoid more warnings with clang and C++20 [\#3719](https://github.com/kokkos/kokkos/pull/3719) +- Fix gcc-10.1 C++20 warnings [\#3718](https://github.com/kokkos/kokkos/pull/3718) +- Fix cuda cache config not being set correct [\#3712](https://github.com/kokkos/kokkos/pull/3712) +- Fix dualview deepcopy perftools [\#3701](https://github.com/kokkos/kokkos/pull/3701) +- use drand instead of frand in drand [\#3696](https://github.com/kokkos/kokkos/pull/3696) + +**Incompatibilities:** +- Remove unimplemented member functions of SYCLDevice [\#3919](https://github.com/kokkos/kokkos/pull/3919) +- Replace cl::sycl [\#3896](https://github.com/kokkos/kokkos/pull/3896) +- Get rid of SYCL workaround in Kokkos_Complex.hpp [\#3884](https://github.com/kokkos/kokkos/pull/3884) +- Replace most uses of if_c [\#3883](https://github.com/kokkos/kokkos/pull/3883) +- Remove Impl::enable_if_type [\#3863](https://github.com/kokkos/kokkos/pull/3863) +- Remove HostBarrier test [\#3847](https://github.com/kokkos/kokkos/pull/3847) +- Avoid (void) interface [\#3836](https://github.com/kokkos/kokkos/pull/3836) +- Remove VerifyExecutionCanAccessMemorySpace [\#3813](https://github.com/kokkos/kokkos/pull/3813) +- Avoid duplicated code in ScratchMemorySpace [\#3793](https://github.com/kokkos/kokkos/pull/3793) +- Remove superfluous FunctorFinal specialization 
[\#3788](https://github.com/kokkos/kokkos/pull/3788) +- Rename cl::sycl -> sycl in Kokkos_MathematicalFunctions.hpp [\#3678](https://github.com/kokkos/kokkos/pull/3678) +- Remove integer_sequence backward compatibility implementation [\#3533](https://github.com/kokkos/kokkos/pull/3533) + +**Enabled tests:** +- Fixup re-enable core performance tests [\#3903](https://github.com/kokkos/kokkos/pull/3903) +- Enable more SYCL tests [\#3900](https://github.com/kokkos/kokkos/pull/3900) +- Restrict MDRange Policy tests for Intel GPUs [\#3853](https://github.com/kokkos/kokkos/pull/3853) +- Disable death tests for rawhide [\#3844](https://github.com/kokkos/kokkos/pull/3844) +- OpenMPTarget: Block unit tests that do not pass with the nvidia compiler [\#3839](https://github.com/kokkos/kokkos/pull/3839) +- Enable Bitset container test for SYCL [\#3830](https://github.com/kokkos/kokkos/pull/3830) +- Enable some more SYCL tests [\#3744](https://github.com/kokkos/kokkos/pull/3744) +- Enable SYCL atomic tests [\#3742](https://github.com/kokkos/kokkos/pull/3742) +- Enable more SYCL perf_tests [\#3692](https://github.com/kokkos/kokkos/pull/3692) +- Enable examples for SYCL [\#3691](https://github.com/kokkos/kokkos/pull/3691) + ## [3.3.01](https://github.com/kokkos/kokkos/tree/3.3.01) (2021-01-06) [Full Changelog](https://github.com/kokkos/kokkos/compare/3.3.00...3.3.01) diff --git a/packages/kokkos/CMakeLists.txt b/packages/kokkos/CMakeLists.txt index 7bc3c7725648..6fc1bf7d2f7f 100644 --- a/packages/kokkos/CMakeLists.txt +++ b/packages/kokkos/CMakeLists.txt @@ -72,7 +72,7 @@ ENDFUNCTION() LIST(APPEND CMAKE_MODULE_PATH cmake/Modules) IF(NOT KOKKOS_HAS_TRILINOS) - cmake_minimum_required(VERSION 3.10 FATAL_ERROR) + cmake_minimum_required(VERSION 3.16 FATAL_ERROR) set(CMAKE_DISABLE_SOURCE_CHANGES ON) set(CMAKE_DISABLE_IN_SOURCE_BUILD ON) IF (Spack_WORKAROUND) @@ -111,27 +111,25 @@ ENDIF() set(Kokkos_VERSION_MAJOR 3) -set(Kokkos_VERSION_MINOR 3) -set(Kokkos_VERSION_PATCH 1) 
+set(Kokkos_VERSION_MINOR 4) +set(Kokkos_VERSION_PATCH 00) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") -IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.12.0") - MESSAGE(STATUS "Setting policy CMP0074 to use _ROOT variables") - CMAKE_POLICY(SET CMP0074 NEW) -ENDIF() +MESSAGE(STATUS "Setting policy CMP0074 to use _ROOT variables") +CMAKE_POLICY(SET CMP0074 NEW) # Load either the real TriBITS or a TriBITS wrapper # for certain utility functions that are universal (like GLOBAL_SET) INCLUDE(${KOKKOS_SRC_PATH}/cmake/fake_tribits.cmake) -IF (Kokkos_ENABLE_CUDA AND ${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.14.0") - #If we are building CUDA, we have tricked CMake because we declare a CXX project - #If the default C++ standard for a given compiler matches the requested - #standard, then CMake just omits the -std flag in later versions of CMake - #This breaks CUDA compilation (CUDA compiler can have a different default - #-std then the underlying host compiler by itself). Setting this variable - #forces CMake to always add the -std flag even if it thinks it doesn't need it +IF (Kokkos_ENABLE_CUDA) + # If we are building CUDA, we have tricked CMake because we declare a CXX project + # If the default C++ standard for a given compiler matches the requested + # standard, then CMake just omits the -std flag in later versions of CMake + # This breaks CUDA compilation (CUDA compiler can have a different default + # -std then the underlying host compiler by itself). 
Setting this variable + # forces CMake to always add the -std flag even if it thinks it doesn't need it GLOBAL_SET(CMAKE_CXX_STANDARD_DEFAULT 98) ENDIF() @@ -139,15 +137,19 @@ ENDIF() # I really wish these were regular variables # but scoping issues can make it difficult GLOBAL_SET(KOKKOS_COMPILE_OPTIONS) -GLOBAL_SET(KOKKOS_LINK_OPTIONS -DKOKKOS_DEPENDENCE) +GLOBAL_SET(KOKKOS_LINK_OPTIONS) GLOBAL_SET(KOKKOS_CUDA_OPTIONS) GLOBAL_SET(KOKKOS_CUDAFE_OPTIONS) GLOBAL_SET(KOKKOS_XCOMPILER_OPTIONS) # We need to append text here for making sure TPLs # we import are available for an installed Kokkos GLOBAL_SET(KOKKOS_TPL_EXPORTS) -# this could probably be scoped to project +# KOKKOS_DEPENDENCE is used by kokkos_launch_compiler GLOBAL_SET(KOKKOS_COMPILE_DEFINITIONS KOKKOS_DEPENDENCE) +# MSVC never goes through kokkos_launch_compiler +IF(NOT MSVC) + GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -DKOKKOS_DEPENDENCE) +ENDIF() # Include a set of Kokkos-specific wrapper functions that # will either call raw CMake or TriBITS diff --git a/packages/kokkos/Makefile.kokkos b/packages/kokkos/Makefile.kokkos index 3b6a5ff4368c..2599121d70ad 100644 --- a/packages/kokkos/Makefile.kokkos +++ b/packages/kokkos/Makefile.kokkos @@ -1,8 +1,8 @@ # Default settings common options. 
KOKKOS_VERSION_MAJOR = 3 -KOKKOS_VERSION_MINOR = 3 -KOKKOS_VERSION_PATCH = 1 +KOKKOS_VERSION_MINOR = 4 +KOKKOS_VERSION_PATCH = 00 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) # Options: Cuda,HIP,OpenMP,Pthread,Serial @@ -10,7 +10,7 @@ KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MIN KOKKOS_DEVICES ?= "Pthread" # Options: # Intel: KNC,KNL,SNB,HSW,BDW,SKX -# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80 +# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86 # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX # IBM: BGQ,Power7,Power8,Power9 # AMD-GPUS: Vega900,Vega906,Vega908 @@ -154,17 +154,17 @@ KOKKOS_INTERNAL_OS_DARWIN := $(call kokkos_has_string,$(KOKKOS_OS),Darwin) KOKKOS_CXX_VERSION := $(strip $(shell $(CXX) --version 2>&1)) KOKKOS_INTERNAL_COMPILER_INTEL := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Intel Corporation) KOKKOS_INTERNAL_COMPILER_PGI := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),PGI) -KOKKOS_INTERNAL_COMPILER_XL := $(strip $(shell $(CXX) -qversion 2>&1 | grep XL | wc -l)) -KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep "CC-" | wc -l)) -KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell echo "$(shell export OMPI_CXX=$(OMPI_CXX); export MPICH_CXX=$(MPICH_CXX); $(CXX) --version 2>&1 | grep nvcc | wc -l)>0" | bc)) +KOKKOS_INTERNAL_COMPILER_XL := $(strip $(shell $(CXX) -qversion 2>&1 | grep -c XL)) +KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -c "CC-")) +KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell echo "$(shell export OMPI_CXX=$(OMPI_CXX); export MPICH_CXX=$(MPICH_CXX); $(CXX) --version 2>&1 | grep -c nvcc)>0" | bc)) KOKKOS_INTERNAL_COMPILER_CLANG := $(call 
kokkos_has_string,$(KOKKOS_CXX_VERSION),clang) -KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple LLVM) +KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple clang) KOKKOS_INTERNAL_COMPILER_HCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC) KOKKOS_INTERNAL_COMPILER_GCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),GCC) # Check Host Compiler if using NVCC through nvcc_wrapper ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) - KOKKOS_INTERNAL_COMPILER_NVCC_WRAPPER := $(strip $(shell echo $(CXX) | grep nvcc_wrapper | wc -l)) + KOKKOS_INTERNAL_COMPILER_NVCC_WRAPPER := $(strip $(shell echo $(CXX) | grep -c nvcc_wrapper)) ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC_WRAPPER), 1) KOKKOS_CXX_HOST_VERSION := $(strip $(shell $(CXX) $(CXXFLAGS) --host-version 2>&1)) @@ -287,11 +287,11 @@ else #KOKKOS_INTERNAL_CXX1Z_FLAG := -hstd=c++1z #KOKKOS_INTERNAL_CXX2A_FLAG := -hstd=c++2a else - KOKKOS_INTERNAL_CXX14_FLAG := --std=c++14 - KOKKOS_INTERNAL_CXX1Y_FLAG := --std=c++1y - KOKKOS_INTERNAL_CXX17_FLAG := --std=c++17 - KOKKOS_INTERNAL_CXX1Z_FLAG := --std=c++1z - KOKKOS_INTERNAL_CXX2A_FLAG := --std=c++2a + KOKKOS_INTERNAL_CXX14_FLAG := -std=c++14 + KOKKOS_INTERNAL_CXX1Y_FLAG := -std=c++1y + KOKKOS_INTERNAL_CXX17_FLAG := -std=c++17 + KOKKOS_INTERNAL_CXX1Z_FLAG := -std=c++1z + KOKKOS_INTERNAL_CXX2A_FLAG := -std=c++2a endif endif endif @@ -322,6 +322,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA70 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volta72) KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75) KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80) +KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86) KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \ + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \ + 
$(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \ @@ -334,7 +335,8 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70) \ + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \ + $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \ - + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80)) + + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86)) #SEK: This seems like a bug to me ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) @@ -575,10 +577,10 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_PRINT), 1) endif ifeq ($(KOKKOS_INTERNAL_ENABLE_TUNING), 1) - tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_TUNING") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_TUNING") endif -tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_LIBDL") +tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_LIBDL") ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1) ifneq ($(KOKKOS_CMAKE), yes) @@ -742,6 +744,14 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_A64FX), 1) KOKKOS_CXXFLAGS += -march=armv8.2-a+sve KOKKOS_LDFLAGS += -march=armv8.2-a+sve + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_CXXFLAGS += -msve-vector-bits=512 + KOKKOS_LDFLAGS += -msve-vector-bits=512 + endif + ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1) + KOKKOS_CXXFLAGS += -msve-vector-bits=512 + KOKKOS_LDFLAGS += -msve-vector-bits=512 + endif endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN), 1) @@ -1090,6 +1100,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE80") KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_80 endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE86), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86") + KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86 + endif ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) KOKKOS_CXXFLAGS += 
$(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) @@ -1149,7 +1164,7 @@ endif KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>&1) ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h) - KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l)) + KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep -c define)) else KOKKOS_INTERNAL_NEW_CONFIG := 1 endif @@ -1171,41 +1186,41 @@ tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp") tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp") ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_SetupBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_SetupBackend.hpp") ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1) else endif endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) 
- tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_SetupBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_SetupBackend.hpp") endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) - tmp := $(call kokkos_append_config_header,"\#include 
","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp) @@ -1324,7 +1339,7 @@ ifneq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) endif # With Cygwin functions such as fdopen and fileno are not defined -# when strict ansi is enabled. strict ansi gets enabled with --std=c++14 +# when strict ansi is enabled. strict ansi gets enabled with -std=c++14 # though. So we hard undefine it here. Not sure if that has any bad side effects # This is needed for gtest actually, not for Kokkos itself! 
ifeq ($(KOKKOS_INTERNAL_OS_CYGWIN), 1) diff --git a/packages/kokkos/Makefile.targets b/packages/kokkos/Makefile.targets index 5a03f7d17e94..cf9fc242420e 100644 --- a/packages/kokkos/Makefile.targets +++ b/packages/kokkos/Makefile.targets @@ -36,6 +36,8 @@ Kokkos_MemorySpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_ $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp Kokkos_HostSpace_deepcopy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp +Kokkos_NumericTraits.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) Kokkos_Cuda_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp diff --git a/packages/kokkos/algorithms/src/Kokkos_Random.hpp b/packages/kokkos/algorithms/src/Kokkos_Random.hpp index 69d6cf8f35ea..904cf5ccb967 100644 --- a/packages/kokkos/algorithms/src/Kokkos_Random.hpp +++ b/packages/kokkos/algorithms/src/Kokkos_Random.hpp @@ -668,6 +668,25 @@ struct Random_UniqueIndex { }; #endif +#ifdef KOKKOS_ENABLE_SYCL +template <> +struct Random_UniqueIndex { + using locks_view_type = View; + KOKKOS_FUNCTION + static int get_state_idx(const locks_view_type& locks_) { +#ifdef KOKKOS_ARCH_INTEL_GEN + int i = Kokkos::Impl::clock_tic() % locks_.extent(0); +#else + int i = 0; +#endif + while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) { + i = (i + 1) % static_cast(locks_.extent(0)); + } + return i; + } +}; +#endif + } // namespace Impl template @@ -1028,7 +1047,7 @@ class Random_XorShift1024 { KOKKOS_INLINE_FUNCTION double drand(const double& start, const double& end) { - return frand(end - start) + start; + return 
drand(end - start) + start; } // Marsaglia polar method for drawing a standard normal distributed random diff --git a/packages/kokkos/algorithms/unit_tests/CMakeLists.txt b/packages/kokkos/algorithms/unit_tests/CMakeLists.txt index 819c9e54bae4..9109837985a9 100644 --- a/packages/kokkos/algorithms/unit_tests/CMakeLists.txt +++ b/packages/kokkos/algorithms/unit_tests/CMakeLists.txt @@ -3,6 +3,7 @@ KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) +KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) SET(GTEST_SOURCE_DIR ${${PARENT_PACKAGE_NAME}_SOURCE_DIR}/tpls/gtest) @@ -25,7 +26,7 @@ KOKKOS_ADD_TEST_LIBRARY( TARGET_COMPILE_DEFINITIONS(kokkosalgorithms_gtest PUBLIC GTEST_HAS_TR1_TUPLE=0 GTEST_HAS_PTHREAD=0) IF((NOT (Kokkos_ENABLE_CUDA AND WIN32)) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu"))) -TARGET_COMPILE_FEATURES(kokkosalgorithms_gtest PUBLIC cxx_std_11) + TARGET_COMPILE_FEATURES(kokkosalgorithms_gtest PUBLIC cxx_std_14) ENDIF() # Suppress clang-tidy diagnostics on code that we do not have control over @@ -33,51 +34,42 @@ IF(CMAKE_CXX_CLANG_TIDY) SET_TARGET_PROPERTIES(kokkosalgorithms_gtest PROPERTIES CXX_CLANG_TIDY "") ENDIF() -SET(SOURCES - UnitTestMain.cpp -) +SET(ALGORITHM UnitTestMain.cpp) IF(Kokkos_ENABLE_OPENMP) - LIST( APPEND SOURCES - TestOpenMP.cpp + LIST(APPEND ALGORITHM_SOURCES TestOpenMP_Sort1D.cpp TestOpenMP_Sort3D.cpp TestOpenMP_SortDynamicView.cpp - TestOpenMP_Random.cpp - ) -ENDIF() - -IF(Kokkos_ENABLE_HIP) - LIST( APPEND SOURCES - TestHIP.cpp ) ENDIF() -IF(Kokkos_ENABLE_CUDA) - LIST( APPEND SOURCES - TestCuda.cpp - ) -ENDIF() - -IF(Kokkos_ENABLE_HPX) - LIST( APPEND SOURCES - TestHPX.cpp - ) -ENDIF() - -IF(Kokkos_ENABLE_SERIAL) - LIST( APPEND SOURCES - TestSerial.cpp - ) -ENDIF() - -IF(Kokkos_ENABLE_PTHREAD) - LIST( APPEND SOURCES - 
TestThreads.cpp - ) -ENDIF() +foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL) + # Because there is always an exception to the rule + if(Tag STREQUAL "Threads") + set(DEVICE "PTHREAD") + else() + string(TOUPPER ${Tag} DEVICE) + endif() + + if(Kokkos_ENABLE_${DEVICE}) + set(dir ${CMAKE_CURRENT_BINARY_DIR}) + set(file ${dir}/Test${Tag}.cpp) + # Write to a temporary intermediate file and call configure_file to avoid + # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. + file(WRITE ${dir}/dummy.cpp + "#include \n" + "#include \n" + "#include \n" + ) + configure_file(${dir}/dummy.cpp ${file}) + list(APPEND ALGORITHM_SOURCES ${file}) + endif() +endforeach() KOKKOS_ADD_EXECUTABLE_AND_TEST( UnitTest - SOURCES ${SOURCES} + SOURCES + UnitTestMain.cpp + ${ALGORITHM_SOURCES} ) diff --git a/packages/kokkos/algorithms/unit_tests/Makefile b/packages/kokkos/algorithms/unit_tests/Makefile index c112d7c6fcad..dd0aa87de0b2 100644 --- a/packages/kokkos/algorithms/unit_tests/Makefile +++ b/packages/kokkos/algorithms/unit_tests/Makefile @@ -20,11 +20,19 @@ override LDFLAGS += -lpthread include $(KOKKOS_PATH)/Makefile.kokkos -KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/algorithms/unit_tests +KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/algorithms/unit_tests -I${KOKKOS_PATH}/core/unit_test/category_files TEST_TARGETS = TARGETS = +tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ + $(if $(filter Test$(device).cpp, $(shell ls Test$(device).cpp 2>/dev/null)),,\ + $(shell echo "\#include " > Test$(device).cpp); \ + $(shell echo "\#include " >> Test$(device).cpp); \ + $(shell echo "\#include " >> Test$(device).cpp); \ + ) \ +) + ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) OBJ_CUDA = TestCuda.o UnitTestMain.o gtest-all.o TARGETS += KokkosAlgorithms_UnitTest_Cuda @@ -44,7 +52,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) - OBJ_OPENMP = TestOpenMP.o TestOpenMP_Random.o TestOpenMP_Sort1D.o TestOpenMP_Sort3D.o 
TestOpenMP_SortDynamicView.o UnitTestMain.o gtest-all.o + OBJ_OPENMP = TestOpenMP.o TestOpenMP_Sort1D.o TestOpenMP_Sort3D.o TestOpenMP_SortDynamicView.o UnitTestMain.o gtest-all.o TARGETS += KokkosAlgorithms_UnitTest_OpenMP TEST_TARGETS += test-openmp endif diff --git a/packages/kokkos/algorithms/unit_tests/TestOpenMP_Sort1D.cpp b/packages/kokkos/algorithms/unit_tests/TestOpenMP_Sort1D.cpp index a9b2010ad025..4a5839f0c80a 100644 --- a/packages/kokkos/algorithms/unit_tests/TestOpenMP_Sort1D.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestOpenMP_Sort1D.cpp @@ -59,6 +59,8 @@ TEST(openmp, SortUnsigned1D) { Impl::test_1D_sort(171); } +TEST(openmp, SortIssue1160) { Impl::test_issue_1160_sort(); } + } // namespace Test #else void KOKKOS_ALGORITHMS_UNITTESTS_TESTOPENMP_PREVENT_LINK_ERROR() {} diff --git a/packages/kokkos/algorithms/unit_tests/TestRandom.hpp b/packages/kokkos/algorithms/unit_tests/TestRandom.hpp index caba92c152fa..1f14875096dd 100644 --- a/packages/kokkos/algorithms/unit_tests/TestRandom.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestRandom.hpp @@ -491,6 +491,34 @@ void test_random(unsigned int num_draws) { } } // namespace Impl +template +void test_random_xorshift64() { +#if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_CUDA) || \ + defined(KOKKOS_ENABLE_HIP) + const int num_draws = 132141141; +#else // SERIAL, HPX, OPENMP + const int num_draws = 10240000; +#endif + Impl::test_random>(num_draws); + Impl::test_random>>( + num_draws); +} + +template +void test_random_xorshift1024() { +#if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_CUDA) || \ + defined(KOKKOS_ENABLE_HIP) + const int num_draws = 52428813; +#else // SERIAL, HPX, OPENMP + const int num_draws = 10130144; +#endif + Impl::test_random>( + num_draws); + Impl::test_random>>( + num_draws); +} } // namespace Test #endif // KOKKOS_TEST_UNORDERED_MAP_HPP diff --git a/packages/kokkos/algorithms/unit_tests/TestRandomCommon.hpp 
b/packages/kokkos/algorithms/unit_tests/TestRandomCommon.hpp new file mode 100644 index 000000000000..c6d3b59ae1f1 --- /dev/null +++ b/packages/kokkos/algorithms/unit_tests/TestRandomCommon.hpp @@ -0,0 +1,60 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_ALGORITHMS_UNITTESTS_TESTRANDOM_COMMON_HPP +#define KOKKOS_ALGORITHMS_UNITTESTS_TESTRANDOM_COMMON_HPP + +#include + +namespace Test { + +TEST(TEST_CATEGORY, Random_XorShift64) { + test_random_xorshift64(); +} +TEST(TEST_CATEGORY, Random_XorShift1024_0) { + test_random_xorshift1024(); +} +} // namespace Test + +#endif diff --git a/packages/kokkos/containers/unit_tests/TestCuda_Category.hpp b/packages/kokkos/algorithms/unit_tests/TestSortCommon.hpp similarity index 88% rename from packages/kokkos/containers/unit_tests/TestCuda_Category.hpp rename to packages/kokkos/algorithms/unit_tests/TestSortCommon.hpp index 50935d7a34d1..56657b6574b8 100644 --- a/packages/kokkos/containers/unit_tests/TestCuda_Category.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestSortCommon.hpp @@ -42,10 +42,14 @@ //@HEADER */ -#ifndef KOKKOS_TEST_CUDA_HPP -#define KOKKOS_TEST_CUDA_HPP +#ifndef KOKKOS_ALGORITHMS_UNITTESTS_TESTSORT_COMMON_HPP +#define KOKKOS_ALGORITHMS_UNITTESTS_TESTSORT_COMMON_HPP -#define TEST_CATEGORY cuda -#define TEST_EXECSPACE Kokkos::Cuda +#include +namespace Test { +TEST(TEST_CATEGORY, SortUnsigned) { + Impl::test_sort(171); +} +} // namespace Test #endif diff --git a/packages/kokkos/appveyor.yml b/packages/kokkos/appveyor.yml index c40bf066b7a9..e8763c0b665c 100644 
--- a/packages/kokkos/appveyor.yml +++ b/packages/kokkos/appveyor.yml @@ -3,8 +3,4 @@ image: clone_folder: c:\projects\source build_script: - cmd: >- - mkdir build && - cd build && - cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON && - cmake --build . --target install && - ctest -C Debug -V + cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc /d1reportClassLayoutChanges" -DCTEST_ARGS="-C Debug -V --output-on-failure" -DBUILD_NAME=MSVC-2019 -DBUILD_TYPE=Debug -DSITE=AppVeyor -DTARGET=install -P cmake/KokkosCI.cmake diff --git a/packages/kokkos/bin/kokkos_launch_compiler b/packages/kokkos/bin/kokkos_launch_compiler index 1fbebf648fa0..d929d24f1dca 100755 --- a/packages/kokkos/bin/kokkos_launch_compiler +++ b/packages/kokkos/bin/kokkos_launch_compiler @@ -13,6 +13,17 @@ # $1 are 'ar', 'cmake', etc. during the linking phase # +# emit a message about the underlying command executed +: ${DEBUG:=0} +: ${KOKKOS_DEBUG_LAUNCH_COMPILER:=${DEBUG}} + +debug-message() +{ + if [ "${KOKKOS_DEBUG_LAUNCH_COMPILER}" -ne 0 ]; then + echo -e "##### $(basename ${BASH_SOURCE[0]}) executing: \"$@\"... #####" + fi +} + # check the arguments for the KOKKOS_DEPENDENCE compiler definition KOKKOS_DEPENDENCE=0 for i in ${@} @@ -23,16 +34,30 @@ do fi done -# if C++ is not passed, someone is probably trying to invoke it directly +# if Kokkos compiler is not passed, someone is probably trying to invoke it directly if [ -z "${1}" ]; then - echo -e "\n${BASH_SOURCE[0]} was invoked without the C++ compiler as the first argument." + echo -e "\n${BASH_SOURCE[0]} was invoked without the Kokkos compiler as the first argument." 
echo "This script is not indended to be directly invoked by any mechanism other" - echo -e "than through a RULE_LAUNCH_COMPILE or RULE_LAUNCH_LINK property set in CMake\n" + echo -e "than through a RULE_LAUNCH_COMPILE or RULE_LAUNCH_LINK property set in CMake.\n" + exit 1 +fi + +# if C++ compiler is not passed, someone is probably trying to invoke it directly +if [ -z "${2}" ]; then + echo -e "\n${BASH_SOURCE[0]} was invoked without the C++ compiler as the second argument." + echo "This script is not indended to be directly invoked by any mechanism other" + echo -e "than through a RULE_LAUNCH_COMPILE or RULE_LAUNCH_LINK property set in CMake.\n" exit 1 fi # if there aren't two args, this isn't necessarily invalid, just a bit strange -if [ -z "${2}" ]; then exit 0; fi +if [ -z "${3}" ]; then exit 0; fi + +# store the Kokkos compiler +KOKKOS_COMPILER=${1} + +# remove the Kokkos compiler from the arguments +shift # store the expected C++ compiler CXX_COMPILER=${1} @@ -40,48 +65,57 @@ CXX_COMPILER=${1} # remove the expected C++ compiler from the arguments shift -# after the above shift, $1 is now the exe for the compile or link command, e.g. -# kokkos_launch_compiler g++ gcc -c file.c -o file.o +# NOTE: in below, ${KOKKOS_COMPILER} is usually nvcc_wrapper +# +# after the above shifts, $1 is now the exe for the compile or link command, e.g. +# kokkos_launch_compiler ${KOKKOS_COMPILER} g++ gcc -c file.c -o file.o # becomes: # kokkos_launch_compiler gcc -c file.c -o file.o -# Check to see if the executable is the C++ compiler and if it is not, then +# We check to see if the executable is the C++ compiler and if it is not, then # just execute the command.
# # Summary: -# kokkos_launch_compiler g++ gcc -c file.c -o file.o +# kokkos_launch_compiler ${KOKKOS_COMPILER} g++ gcc -c file.c -o file.o # results in this command being executed: # gcc -c file.c -o file.o # and -# kokkos_launch_compiler g++ g++ -c file.cpp -o file.o +# kokkos_launch_compiler ${KOKKOS_COMPILER} g++ g++ -c file.cpp -o file.o # results in this command being executed: -# nvcc_wrapper -c file.cpp -o file.o +# ${KOKKOS_COMPILER} -c file.cpp -o file.o if [[ "${KOKKOS_DEPENDENCE}" -eq "0" || "${CXX_COMPILER}" != "${1}" ]]; then - # the command does not depend on Kokkos so just execute the command w/o re-directing to nvcc_wrapper + debug-message $@ + # the command does not depend on Kokkos so just execute the command w/o re-directing to ${KOKKOS_COMPILER} eval $@ else - # the executable is the C++ compiler, so we need to re-direct to nvcc_wrapper + # the executable is the C++ compiler, so we need to re-direct to ${KOKKOS_COMPILER} + if [ ! -f "${KOKKOS_COMPILER}" ]; then + echo -e "\nError: the compiler redirect for Kokkos was not found at ${KOKKOS_COMPILER}\n" + exit 1 + fi # find the nvcc_wrapper from the same build/install NVCC_WRAPPER="$(dirname ${BASH_SOURCE[0]})/nvcc_wrapper" + if [ "${KOKKOS_COMPILER}" = "${NVCC_WRAPPER}" ]; then + # this should only be valid in the install tree -- it will be set to CMAKE_CXX_COMPILER used using Kokkos installation + if [ -z $(echo "@NVCC_WRAPPER_DEFAULT_COMPILER@" | grep 'NVCC_WRAPPER_DEFAULT_COMPILER') ]; then + : ${NVCC_WRAPPER_DEFAULT_COMPILER:="@NVCC_WRAPPER_DEFAULT_COMPILER@"} + fi - if [ -z "${NVCC_WRAPPER}" ]; then - echo -e "\nError: nvcc_wrapper not found in $(dirname ${BASH_SOURCE[0]}).\n" - exit 1 - fi + # set default nvcc wrapper compiler if not specified + : ${NVCC_WRAPPER_DEFAULT_COMPILER:=${CXX_COMPILER}} + export NVCC_WRAPPER_DEFAULT_COMPILER - # set default nvcc wrapper compiler if not specified - : ${NVCC_WRAPPER_DEFAULT_COMPILER:=${CXX_COMPILER}} - export NVCC_WRAPPER_DEFAULT_COMPILER - - # 
calling itself will cause an infinitely long build - if [ "${NVCC_WRAPPER}" = "${NVCC_WRAPPER_DEFAULT_COMPILER}" ]; then - echo -e "\nError: NVCC_WRAPPER == NVCC_WRAPPER_DEFAULT_COMPILER. Terminating to avoid infinite loop!\n" - exit 1 + # nvcc_wrapper calling itself will cause an infinitely long build + if [ "${NVCC_WRAPPER}" = "${NVCC_WRAPPER_DEFAULT_COMPILER}" ]; then + echo -e "\nError: NVCC_WRAPPER == NVCC_WRAPPER_DEFAULT_COMPILER. Terminating to avoid infinite loop!\n" + exit 1 + fi fi # discard the compiler from the command shift - # execute nvcc_wrapper - ${NVCC_WRAPPER} $@ + debug-message ${KOKKOS_COMPILER} $@ + # execute ${KOKKOS_COMPILER} (again, usually nvcc_wrapper) + ${KOKKOS_COMPILER} $@ fi diff --git a/packages/kokkos/bin/nvcc_wrapper b/packages/kokkos/bin/nvcc_wrapper index 4ecf4c66d5a0..5556e888e34b 100755 --- a/packages/kokkos/bin/nvcc_wrapper +++ b/packages/kokkos/bin/nvcc_wrapper @@ -191,11 +191,11 @@ do shift ;; #Handle known nvcc args - --dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-Xptxas*|--fmad*|--Wext-lambda-captures-this|-Wext-lambda-captures-this) + --dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-Xptxas*|--fmad*|--use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) cuda_args="$cuda_args $1" ;; #Handle more known nvcc args - --expt-extended-lambda|--expt-relaxed-constexpr) + --expt-extended-lambda|--expt-relaxed-constexpr|--Wno-deprecated-gpu-targets|-Wno-deprecated-gpu-targets) cuda_args="$cuda_args $1" ;; #Handle known nvcc args that have an argument diff --git a/packages/kokkos/cmake/CTestConfig.cmake.in b/packages/kokkos/cmake/CTestConfig.cmake.in new file mode 100644 index 000000000000..1f82c0d64d15 --- /dev/null +++ b/packages/kokkos/cmake/CTestConfig.cmake.in @@ -0,0 +1,91 @@ 
+#----------------------------------------------------------------------------------------# +# +# CTestConfig.cmake template for Kokkos +# +#----------------------------------------------------------------------------------------# + +# +# dash-board related +# +set(CTEST_PROJECT_NAME "Kokkos") +set(CTEST_NIGHTLY_START_TIME "01:00:00 UTC") +set(CTEST_DROP_METHOD "https") +set(CTEST_DROP_SITE "cdash.nersc.gov") +set(CTEST_DROP_LOCATION "/submit.php?project=${CTEST_PROJECT_NAME}") +set(CTEST_CDASH_VERSION "1.6") +set(CTEST_CDASH_QUERY_VERSION TRUE) +set(CTEST_SUBMIT_RETRY_COUNT "1") +set(CTEST_SUBMIT_RETRY_DELAY "30") + +# +# configure/build related +# +set(CTEST_BUILD_NAME "@BUILD_NAME@") +set(CTEST_MODEL "@MODEL@") +set(CTEST_SITE "@SITE@") +set(CTEST_CONFIGURATION_TYPE "@BUILD_TYPE@") +set(CTEST_SOURCE_DIRECTORY "@SOURCE_REALDIR@") +set(CTEST_BINARY_DIRECTORY "@BINARY_REALDIR@") + +# +# configure/build related +# +set(CTEST_UPDATE_TYPE "git") +set(CTEST_UPDATE_VERSION_ONLY ON) +# set(CTEST_GENERATOR "") +# set(CTEST_GENERATOR_PLATFORM "") + +# +# testing related +# +set(CTEST_TIMEOUT "7200") +set(CTEST_TEST_TIMEOUT "7200") +set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS "100") +set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS "100") +set(CTEST_CUSTOM_MAXIMUM_PASSED_TEST_OUTPUT_SIZE "1048576") + +# +# coverage related +# +set(CTEST_CUSTOM_COVERAGE_EXCLUDE ".*tpls/.*;/usr/.*;.*unit_test/.*;.*unit_tests/.*;.*perf_test/.*") + +# +# commands +# +if(NOT "@CHECKOUT_COMMAND@" STREQUAL "") + set(CTEST_CHECKOUT_COMMAND "@CHECKOUT_COMMAND@") +endif() +set(CTEST_UPDATE_COMMAND "@GIT_EXECUTABLE@") +set(CTEST_CONFIGURE_COMMAND "@CMAKE_COMMAND@ -DCMAKE_BUILD_TYPE=@BUILD_TYPE@ -DKokkos_ENABLE_TESTS=ON @CONFIG_ARGS@ @SOURCE_REALDIR@") +set(CTEST_BUILD_COMMAND "@CMAKE_COMMAND@ --build @BINARY_REALDIR@ --target @TARGET@") +if(NOT WIN32) + set(CTEST_BUILD_COMMAND "${CTEST_BUILD_COMMAND} -- -j@BUILD_JOBS@") +endif() +set(CTEST_COVERAGE_COMMAND "gcov") +set(CTEST_MEMORYCHECK_COMMAND "valgrind") 
+set(CTEST_GIT_COMMAND "@GIT_EXECUTABLE@") + +# +# various configs +# +set(APPEND_VALUE @APPEND@) +if(APPEND_VALUE) + set(APPEND_CTEST APPEND) +endif() +# forward a ctest option only when a value was configured: defines <VAR>_CTEST as "<VAR> <values...>" +macro(SET_TEST_PROP VAR) + if(NOT "${ARGN}" STREQUAL "") + set(${VAR}_CTEST ${VAR} ${ARGN}) + endif() +endmacro() + +set_test_prop(START @START@) +set_test_prop(END @END@) +set_test_prop(STRIDE @STRIDE@) +set_test_prop(INCLUDE @INCLUDE@) +set_test_prop(EXCLUDE @EXCLUDE@) +set_test_prop(INCLUDE_LABEL @INCLUDE_LABEL@) +set_test_prop(EXCLUDE_LABEL @EXCLUDE_LABEL@) +set_test_prop(PARALLEL_LEVEL @PARALLEL_LEVEL@) +set_test_prop(STOP_TIME @STOP_TIME@) +set_test_prop(COVERAGE_LABELS @LABELS@) diff --git a/packages/kokkos/cmake/KokkosCI.cmake b/packages/kokkos/cmake/KokkosCI.cmake new file mode 100644 index 000000000000..e8c9af37ad54 --- /dev/null +++ b/packages/kokkos/cmake/KokkosCI.cmake @@ -0,0 +1,350 @@ +cmake_minimum_required(VERSION 3.16 FATAL_ERROR) + +message(STATUS "") + +get_cmake_property(_cached_vars CACHE_VARIABLES) +set(KOKKOS_CMAKE_ARGS) +set(EXCLUDED_VARIABLES "CMAKE_COMMAND" "CMAKE_CPACK_COMMAND" "CMAKE_CTEST_COMMAND" "CMAKE_ROOT" + "CTEST_ARGS" "BUILD_NAME" "CMAKE_CXX_FLAGS" "CMAKE_BUILD_TYPE") +list(SORT _cached_vars) +foreach(_var ${_cached_vars}) + if(NOT "${_var}" IN_LIST EXCLUDED_VARIABLES) + list(APPEND KOKKOS_CMAKE_ARGS ${_var}) + if("${_var}" STREQUAL "CMAKE_BUILD_TYPE") + set(BUILD_TYPE "${CMAKE_BUILD_TYPE}") + endif() + endif() +endforeach() + + +#----------------------------------------------------------------------------------------# +# +# Macros and variables +# +#----------------------------------------------------------------------------------------# + +macro(CHECK_REQUIRED VAR) + if(NOT DEFINED ${VAR}) + message(FATAL_ERROR "Error!
Variable '${VAR}' must be defined") + endif() +endmacro() + +# require the build name variable +CHECK_REQUIRED(BUILD_NAME) + +# uses all args +macro(SET_DEFAULT VAR) + if(NOT DEFINED ${VAR}) + set(${VAR} ${ARGN}) + endif() + # remove these ctest configuration variables from the defines + # passed to the Kokkos configuration + if("${VAR}" IN_LIST KOKKOS_CMAKE_ARGS) + list(REMOVE_ITEM KOKKOS_CMAKE_ARGS "${VAR}") + endif() +endmacro() + +# uses first arg -- useful for selecting via priority from multiple +# potentially defined variables, e.g.: +# +# set_default_arg1(BUILD_NAME ${TRAVIS_BUILD_NAME} ${BUILD_NAME}) +# +macro(SET_DEFAULT_ARG1 VAR) + if(NOT DEFINED ${VAR}) + foreach(_ARG ${ARGN}) + if(NOT "${_ARG}" STREQUAL "") + set(${VAR} ${_ARG}) + break() + endif() + endforeach() + endif() + # remove these ctest configuration variables from the defines + # passed to the Kokkos configuration + if("${VAR}" IN_LIST KOKKOS_CMAKE_ARGS) + list(REMOVE_ITEM KOKKOS_CMAKE_ARGS "${VAR}") + endif() +endmacro() + +# determine the default working directory +if(NOT "$ENV{WORKSPACE}" STREQUAL "") + set(WORKING_DIR "$ENV{WORKSPACE}") +else() + get_filename_component(WORKING_DIR ${CMAKE_CURRENT_LIST_DIR} DIRECTORY) +endif() + +# determine the hostname +execute_process(COMMAND hostname + OUTPUT_VARIABLE HOSTNAME + OUTPUT_STRIP_TRAILING_WHITESPACE) + +SET_DEFAULT(HOSTNAME "$ENV{HOSTNAME}") + +# get the number of processors +include(ProcessorCount) +ProcessorCount(NUM_PROCESSORS) + +# find git +find_package(Git QUIET) +if(NOT GIT_EXECUTABLE) + unset(GIT_EXECUTABLE CACHE) + unset(GIT_EXECUTABLE) +endif() + +function(EXECUTE_GIT_COMMAND VAR) + set(${VAR} "" PARENT_SCOPE) + execute_process(COMMAND ${GIT_EXECUTABLE} ${ARGN} + OUTPUT_VARIABLE VAL + RESULT_VARIABLE RET + OUTPUT_STRIP_TRAILING_WHITESPACE + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} + ERROR_QUIET) + string(REPLACE ";" " " _CMD "${GIT_EXECUTABLE} ${ARGN}") + set(LAST_GIT_COMMAND "${_CMD}" PARENT_SCOPE) + if(RET EQUAL 0) + 
set(${VAR} "${VAL}" PARENT_SCOPE) + endif() +endfunction() + +# gets the git branch name if available (tries 'branch --show-current', then 'show -s --format=%D', then 'describe --all') +function(GET_GIT_BRANCH_NAME VAR) + execute_git_command(GIT_BRANCH branch --show-current) + set(_INVALID "%D" "HEAD") + if(NOT GIT_BRANCH OR "${GIT_BRANCH}" IN_LIST _INVALID) + execute_git_command(GIT_BRANCH show -s --format=%D) + if(NOT GIT_BRANCH OR "${GIT_BRANCH}" IN_LIST _INVALID) + execute_git_command(GIT_BRANCH describe --all) + endif() + endif() + # + if(GIT_BRANCH) + string(REPLACE " " ";" _DESC "${GIT_BRANCH}") + # just set it to last one via loop instead of wonky cmake index manip + foreach(_ITR ${_DESC}) + set(GIT_BRANCH "${_ITR}") + endforeach() + set(${VAR} "${GIT_BRANCH}" PARENT_SCOPE) + message(STATUS "GIT BRANCH via '${LAST_GIT_COMMAND}': ${GIT_BRANCH}") + endif() +endfunction() + +# just gets the git author name if available +function(GET_GIT_AUTHOR_NAME VAR) + execute_git_command(GIT_AUTHOR show -s --format=%an) + if(GIT_AUTHOR) + string(LENGTH "${GIT_AUTHOR}" STRLEN) + # if the build name gets too long, this can cause submission errors + if(STRLEN GREATER 24) + # remove middle initial + string(REGEX REPLACE " [A-Z]\\. " " " GIT_AUTHOR "${GIT_AUTHOR}") + # get first and sur name + string(REGEX REPLACE "([A-Za-z]+) ([A-Za-z]+)" "\\1" F_NAME "${GIT_AUTHOR}") + string(REGEX REPLACE "([A-Za-z]+) ([A-Za-z]+)" "\\2" S_NAME "${GIT_AUTHOR}") + if(S_NAME) + set(GIT_AUTHOR "${S_NAME}") + elseif(F_NAME) + set(GIT_AUTHOR "${F_NAME}") + endif() + endif() + # remove any spaces, quotes, periods, etc.
+ string(REGEX REPLACE "[ ',;_\.\"]+" "" GIT_AUTHOR "${GIT_AUTHOR}") + set(${VAR} "${GIT_AUTHOR}" PARENT_SCOPE) + message(STATUS "GIT AUTHOR via '${LAST_GIT_COMMAND}': ${GIT_AUTHOR}") + endif() +endfunction() + +# get the name of the branch +GET_GIT_BRANCH_NAME(GIT_BRANCH) +# get the name of the author +GET_GIT_AUTHOR_NAME(GIT_AUTHOR) +# author, prefer git method for consistency +SET_DEFAULT_ARG1(AUTHOR ${GIT_AUTHOR} $ENV{GIT_AUTHOR} $ENV{AUTHOR}) +# SLUG == owner_name/repo_name +SET_DEFAULT_ARG1(SLUG $ENV{TRAVIS_PULL_REQUEST_SLUG} $ENV{TRAVIS_REPO_SLUG} $ENV{APPVEYOR_REPO_NAME} $ENV{PULL_REQUEST_SLUG} $ENV{REPO_SLUG}) +# branch name +SET_DEFAULT_ARG1(BRANCH $ENV{TRAVIS_PULL_REQUEST_BRANCH} $ENV{TRAVIS_BRANCH} $ENV{APPVEYOR_PULL_REQUEST_HEAD_REPO_BRANCH} $ENV{APPVEYOR_REPO_BRANCH} $ENV{GIT_BRANCH} $ENV{BRANCH_NAME} $ENV{BRANCH} ${GIT_BRANCH}) +# pull request number +SET_DEFAULT_ARG1(PULL_REQUEST_NUM $ENV{TRAVIS_PULL_REQUEST} $ENV{CHANGE_ID} $ENV{APPVEYOR_PULL_REQUEST_NUMBER} $ENV{PULL_REQUEST_NUM}) +# get the event type, e.g. push, pull_request, api, cron, etc. +SET_DEFAULT_ARG1(EVENT_TYPE $ENV{TRAVIS_EVENT_TYPE} ${EVENT_TYPE}) + +if("${BRANCH}" STREQUAL "") + message(STATUS "Checked: environment variables for Travis, Appveyor, Jenkins (git plugin), BRANCH_NAME, BRANCH and 'git branch --show-current'") + message(FATAL_ERROR "Error! Git branch could not be determined. Please provide -DBRANCH=") +endif() + +#----------------------------------------------------------------------------------------# +# +# Set default values if not provided on command-line +# +#----------------------------------------------------------------------------------------# + +SET_DEFAULT(SOURCE_DIR "${WORKING_DIR}") # source directory +SET_DEFAULT(BINARY_DIR "${WORKING_DIR}/build") # build directory +SET_DEFAULT(BUILD_TYPE "${CMAKE_BUILD_TYPE}") # Release, Debug, etc. 
+SET_DEFAULT(MODEL "Continuous") # Continuous, Nightly, or Experimental +SET_DEFAULT(JOBS 1) # number of parallel ctests +SET_DEFAULT(CTEST_COMMAND "${CMAKE_CTEST_COMMAND}") # just in case +SET_DEFAULT(CTEST_ARGS "-V --output-on-failure") # extra arguments when ctest is called +SET_DEFAULT(GIT_EXECUTABLE "git") # ctest_update +SET_DEFAULT(TARGET "all") # build target +SET_DEFAULT_ARG1(SITE "$ENV{SITE}" + "${HOSTNAME}") # update site +SET_DEFAULT_ARG1(BUILD_JOBS "$ENV{BUILD_JOBS}" + "${NUM_PROCESSORS}") # number of parallel compile jobs +# +# The variable below correspond to ctest arguments, i.e. START,END,STRIDE are +# '-I START,END,STRIDE' +# +SET_DEFAULT(START "") +SET_DEFAULT(END "") +SET_DEFAULT(STRIDE "") +SET_DEFAULT(INCLUDE "") +SET_DEFAULT(EXCLUDE "") +SET_DEFAULT(INCLUDE_LABEL "") +SET_DEFAULT(EXCLUDE_LABEL "") +SET_DEFAULT(PARALLEL_LEVEL "") +SET_DEFAULT(STOP_TIME "") +SET_DEFAULT(LABELS "") +SET_DEFAULT(NOTES "") + +# default static build tag for Nightly +set(BUILD_TAG "${BRANCH}") + +if(NOT BUILD_TYPE) + # default for kokkos if not specified + set(BUILD_TYPE "RelWithDebInfo") +endif() + +# generate dynamic name if continuous or experimental model +if(NOT "${MODEL}" STREQUAL "Nightly") + if(EVENT_TYPE AND PULL_REQUEST_NUM) + # e.g. pull_request/123 + if(AUTHOR) + set(BUILD_TAG "${AUTHOR}/${EVENT_TYPE}/${PULL_REQUEST_NUM}") + else() + set(BUILD_TAG "${EVENT_TYPE}/${PULL_REQUEST_NUM}") + endif() + elseif(SLUG) + # e.g. 
owner_name/repo_name + set(BUILD_TAG "${SLUG}") + elseif(AUTHOR) + set(BUILD_TAG "${AUTHOR}/${BRANCH}") + endif() + if(EVENT_TYPE AND NOT PULL_REQUEST_NUM) + set(BUILD_TAG "${BUILD_TAG}-${EVENT_TYPE}") + endif() +endif() + +# unnecessary +string(REPLACE "/remotes/" "/" BUILD_TAG "${BUILD_TAG}") +string(REPLACE "/origin/" "/" BUILD_TAG "${BUILD_TAG}") + +message(STATUS "BUILD_TAG: ${BUILD_TAG}") + +set(BUILD_NAME "[${BUILD_TAG}] [${BUILD_NAME}-${BUILD_TYPE}]") + +# colons in build name create extra (empty) entries in CDash +string(REPLACE ":" "-" BUILD_NAME "${BUILD_NAME}") +# unnecessary info +string(REPLACE "/merge]" "]" BUILD_NAME "${BUILD_NAME}") +# consistency +string(REPLACE "/pr/" "/pull/" BUILD_NAME "${BUILD_NAME}") +string(REPLACE "pull_request/" "pull/" BUILD_NAME "${BUILD_NAME}") +# miscellaneous from missing fields +string(REPLACE "--" "-" BUILD_NAME "${BUILD_NAME}") +string(REPLACE "-]" "]" BUILD_NAME "${BUILD_NAME}") + +# check binary directory +if(EXISTS ${BINARY_DIR}) + if(NOT IS_DIRECTORY "${BINARY_DIR}") + message(FATAL_ERROR "Error! '${BINARY_DIR}' already exists and is not a directory!") + endif() + file(GLOB BINARY_DIR_FILES "${BINARY_DIR}/*") + if(NOT "${BINARY_DIR_FILES}" STREQUAL "") + message(FATAL_ERROR "Error! 
'${BINARY_DIR}' already exists and is not empty!") + endif() +endif() + +get_filename_component(SOURCE_REALDIR ${SOURCE_DIR} REALPATH) +get_filename_component(BINARY_REALDIR ${BINARY_DIR} REALPATH) + +#----------------------------------------------------------------------------------------# +# +# Generate the CTestConfig.cmake +# +#----------------------------------------------------------------------------------------# + +set(CONFIG_ARGS) +foreach(_ARG ${KOKKOS_CMAKE_ARGS}) + if(NOT "${${_ARG}}" STREQUAL "") + get_property(_ARG_TYPE CACHE ${_ARG} PROPERTY TYPE) + if("${_ARG_TYPE}" STREQUAL "UNINITIALIZED") + if("${${_ARG}}" STREQUAL "ON" OR "${${_ARG}}" STREQUAL "OFF") + set(_ARG_TYPE "BOOL") + elseif(EXISTS "${${_ARG}}" AND NOT IS_DIRECTORY "${${_ARG}}") + set(_ARG_TYPE "FILEPATH") + elseif(EXISTS "${${_ARG}}" AND IS_DIRECTORY "${${_ARG}}") + set(_ARG_TYPE "PATH") + elseif(NOT "${${_ARG}}" STREQUAL "") + set(_ARG_TYPE "STRING") + endif() + endif() + set(CONFIG_ARGS "${CONFIG_ARGS}set(${_ARG} \"${${_ARG}}\" CACHE ${_ARG_TYPE} \"\")\n") + endif() +endforeach() + +file(WRITE ${BINARY_REALDIR}/initial-cache.cmake +" +set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS}\" CACHE STRING \"\") +${CONFIG_ARGS} +") + +file(READ ${BINARY_REALDIR}/initial-cache.cmake _CACHE_INFO) +message(STATUS "Initial cache:\n${_CACHE_INFO}") + +# initialize the cache +set(CONFIG_ARGS "-C ${BINARY_REALDIR}/initial-cache.cmake") + + +# generate the CTestConfig.cmake +configure_file( + ${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake.in + ${BINARY_REALDIR}/CTestConfig.cmake + @ONLY) + +# copy/generate the dashboard script +configure_file( + ${CMAKE_CURRENT_LIST_DIR}/KokkosCTest.cmake.in + ${BINARY_REALDIR}/KokkosCTest.cmake + @ONLY) + +# custom CTest settings go in ${BINARY_DIR}/CTestCustom.cmake +execute_process( + COMMAND ${CMAKE_COMMAND} -E touch CTestCustom.cmake + WORKING_DIRECTORY ${BINARY_REALDIR} + ) + +#----------------------------------------------------------------------------------------# +# +# 
Execute CTest +# +#----------------------------------------------------------------------------------------# + +message(STATUS "") +message(STATUS "BUILD_NAME: ${BUILD_NAME}") +message(STATUS "Executing '${CTEST_COMMAND} -S KokkosCTest.cmake ${CTEST_ARGS}'...") +message(STATUS "") + +# e.g. -DCTEST_ARGS="--output-on-failure -VV" should really be -DCTEST_ARGS="--output-on-failure;-VV" +string(REPLACE " " ";" CTEST_ARGS "${CTEST_ARGS}") + +execute_process( + COMMAND ${CTEST_COMMAND} -S KokkosCTest.cmake ${CTEST_ARGS} + RESULT_VARIABLE RET + WORKING_DIRECTORY ${BINARY_REALDIR} + ) + +# ensure that any non-zero result variable gets propagated +if(NOT RET EQUAL 0) + message(FATAL_ERROR "CTest return non-zero exit code: ${RET}") +endif() diff --git a/packages/kokkos/cmake/KokkosCTest.cmake.in b/packages/kokkos/cmake/KokkosCTest.cmake.in new file mode 100644 index 000000000000..b6917f3cc189 --- /dev/null +++ b/packages/kokkos/cmake/KokkosCTest.cmake.in @@ -0,0 +1,261 @@ +cmake_minimum_required(VERSION 3.16 FATAL_ERROR) + +if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake") +endif() + +include(ProcessorCount) +ProcessorCount(CTEST_PROCESSOR_COUNT) + +cmake_policy(SET CMP0009 NEW) +cmake_policy(SET CMP0011 NEW) + +# ---------------------------------------------------------------------------- # +# -- Commands +# ---------------------------------------------------------------------------- # +find_program(CTEST_CMAKE_COMMAND NAMES cmake) +find_program(CTEST_UNAME_COMMAND NAMES uname) + +find_program(CTEST_BZR_COMMAND NAMES bzr) +find_program(CTEST_CVS_COMMAND NAMES cvs) +find_program(CTEST_GIT_COMMAND NAMES git) +find_program(CTEST_HG_COMMAND NAMES hg) +find_program(CTEST_P4_COMMAND NAMES p4) +find_program(CTEST_SVN_COMMAND NAMES svn) + +find_program(VALGRIND_COMMAND NAMES valgrind) +find_program(GCOV_COMMAND NAMES gcov) +find_program(LCOV_COMMAND NAMES llvm-cov) +find_program(MEMORYCHECK_COMMAND NAMES valgrind ) + 
+set(MEMORYCHECK_TYPE Valgrind) +# set(MEMORYCHECK_TYPE Purify) +# set(MEMORYCHECK_TYPE BoundsChecker) +# set(MEMORYCHECK_TYPE ThreadSanitizer) +# set(MEMORYCHECK_TYPE AddressSanitizer) +# set(MEMORYCHECK_TYPE LeakSanitizer) +# set(MEMORYCHECK_TYPE MemorySanitizer) +# set(MEMORYCHECK_TYPE UndefinedBehaviorSanitizer) +set(MEMORYCHECK_COMMAND_OPTIONS "--trace-children=yes --leak-check=full") + +# ---------------------------------------------------------------------------- # +# -- Settings +# ---------------------------------------------------------------------------- # +## -- Process timeout in seconds +set(CTEST_TIMEOUT "7200") +## -- Set output to English +set(ENV{LC_MESSAGES} "en_EN" ) + + +# ---------------------------------------------------------------------------- # +# -- Copy ctest configuration file +# ---------------------------------------------------------------------------- # +macro(COPY_CTEST_CONFIG_FILES) + + foreach(_FILE CTestConfig.cmake CTestCustom.cmake) + + # if current directory is not binary or source directory + if(NOT "${CMAKE_CURRENT_LIST_DIR}" STREQUAL "${CTEST_BINARY_DIRECTORY}" AND + NOT "${CTEST_SOURCE_DIRECTORY}" STREQUAL "${CTEST_BINARY_DIRECTORY}") + + # if file exists in current directory + if(EXISTS ${CMAKE_CURRENT_LIST_DIR}/${_FILE}) + configure_file(${CMAKE_CURRENT_LIST_DIR}/${_FILE} + ${CTEST_BINARY_DIRECTORY}/${_FILE} COPYONLY) + endif() + + # if source and binary differ + elseif(NOT "${CTEST_SOURCE_DIRECTORY}" STREQUAL "${CTEST_BINARY_DIRECTORY}") + + # if file exists in source directory but not in binary directory + if(EXISTS ${CTEST_SOURCE_DIRECTORY}/${_FILE} AND + NOT EXISTS ${CTEST_BINARY_DIRECTORY}/${_FILE}) + configure_file(${CTEST_SOURCE_DIRECTORY}/${_FILE} + ${CTEST_BINARY_DIRECTORY}/${_FILE} COPYONLY) + endif() + + endif() + endforeach() + +endmacro() + +ctest_read_custom_files("${CMAKE_CURRENT_LIST_DIR}") + +message(STATUS "CTEST_MODEL: ${CTEST_MODEL}") + 
+#-------------------------------------------------------------------------# +# Start +# +message(STATUS "") +message(STATUS "[${CTEST_BUILD_NAME}] Running START_CTEST stage...") +message(STATUS "") + +ctest_start(${CTEST_MODEL} TRACK ${CTEST_MODEL} ${APPEND_CTEST} + ${CTEST_SOURCE_DIRECTORY} ${CTEST_BINARY_DIRECTORY}) + + +#-------------------------------------------------------------------------# +# Config +# +copy_ctest_config_files() +ctest_read_custom_files("${CTEST_BINARY_DIRECTORY}") + + +#-------------------------------------------------------------------------# +# Update +# +message(STATUS "") +message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_UPDATE stage...") +message(STATUS "") + +ctest_update(SOURCE "${CTEST_SOURCE_DIRECTORY}" + RETURN_VALUE up_ret) + + +#-------------------------------------------------------------------------# +# Configure +# +message(STATUS "") +message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_CONFIGURE stage...") +message(STATUS "") + +ctest_configure(BUILD "${CTEST_BINARY_DIRECTORY}" + SOURCE ${CTEST_SOURCE_DIRECTORY} + ${APPEND_CTEST} + OPTIONS "${CTEST_CONFIGURE_OPTIONS}" + RETURN_VALUE config_ret) + + +#-------------------------------------------------------------------------# +# Echo configure log bc Damien wants to delay merging this PR for eternity +# +file(GLOB _configure_log "${CTEST_BINARY_DIRECTORY}/Testing/Temporary/LastConfigure*.log") +# should only have one but loop just for safety +foreach(_LOG ${_configure_log}) + file(READ ${_LOG} _LOG_MESSAGE) + message(STATUS "Configure Log: ${_LOG}") + message(STATUS "\n${_LOG_MESSAGE}\n") +endforeach() + + +#-------------------------------------------------------------------------# +# Build +# +message(STATUS "") +message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_BUILD stage...") +message(STATUS "") + +ctest_build(BUILD "${CTEST_BINARY_DIRECTORY}" + ${APPEND_CTEST} + RETURN_VALUE build_ret) + + +#-------------------------------------------------------------------------# 
+# Echo build log bc Damien wants to delay merging this PR for eternity +# +file(GLOB _build_log "${CTEST_BINARY_DIRECTORY}/Testing/Temporary/LastBuild*.log") +# should only have one but loop just for safety +foreach(_LOG ${_build_log}) + file(READ ${_LOG} _LOG_MESSAGE) + message(STATUS "Build Log: ${_LOG}") + message(STATUS "\n${_LOG_MESSAGE}\n") +endforeach() + + +#-------------------------------------------------------------------------# +# Test +# +message(STATUS "") +message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_TEST stage...") +message(STATUS "") + +ctest_test(RETURN_VALUE test_ret + ${APPEND_CTEST} + ${START_CTEST} + ${END_CTEST} + ${STRIDE_CTEST} + ${INCLUDE_CTEST} + ${EXCLUDE_CTEST} + ${INCLUDE_LABEL_CTEST} + ${EXCLUDE_LABEL_CTEST} + ${PARALLEL_LEVEL_CTEST} + ${STOP_TIME_CTEST} + SCHEDULE_RANDOM OFF) + + +#-------------------------------------------------------------------------# +# Coverage +# +message(STATUS "") +message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_COVERAGE stage...") +message(STATUS "") + +execute_process(COMMAND ${CTEST_COVERAGE_COMMAND} ${CTEST_COVERAGE_EXTRA_FLAGS} + WORKING_DIRECTORY ${CTEST_BINARY_DIRECTORY} + ERROR_QUIET) + +ctest_coverage(${APPEND_CTEST} + ${CTEST_COVERAGE_LABELS} + RETURN_VALUE cov_ret) + + +#-------------------------------------------------------------------------# +# MemCheck +# +message(STATUS "") +message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_MEMCHECK stage...") +message(STATUS "") + +ctest_memcheck(RETURN_VALUE mem_ret + ${APPEND_CTEST} + ${START_CTEST} + ${END_CTEST} + ${STRIDE_CTEST} + ${INCLUDE_CTEST} + ${EXCLUDE_CTEST} + ${INCLUDE_LABEL_CTEST} + ${EXCLUDE_LABEL_CTEST} + ${PARALLEL_LEVEL_CTEST}) + + +#-------------------------------------------------------------------------# +# Submit +# +message(STATUS "") +message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_SUBMIT stage...") +message(STATUS "") + +file(GLOB_RECURSE NOTE_FILES "${CTEST_BINARY_DIRECTORY}/*CTestNotes.cmake") +foreach(_FILE 
${NOTE_FILES}) + message(STATUS "Including CTest notes files: \"${_FILE}\"...") + include("${_FILE}") +endforeach() + +# capture submit error so it doesn't fail because of a submission error +ctest_submit(RETURN_VALUE submit_ret + RETRY_COUNT 2 + RETRY_DELAY 10 + CAPTURE_CMAKE_ERROR submit_err) + +#-------------------------------------------------------------------------# +# Submit +# +message(STATUS "") +message(STATUS "[${CTEST_BUILD_NAME}] Finished ${CTEST_MODEL} Stages (${STAGES})") +message(STATUS "") + + +#-------------------------------------------------------------------------# +# Non-zero exit codes for important errors +# +if(NOT config_ret EQUAL 0) + message(FATAL_ERROR "Error during configuration! Exit code: ${config_ret}") +endif() + +if(NOT build_ret EQUAL 0) + message(FATAL_ERROR "Error during build! Exit code: ${build_ret}") +endif() + +if(NOT test_ret EQUAL 0) + message(FATAL_ERROR "Error during testing! Exit code: ${test_ret}") +endif() diff --git a/packages/kokkos/cmake/KokkosConfig.cmake.in b/packages/kokkos/cmake/KokkosConfig.cmake.in index 9fbd22ee5c47..44a8fcd9c319 100644 --- a/packages/kokkos/cmake/KokkosConfig.cmake.in +++ b/packages/kokkos/cmake/KokkosConfig.cmake.in @@ -19,17 +19,44 @@ INCLUDE("${Kokkos_CMAKE_DIR}/KokkosTargets.cmake") INCLUDE("${Kokkos_CMAKE_DIR}/KokkosConfigCommon.cmake") UNSET(Kokkos_CMAKE_DIR) -# if CUDA was enabled and separable compilation was specified, e.g. 
-# find_package(Kokkos COMPONENTS separable_compilation) -# then we set the RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK -IF(@Kokkos_ENABLE_CUDA@ AND NOT "separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS) +# check for conflicts +IF("launch_compiler" IN_LIST Kokkos_FIND_COMPONENTS AND + "separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS) + MESSAGE(STATUS "'launch_compiler' implies global redirection of targets depending on Kokkos to appropriate compiler.") + MESSAGE(STATUS "'separable_compilation' implies explicitly defining where redirection occurs via 'kokkos_compilation(PROJECT|TARGET|SOURCE|DIRECTORY ...)'") + MESSAGE(FATAL_ERROR "Conflicting COMPONENTS: 'launch_compiler' and 'separable_compilation'") +ENDIF() + +IF("launch_compiler" IN_LIST Kokkos_FIND_COMPONENTS) + # + # if find_package(Kokkos COMPONENTS launch_compiler) then rely on the + # RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK to always redirect to the + # appropriate compiler for Kokkos + # + + MESSAGE(STATUS "kokkos_launch_compiler is enabled globally. C++ compiler commands with -DKOKKOS_DEPENDENCE will be redirected to the appropriate compiler for Kokkos") + kokkos_compilation( + GLOBAL + CHECK_CUDA_COMPILES) + +ELSEIF(@Kokkos_ENABLE_CUDA@ AND NOT "separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS) + # + # if CUDA was enabled, separable compilation was not specified, and current compiler + # cannot compile CUDA, then set the RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK globally and + # kokkos_launch_compiler will re-direct to the compiler used to compile CUDA code during installation. 
+ # kokkos_launch_compiler will re-direct if ${CMAKE_CXX_COMPILER} and -DKOKKOS_DEPENDENCE is present, + # otherwise, the original command will be executed + # + # run test to see if CMAKE_CXX_COMPILER=nvcc_wrapper kokkos_compiler_is_nvcc(IS_NVCC ${CMAKE_CXX_COMPILER}) - # if not nvcc_wrapper, use RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK - IF(NOT IS_NVCC AND NOT CMAKE_CXX_COMPILER_ID STREQUAL Clang AND - (NOT DEFINED Kokkos_LAUNCH_COMPILER OR Kokkos_LAUNCH_COMPILER)) - MESSAGE(STATUS "kokkos_launch_compiler is enabled globally. C++ compiler commands with -DKOKKOS_DEPENDENCE will be redirected to nvcc_wrapper") + + # if not nvcc_wrapper and Kokkos_LAUNCH_COMPILER was not set to OFF + IF(NOT IS_NVCC AND (NOT DEFINED Kokkos_LAUNCH_COMPILER OR Kokkos_LAUNCH_COMPILER)) + MESSAGE(STATUS "kokkos_launch_compiler is enabled globally. C++ compiler commands with -DKOKKOS_DEPENDENCE will be redirected to the appropriate compiler for Kokkos") kokkos_compilation(GLOBAL) ENDIF() - UNSET(IS_NVCC) # be mindful of the environment, pollution is bad + + # be mindful of the environment, pollution is bad + UNSET(IS_NVCC) ENDIF() diff --git a/packages/kokkos/cmake/KokkosConfigCommon.cmake.in b/packages/kokkos/cmake/KokkosConfigCommon.cmake.in index 42c755c2157f..ab93e65afe97 100644 --- a/packages/kokkos/cmake/KokkosConfigCommon.cmake.in +++ b/packages/kokkos/cmake/KokkosConfigCommon.cmake.in @@ -3,6 +3,7 @@ SET(Kokkos_OPTIONS @KOKKOS_ENABLED_OPTIONS@) SET(Kokkos_TPLS @KOKKOS_ENABLED_TPLS@) SET(Kokkos_ARCH @KOKKOS_ENABLED_ARCH_LIST@) SET(Kokkos_CXX_COMPILER "@CMAKE_CXX_COMPILER@") +SET(Kokkos_CXX_COMPILER_ID "@KOKKOS_CXX_COMPILER_ID@") # These are needed by KokkosKernels FOREACH(DEV ${Kokkos_DEVICES}) @@ -13,13 +14,13 @@ IF(NOT Kokkos_FIND_QUIETLY) MESSAGE(STATUS "Enabled Kokkos devices: ${Kokkos_DEVICES}") ENDIF() -IF (Kokkos_ENABLE_CUDA AND ${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.14.0") - #If we are building CUDA, we have tricked CMake because we declare a CXX project - #If the 
default C++ standard for a given compiler matches the requested - #standard, then CMake just omits the -std flag in later versions of CMake - #This breaks CUDA compilation (CUDA compiler can have a different default - #-std then the underlying host compiler by itself). Setting this variable - #forces CMake to always add the -std flag even if it thinks it doesn't need it +IF (Kokkos_ENABLE_CUDA) + # If we are building CUDA, we have tricked CMake because we declare a CXX project + # If the default C++ standard for a given compiler matches the requested + # standard, then CMake just omits the -std flag in later versions of CMake + # This breaks CUDA compilation (CUDA compiler can have a different default + # -std then the underlying host compiler by itself). Setting this variable + # forces CMake to always add the -std flag even if it thinks it doesn't need it SET(CMAKE_CXX_STANDARD_DEFAULT 98 CACHE INTERNAL "" FORCE) ENDIF() @@ -90,7 +91,88 @@ function(kokkos_check) endif() endfunction() -# this function is provided to easily select which files use nvcc_wrapper: +# A test to check whether a downstream project set the C++ compiler to NVCC or not +# this is called only when Kokkos was installed with Kokkos_ENABLE_CUDA=ON +FUNCTION(kokkos_compiler_is_nvcc VAR COMPILER) + # Check if the compiler is nvcc (which really means nvcc_wrapper). 
+ EXECUTE_PROCESS(COMMAND ${COMPILER} ${ARGN} --version + OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE RET) + # something went wrong + IF(RET GREATER 0) + SET(${VAR} false PARENT_SCOPE) + ELSE() + STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} ) + STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "nvcc" INTERNAL_COMPILER_VERSION_CONTAINS_NVCC) + STRING(REGEX REPLACE "^ +" "" INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}") + IF(${INTERNAL_COMPILER_VERSION_CONTAINS_NVCC} GREATER -1) + SET(${VAR} true PARENT_SCOPE) + ELSE() + SET(${VAR} false PARENT_SCOPE) + ENDIF() + ENDIF() +ENDFUNCTION() + +# this function checks whether the current CXX compiler supports building CUDA +FUNCTION(kokkos_cxx_compiler_cuda_test _VAR _COMPILER) + + FILE(WRITE ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cu +" +#include +#include + +__global__ +void kernel(int sz, double* data) +{ + int _beg = blockIdx.x * blockDim.x + threadIdx.x; + for(int i = _beg; i < sz; ++i) + data[i] += static_cast(i); +} + +int main() +{ + double* data = NULL; + int blocks = 64; + int grids = 64; + int ret = cudaMalloc(&data, blocks * grids * sizeof(double)); + if(ret != cudaSuccess) + return EXIT_FAILURE; + kernel<<>>(blocks * grids, data); + cudaDeviceSynchronize(); + return EXIT_SUCCESS; +} +") + + # save the command for debugging + SET(_COMMANDS "${_COMPILER} ${ARGN} -c ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cu") + + # use execute_process instead of try compile because we want to set custom compiler + EXECUTE_PROCESS(COMMAND ${_COMPILER} ${ARGN} -c ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cu + RESULT_VARIABLE _RET + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}/compile_tests + TIMEOUT 15 + OUTPUT_QUIET + ERROR_QUIET) + + IF(NOT _RET EQUAL 0) + # save the command for debugging + SET(_COMMANDS "${_COMMAND}\n${_COMPILER} --cuda-gpu-arch=sm_35 ${ARGN} -c 
${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cu") + # try the compile test again with clang arguments + EXECUTE_PROCESS(COMMAND ${_COMPILER} --cuda-gpu-arch=sm_35 -c ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cu + RESULT_VARIABLE _RET + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}/compile_tests + TIMEOUT 15 + OUTPUT_QUIET + ERROR_QUIET) + ENDIF() + + SET(${_VAR}_COMMANDS "${_COMMANDS}" PARENT_SCOPE) + SET(${_VAR} ${_RET} PARENT_SCOPE) +ENDFUNCTION() + +# this function is provided to easily select which files use the same compiler as Kokkos +# when it was installed (or nvcc_wrapper): # # GLOBAL --> all files # TARGET --> all files in a target @@ -98,8 +180,21 @@ endfunction() # DIRECTORY --> all files in directory # PROJECT --> all files/targets in a project/subproject # +# Use the COMPILER argument to specify a compiler, if needed. By default, it will +# set the values to ${Kokkos_CXX_COMPILER} unless Kokkos_ENABLE_CUDA=ON and +# Kokkos_CXX_COMPILER_ID is NVIDIA, then it will set it to nvcc_wrapper +# +# Use CHECK_CUDA_COMPILES to run a check when CUDA is enabled +# FUNCTION(kokkos_compilation) - CMAKE_PARSE_ARGUMENTS(COMP "GLOBAL;PROJECT" "" "DIRECTORY;TARGET;SOURCE" ${ARGN}) + CMAKE_PARSE_ARGUMENTS(COMP + "GLOBAL;PROJECT;CHECK_CUDA_COMPILES" + "COMPILER" + "DIRECTORY;TARGET;SOURCE;COMMAND_PREFIX" + ${ARGN}) + + # if built w/o CUDA support, we want to basically make this a no-op + SET(_Kokkos_ENABLE_CUDA @Kokkos_ENABLE_CUDA@) # search relative first and then absolute SET(_HINTS "${CMAKE_CURRENT_LIST_DIR}/../.." "@CMAKE_INSTALL_PREFIX@") @@ -115,10 +210,52 @@ FUNCTION(kokkos_compilation) MESSAGE(FATAL_ERROR "Kokkos could not find 'kokkos_launch_compiler'. 
Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'") ENDIF() + # if COMPILER was not specified, assume Kokkos_CXX_COMPILER + IF(NOT COMP_COMPILER) + SET(COMP_COMPILER ${Kokkos_CXX_COMPILER}) + IF(_Kokkos_ENABLE_CUDA AND Kokkos_CXX_COMPILER_ID STREQUAL NVIDIA) + # find nvcc_wrapper + FIND_PROGRAM(Kokkos_NVCC_WRAPPER + NAMES nvcc_wrapper + HINTS ${_HINTS} + PATHS ${_HINTS} + PATH_SUFFIXES bin) + # fatal if we can't nvcc_wrapper + IF(NOT Kokkos_NVCC_WRAPPER) + MESSAGE(FATAL_ERROR "Kokkos could not find nvcc_wrapper. Please set '-DKokkos_NVCC_WRAPPER=/path/to/nvcc_wrapper'") + ENDIF() + SET(COMP_COMPILER ${Kokkos_NVCC_WRAPPER}) + ENDIF() + ENDIF() + + # check that the original compiler still exists! + IF(NOT EXISTS ${COMP_COMPILER}) + MESSAGE(FATAL_ERROR "Kokkos could not find original compiler: '${COMP_COMPILER}'") + ENDIF() + + # try to ensure that compiling cuda code works! + IF(_Kokkos_ENABLE_CUDA AND COMP_CHECK_CUDA_COMPILES) + + # this may fail if kokkos_compiler launcher was used during install + kokkos_cxx_compiler_cuda_test(_COMPILES_CUDA + ${Kokkos_COMPILE_LAUNCHER} ${COMP_COMPILER} ${CMAKE_CXX_COMPILER}) + + # if above failed, throw an error + IF(NOT _COMPILES_CUDA) + MESSAGE(FATAL_ERROR "kokkos_cxx_compiler_cuda_test failed! 
Test commands:\n${_COMPILES_CUDA_COMMANDS}") + ENDIF() + ENDIF() + + IF(COMP_COMMAND_PREFIX) + SET(_PREFIX "${COMP_COMMAND_PREFIX}") + STRING(REPLACE ";" " " _PREFIX "${COMP_COMMAND_PREFIX}") + SET(Kokkos_COMPILER_LAUNCHER "${_PREFIX} ${Kokkos_COMPILE_LAUNCHER}") + ENDIF() + IF(COMP_GLOBAL) # if global, don't bother setting others - SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") - SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${COMP_COMPILER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${COMP_COMPILER} ${CMAKE_CXX_COMPILER}") ELSE() FOREACH(_TYPE PROJECT DIRECTORY TARGET SOURCE) # make project/subproject scoping easy, e.g. KokkosCompilation(PROJECT) after project(...) @@ -128,34 +265,10 @@ FUNCTION(kokkos_compilation) ENDIF() # set the properties if defined IF(COMP_${_TYPE}) - # MESSAGE(STATUS "Using nvcc_wrapper :: ${_TYPE} :: ${COMP_${_TYPE}}") - SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") - SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") + # MESSAGE(STATUS "Using ${COMP_COMPILER} :: ${_TYPE} :: ${COMP_${_TYPE}}") + SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${COMP_COMPILER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${COMP_COMPILER} ${CMAKE_CXX_COMPILER}") ENDIF() ENDFOREACH() ENDIF() ENDFUNCTION() - -# A test to check whether a downstream project set the C++ compiler to NVCC or not -# this is called only when Kokkos was installed with Kokkos_ENABLE_CUDA=ON -FUNCTION(kokkos_compiler_is_nvcc VAR COMPILER) - # Check if the compiler is nvcc (which really means 
nvcc_wrapper). - EXECUTE_PROCESS(COMMAND ${COMPILER} ${ARGN} --version - OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE - RESULT_VARIABLE RET) - # something went wrong - IF(RET GREATER 0) - SET(${VAR} false PARENT_SCOPE) - ELSE() - STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} ) - STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "nvcc" INTERNAL_COMPILER_VERSION_CONTAINS_NVCC) - STRING(REGEX REPLACE "^ +" "" INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}") - IF(${INTERNAL_COMPILER_VERSION_CONTAINS_NVCC} GREATER -1) - SET(${VAR} true PARENT_SCOPE) - ELSE() - SET(${VAR} false PARENT_SCOPE) - ENDIF() - ENDIF() -ENDFUNCTION() - diff --git a/packages/kokkos/cmake/KokkosCore_config.h.in b/packages/kokkos/cmake/KokkosCore_config.h.in index 0259fe69d50c..fbfae3711ec1 100644 --- a/packages/kokkos/cmake/KokkosCore_config.h.in +++ b/packages/kokkos/cmake/KokkosCore_config.h.in @@ -78,6 +78,7 @@ #cmakedefine KOKKOS_ARCH_POWER7 #cmakedefine KOKKOS_ARCH_POWER8 #cmakedefine KOKKOS_ARCH_POWER9 +#cmakedefine KOKKOS_ARCH_INTEL_GEN #cmakedefine KOKKOS_ARCH_KEPLER #cmakedefine KOKKOS_ARCH_KEPLER30 #cmakedefine KOKKOS_ARCH_KEPLER32 @@ -95,5 +96,8 @@ #cmakedefine KOKKOS_ARCH_VOLTA72 #cmakedefine KOKKOS_ARCH_TURING75 #cmakedefine KOKKOS_ARCH_AMPERE80 +#cmakedefine KOKKOS_ARCH_AMPERE86 #cmakedefine KOKKOS_ARCH_AMD_ZEN #cmakedefine KOKKOS_ARCH_AMD_ZEN2 + +#cmakedefine KOKKOS_IMPL_DISABLE_SYCL_DEVICE_PRINTF diff --git a/packages/kokkos/cmake/Modules/CudaToolkit.cmake b/packages/kokkos/cmake/Modules/CudaToolkit.cmake index d620a71d3698..eda5541f7c06 100644 --- a/packages/kokkos/cmake/Modules/CudaToolkit.cmake +++ b/packages/kokkos/cmake/Modules/CudaToolkit.cmake @@ -481,76 +481,6 @@ if(CMAKE_CUDA_COMPILER_LOADED AND NOT CUDAToolkit_BIN_DIR AND CMAKE_CUDA_COMPILE unset(cuda_dir) endif() -IF(CMAKE_VERSION VERSION_LESS "3.12.0") - function(import_target_link_libraries target) - cmake_parse_arguments(HACK - 
"SYSTEM;INTERFACE;PUBLIC" - "" - "" - ${ARGN} - ) - get_target_property(LIBS ${target} INTERFACE_LINK_LIBRARIES) - if (LIBS) - list(APPEND LIBS ${HACK_UNPARSED_ARGUMENTS}) - else() - set(LIBS ${HACK_UNPARSED_ARGUMENTS}) - endif() - set_target_properties(${target} PROPERTIES - INTERFACE_LINK_LIBRARIES "${LIBS}") - endfunction() -ELSE() - function(import_target_link_libraries) - target_link_libraries(${ARGN}) - endfunction() -ENDIF() - -IF(CMAKE_VERSION VERSION_LESS "3.13.0") - function(import_target_link_directories target) - cmake_parse_arguments(HACK - "SYSTEM;INTERFACE;PUBLIC" - "" - "" - ${ARGN} - ) - get_target_property(LINK_LIBS ${target} INTERFACE_LINK_LIBRARIES) - if (LINK_LIBS) #could be not-found - set(LINK_LIBS_LIST ${LINK_LIBS}) - endif() - foreach(LIB ${HACK_UNPARSED_ARGUMENTS}) - list(APPEND LINK_LIBS_LIST -L${LIB}) - endforeach() - set_target_properties(${target} PROPERTIES - INTERFACE_LINK_LIBRARIES "${LINK_LIBS_LIST}") - endfunction() -ELSE() - function(import_target_link_directories) - target_link_directories(${ARGN}) - endfunction() -ENDIF() - -IF(CMAKE_VERSION VERSION_LESS "3.12.0") - function(import_target_include_directories target) - cmake_parse_arguments(HACK - "SYSTEM;INTERFACE;PUBLIC" - "" - "" - ${ARGN} - ) - get_target_property(INLUDE_DIRS ${target} INTERFACE_INCLUDE_DIRECTORIES) - if (INCLUDE_DIRS) - list(APPEND INCLUDE_DIRS ${HACK_UNPARSED_ARGUMENTS}) - else() - set(INCLUDE_DIRS ${HACK_UNPARSED_ARGUMENTS}) - endif() - set_target_properties(${target} PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${INCLUDE_DIRS}") - endfunction() -ELSE() - function(import_target_include_directories) - target_include_directories(${ARGN}) - endfunction() -ENDIF() - # Try language- or user-provided path first. 
if(CUDAToolkit_BIN_DIR) find_program(CUDAToolkit_NVCC_EXECUTABLE @@ -854,11 +784,11 @@ if(CUDAToolkit_FOUND) if (NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) add_library(CUDA::${lib_name} IMPORTED INTERFACE) - import_target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") - import_target_link_libraries(CUDA::${lib_name} INTERFACE "${CUDA_${lib_name}_LIBRARY}") + target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + target_link_libraries(CUDA::${lib_name} INTERFACE "${CUDA_${lib_name}_LIBRARY}") foreach(dep ${arg_DEPS}) if(TARGET CUDA::${dep}) - import_target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dep}) + target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dep}) endif() endforeach() endif() @@ -866,8 +796,8 @@ if(CUDAToolkit_FOUND) if(NOT TARGET CUDA::toolkit) add_library(CUDA::toolkit IMPORTED INTERFACE) - import_target_include_directories(CUDA::toolkit SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") - import_target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}") + target_include_directories(CUDA::toolkit SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}") endif() _CUDAToolkit_find_and_add_import_lib(cuda_driver ALT cuda) @@ -882,11 +812,11 @@ if(CUDAToolkit_FOUND) AND TARGET CUDA::cudart_static) add_library(CUDA::cudart_static_deps IMPORTED INTERFACE) - import_target_link_libraries(CUDA::cudart_static INTERFACE CUDA::cudart_static_deps) + target_link_libraries(CUDA::cudart_static INTERFACE CUDA::cudart_static_deps) if(UNIX AND (CMAKE_C_COMPILER OR CMAKE_CXX_COMPILER)) find_package(Threads REQUIRED) - import_target_link_libraries(CUDA::cudart_static_deps INTERFACE Threads::Threads ${CMAKE_DL_LIBS}) + target_link_libraries(CUDA::cudart_static_deps INTERFACE Threads::Threads ${CMAKE_DL_LIBS}) endif() if(UNIX AND NOT APPLE) @@ -896,7 +826,7 @@ 
if(CUDAToolkit_FOUND) if(NOT CUDAToolkit_rt_LIBRARY) message(WARNING "Could not find librt library, needed by CUDA::cudart_static") else() - import_target_link_libraries(CUDA::cudart_static_deps INTERFACE ${CUDAToolkit_rt_LIBRARY}) + target_link_libraries(CUDA::cudart_static_deps INTERFACE ${CUDAToolkit_rt_LIBRARY}) endif() endif() endif() diff --git a/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake b/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake index a1072a60c618..8d58d9641580 100644 --- a/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake @@ -25,7 +25,7 @@ IF (TARGET CUDA::cuda_driver) SET(FOUND_CUDA_DRIVER TRUE) KOKKOS_EXPORT_IMPORTED_TPL(CUDA::cuda_driver) ELSE() - SET(FOUND_CUDA_DRIVVER FALSE) + SET(FOUND_CUDA_DRIVER FALSE) ENDIF() include(FindPackageHandleStandardArgs) diff --git a/packages/kokkos/cmake/Modules/FindTPLPTHREAD.cmake b/packages/kokkos/cmake/Modules/FindTPLPTHREAD.cmake index 1d154e29afff..a743fca0e452 100644 --- a/packages/kokkos/cmake/Modules/FindTPLPTHREAD.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLPTHREAD.cmake @@ -10,7 +10,7 @@ TRY_COMPILE(KOKKOS_HAS_PTHREAD_ARG # ${CMAKE_CXX${KOKKOS_CXX_STANDARD}_STANDARD_COMPILE_OPTION} INCLUDE(FindPackageHandleStandardArgs) -FIND_PACKAGE_HANDLE_STANDARD_ARGS(PTHREAD DEFAULT_MSG KOKKOS_HAS_PTHREAD_ARG) +FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLPTHREAD DEFAULT_MSG KOKKOS_HAS_PTHREAD_ARG) #Only create the TPL if we succeed IF (KOKKOS_HAS_PTHREAD_ARG) KOKKOS_CREATE_IMPORTED_TPL(PTHREAD diff --git a/packages/kokkos/cmake/Modules/FindTPLROCM.cmake b/packages/kokkos/cmake/Modules/FindTPLROCM.cmake new file mode 100644 index 000000000000..512ad6ceb283 --- /dev/null +++ b/packages/kokkos/cmake/Modules/FindTPLROCM.cmake @@ -0,0 +1,11 @@ +include(FindPackageHandleStandardArgs) + +FIND_LIBRARY(AMD_HIP_LIBRARY amdhip64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) +FIND_LIBRARY(HSA_RUNTIME_LIBRARY hsa-runtime64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) + 
+find_package_handle_standard_args(TPLROCM DEFAULT_MSG AMD_HIP_LIBRARY HSA_RUNTIME_LIBRARY) + +kokkos_create_imported_tpl(ROCM INTERFACE + LINK_LIBRARIES ${HSA_RUNTIME_LIBRARY} ${AMD_HIP_LIBRARY} + COMPILE_DEFINITIONS __HIP_ROCclr__ +) diff --git a/packages/kokkos/cmake/compile_tests/cplusplus14.cpp b/packages/kokkos/cmake/compile_tests/cplusplus14.cpp new file mode 100644 index 000000000000..52ec9885ec3e --- /dev/null +++ b/packages/kokkos/cmake/compile_tests/cplusplus14.cpp @@ -0,0 +1,8 @@ +#include + +int main() { + // _t versions of type traits were added in C++14 + std::remove_cv_t i = 0; + + return i; +} diff --git a/packages/kokkos/cmake/compile_tests/cuda_compute_capability.cc b/packages/kokkos/cmake/compile_tests/cuda_compute_capability.cc index 48c01c070cb8..a26ac5af4bf2 100644 --- a/packages/kokkos/cmake/compile_tests/cuda_compute_capability.cc +++ b/packages/kokkos/cmake/compile_tests/cuda_compute_capability.cc @@ -72,6 +72,7 @@ int main() { case 72: std::cout << "Set -DKokkos_ARCH_VOLTA72=ON ." << std::endl; break; case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break; case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break; + case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break; default: std::cout << "Compute capability " << compute_capability << " is not supported" << std::endl; diff --git a/packages/kokkos/cmake/compile_tests/pthread.cpp b/packages/kokkos/cmake/compile_tests/pthread.cpp index 92310da02937..3f83bf6a5f7f 100644 --- a/packages/kokkos/cmake/compile_tests/pthread.cpp +++ b/packages/kokkos/cmake/compile_tests/pthread.cpp @@ -2,7 +2,7 @@ void* kokkos_test(void* args) { return args; } -int main(void) { +int main() { pthread_t thread; /* Use NULL to avoid C++11. Some compilers do not have C++11 by default. 
Forcing C++11 diff --git a/packages/kokkos/cmake/fake_tribits.cmake b/packages/kokkos/cmake/fake_tribits.cmake index 2e82a462356b..fbd6745a602c 100644 --- a/packages/kokkos/cmake/fake_tribits.cmake +++ b/packages/kokkos/cmake/fake_tribits.cmake @@ -81,10 +81,16 @@ ENDMACRO() FUNCTION(KOKKOS_ADD_TEST) if (KOKKOS_HAS_TRILINOS) CMAKE_PARSE_ARGUMENTS(TEST - "" + "SKIP_TRIBITS" "EXE;NAME;TOOL" "ARGS" ${ARGN}) + + IF(TEST_SKIP_TRIBITS) + MESSAGE(STATUS "Skipping test ${TEST_NAME} in TriBits") + RETURN() + ENDIF() + IF(TEST_EXE) SET(EXE_ROOT ${TEST_EXE}) ELSE() @@ -119,11 +125,10 @@ FUNCTION(KOKKOS_ADD_TEST) endif() else() CMAKE_PARSE_ARGUMENTS(TEST - "WILL_FAIL" + "WILL_FAIL;SKIP_TRIBITS" "FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME;TOOL" "CATEGORIES;ARGS" ${ARGN}) - SET(TESTS_ADDED) # To match Tribits, we should always be receiving # the root names of exes/libs IF(TEST_EXE) @@ -135,48 +140,27 @@ FUNCTION(KOKKOS_ADD_TEST) # These should be the full target name SET(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME}) SET(EXE ${PACKAGE_NAME}_${EXE_ROOT}) - IF (TEST_ARGS) - SET(TEST_NUMBER 0) - FOREACH (ARG_STR ${TEST_ARGS}) - # This is passed as a single string blob to match TriBITS behavior - # We need this to be turned into a list - STRING(REPLACE " " ";" ARG_STR_LIST ${ARG_STR}) - IF(WIN32) - ADD_TEST(NAME ${TEST_NAME}${TEST_NUMBER} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} - COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX} ${ARG_STR_LIST}) - ELSE() - ADD_TEST(NAME ${TEST_NAME}${TEST_NUMBER} COMMAND ${EXE} ${ARG_STR_LIST}) - ENDIF() - LIST(APPEND TESTS_ADDED "${TEST_NAME}${TEST_NUMBER}") - MATH(EXPR TEST_NUMBER "${TEST_NUMBER} + 1") - ENDFOREACH() + IF(WIN32) + ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} + COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX} ${TEST_ARGS}) ELSE() - IF(WIN32) - ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} - COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX}) - ELSE() - ADD_TEST(NAME ${TEST_NAME} COMMAND ${EXE}) - ENDIF() - 
LIST(APPEND TESTS_ADDED "${TEST_NAME}") + ADD_TEST(NAME ${TEST_NAME} COMMAND ${EXE} ${TEST_ARGS}) + ENDIF() + IF(TEST_WILL_FAIL) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) + ENDIF() + IF(TEST_FAIL_REGULAR_EXPRESSION) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION}) + ENDIF() + IF(TEST_PASS_REGULAR_EXPRESSION) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) + ENDIF() + IF(TEST_TOOL) + ADD_DEPENDENCIES(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool + SET_PROPERTY(TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$") ENDIF() - - FOREACH(TEST_NAME ${TESTS_ADDED}) - IF(TEST_WILL_FAIL) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) - ENDIF() - IF(TEST_FAIL_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION}) - ENDIF() - IF(TEST_PASS_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) - ENDIF() - if(TEST_TOOL) - add_dependencies(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool - set_property(TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$") - endif() - ENDFOREACH() VERIFY_EMPTY(KOKKOS_ADD_TEST ${TEST_UNPARSED_ARGUMENTS}) - endif() + ENDIF() ENDFUNCTION() FUNCTION(KOKKOS_ADD_ADVANCED_TEST) @@ -326,14 +310,6 @@ ENDIF() ENDFUNCTION() -FUNCTION(KOKKOS_TARGET_COMPILE_DEFINITIONS) - IF (KOKKOS_HAS_TRILINOS) - TARGET_COMPILE_DEFINITIONS(${TARGET} ${ARGN}) - ELSE() - TARGET_COMPILE_DEFINITIONS(${TARGET} ${ARGN}) - ENDIF() -ENDFUNCTION() - FUNCTION(KOKKOS_INCLUDE_DIRECTORIES) IF(KOKKOS_HAS_TRILINOS) TRIBITS_INCLUDE_DIRECTORIES(${ARGN}) @@ -350,10 +326,6 @@ ENDIF() ENDFUNCTION() -MACRO(KOKKOS_ADD_COMPILE_OPTIONS) -ADD_COMPILE_OPTIONS(${ARGN}) -ENDMACRO() - MACRO(PRINTALL match) 
get_cmake_property(_variableNames VARIABLES) list (SORT _variableNames) @@ -376,4 +348,3 @@ FUNCTION(GLOBAL_APPEND VARNAME) LIST(APPEND TEMP ${ARGN}) GLOBAL_SET(${VARNAME} ${TEMP}) ENDFUNCTION() - diff --git a/packages/kokkos/cmake/kokkos_arch.cmake b/packages/kokkos/cmake/kokkos_arch.cmake index 53aaf7dccf16..ec18e70a36a3 100644 --- a/packages/kokkos/cmake/kokkos_arch.cmake +++ b/packages/kokkos/cmake/kokkos_arch.cmake @@ -35,7 +35,7 @@ KOKKOS_ARCH_OPTION(ARMV80 HOST "ARMv8.0 Compatible CPU") KOKKOS_ARCH_OPTION(ARMV81 HOST "ARMv8.1 Compatible CPU") KOKKOS_ARCH_OPTION(ARMV8_THUNDERX HOST "ARMv8 Cavium ThunderX CPU") KOKKOS_ARCH_OPTION(ARMV8_THUNDERX2 HOST "ARMv8 Cavium ThunderX2 CPU") -KOKKOS_ARCH_OPTION(A64FX HOST "ARMv8.2 with SVE Suport") +KOKKOS_ARCH_OPTION(A64FX HOST "ARMv8.2 with SVE Support") KOKKOS_ARCH_OPTION(WSM HOST "Intel Westmere CPU") KOKKOS_ARCH_OPTION(SNB HOST "Intel Sandy/Ivy Bridge CPUs") KOKKOS_ARCH_OPTION(HSW HOST "Intel Haswell CPUs") @@ -60,11 +60,12 @@ KOKKOS_ARCH_OPTION(VOLTA70 GPU "NVIDIA Volta generation CC 7.0") KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2") KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5") KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0") +KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6") KOKKOS_ARCH_OPTION(ZEN HOST "AMD Zen architecture") KOKKOS_ARCH_OPTION(ZEN2 HOST "AMD Zen2 architecture") KOKKOS_ARCH_OPTION(VEGA900 GPU "AMD GPU MI25 GFX900") KOKKOS_ARCH_OPTION(VEGA906 GPU "AMD GPU MI50/MI60 GFX906") -KOKKOS_ARCH_OPTION(VEGA908 GPU "AMD GPU") +KOKKOS_ARCH_OPTION(VEGA908 GPU "AMD GPU MI100 GFX908") KOKKOS_ARCH_OPTION(INTEL_GEN GPU "Intel GPUs Gen9+") @@ -141,8 +142,16 @@ ENDIF() #------------------------------- KOKKOS_HIP_OPTIONS --------------------------- #clear anything that might be in the cache GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS) -IF(KOKKOS_CXX_COMPILER_ID STREQUAL HIP) - SET(AMDGPU_ARCH_FLAG "--amdgpu-target") +IF(KOKKOS_ENABLE_HIP) + 
IF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + SET(AMDGPU_ARCH_FLAG "--amdgpu-target") + ELSE() + SET(AMDGPU_ARCH_FLAG "--offload-arch") + GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS -x hip) + IF(DEFINED ENV{ROCM_PATH}) + GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS --rocm-path=$ENV{ROCM_PATH}) + ENDIF() + ENDIF() ENDIF() @@ -183,6 +192,8 @@ ENDIF() IF (KOKKOS_ARCH_A64FX) COMPILER_SPECIFIC_FLAGS( DEFAULT -march=armv8.2-a+sve + Clang -march=armv8.2-a+sve -msve-vector-bits=512 + GCC -march=armv8.2-a+sve -msve-vector-bits=512 ) ENDIF() @@ -309,7 +320,7 @@ IF (KOKKOS_ARCH_POWER8 OR KOKKOS_ARCH_POWER9) SET(KOKKOS_USE_ISA_POWERPCLE ON) ENDIF() -IF (Kokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) +IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) COMPILER_SPECIFIC_FLAGS( Clang -fcuda-rdc NVIDIA --relocatable-device-code=true @@ -333,8 +344,8 @@ ENDIF() #Right now we cannot get the compiler ID when cross-compiling, so just check #that HIP is enabled -IF (Kokkos_ENABLE_HIP) - IF (Kokkos_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) +IF (KOKKOS_ENABLE_HIP) + IF (KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) COMPILER_SPECIFIC_FLAGS( DEFAULT -fgpu-rdc ) @@ -345,8 +356,7 @@ IF (Kokkos_ENABLE_HIP) ENDIF() ENDIF() - -IF (Kokkos_ENABLE_SYCL) +IF (KOKKOS_ENABLE_SYCL) COMPILER_SPECIFIC_FLAGS( DEFAULT -fsycl ) @@ -363,7 +373,7 @@ FUNCTION(CHECK_CUDA_ARCH ARCH FLAG) MESSAGE(FATAL_ERROR "Multiple GPU architectures given! Already have ${CUDA_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again.") ENDIF() SET(CUDA_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) - IF (NOT KOKKOS_ENABLE_CUDA AND NOT KOKKOS_ENABLE_OPENMPTARGET) + IF (NOT KOKKOS_ENABLE_CUDA AND NOT KOKKOS_ENABLE_OPENMPTARGET AND NOT KOKKOS_ENABLE_SYCL) MESSAGE(WARNING "Given CUDA arch ${ARCH}, but Kokkos_ENABLE_CUDA and Kokkos_ENABLE_OPENMPTARGET are OFF. 
Option will be ignored.") UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) ELSE() @@ -396,6 +406,7 @@ CHECK_CUDA_ARCH(VOLTA70 sm_70) CHECK_CUDA_ARCH(VOLTA72 sm_72) CHECK_CUDA_ARCH(TURING75 sm_75) CHECK_CUDA_ARCH(AMPERE80 sm_80) +CHECK_CUDA_ARCH(AMPERE86 sm_86) SET(AMDGPU_ARCH_ALREADY_SPECIFIED "") FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG) @@ -405,12 +416,12 @@ FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG) ENDIF() SET(AMDGPU_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) IF (NOT KOKKOS_ENABLE_HIP AND NOT KOKKOS_ENABLE_OPENMPTARGET) - MESSAGE(WARNING "Given HIP arch ${ARCH}, but Kokkos_ENABLE_AMDGPU and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored.") + MESSAGE(WARNING "Given AMD GPU architecture ${ARCH}, but Kokkos_ENABLE_HIP and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored.") UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) ELSE() SET(KOKKOS_AMDGPU_ARCH_FLAG ${FLAG} PARENT_SCOPE) GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") - IF(KOKKOS_ENABLE_HIP) + IF(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") ENDIF() ENDIF() @@ -451,6 +462,24 @@ IF (KOKKOS_ENABLE_OPENMPTARGET) ENDIF() ENDIF() +IF (KOKKOS_ENABLE_SYCL) + IF(CUDA_ARCH_ALREADY_SPECIFIED) + IF(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -fsycl-targets=nvptx64-nvidia-cuda-sycldevice + ) + # FIXME_SYCL The CUDA backend doesn't support printf yet. 
+ GLOBAL_SET(KOKKOS_IMPL_DISABLE_SYCL_DEVICE_PRINTF ON) + ELSE() + MESSAGE(SEND_ERROR "Setting a CUDA architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!") + ENDIF() + ELSEIF(KOKKOS_ARCH_INTEL_GEN) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device skl" + ) + ENDIF() +ENDIF() + IF(KOKKOS_ENABLE_CUDA AND NOT CUDA_ARCH_ALREADY_SPECIFIED) # Try to autodetect the CUDA Compute Capability by asking the device SET(_BINARY_TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/compile_tests/CUDAComputeCapabilityWorkdir) @@ -464,6 +493,43 @@ IF(KOKKOS_ENABLE_CUDA AND NOT CUDA_ARCH_ALREADY_SPECIFIED) ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc COMPILE_DEFINITIONS -DSM_ONLY RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY) + + # if user is using kokkos_compiler_launcher, above will fail. + IF(NOT _COMPILE_RESULT OR NOT _RESULT EQUAL 0) + # check to see if CUDA is not already enabled (may happen when Kokkos is subproject) + GET_PROPERTY(_ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) + # language has to be fully enabled, just checking for CMAKE_CUDA_COMPILER isn't enough + IF(NOT "CUDA" IN_LIST _ENABLED_LANGUAGES) + # make sure the user knows that we aren't using CUDA compiler for anything else + MESSAGE(STATUS "CUDA auto-detection of architecture failed with ${CMAKE_CXX_COMPILER}. 
Enabling CUDA language ONLY to auto-detect architecture...") + INCLUDE(CheckLanguage) + CHECK_LANGUAGE(CUDA) + IF(CMAKE_CUDA_COMPILER) + ENABLE_LANGUAGE(CUDA) + ELSE() + MESSAGE(STATUS "CUDA language could not be enabled") + ENDIF() + ENDIF() + + # if CUDA was enabled, this will be defined + IF(CMAKE_CUDA_COMPILER) + # copy our test to .cu so cmake compiles as CUDA + CONFIGURE_FILE( + ${PROJECT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc + ${PROJECT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu + COPYONLY + ) + # run test again + TRY_RUN( + _RESULT + _COMPILE_RESULT + ${_BINARY_TEST_DIR} + ${PROJECT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu + COMPILE_DEFINITIONS -DSM_ONLY + RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY) + ENDIF() + ENDIF() + LIST(FIND KOKKOS_CUDA_ARCH_FLAGS sm_${_CUDA_COMPUTE_CAPABILITY} FLAG_INDEX) IF(_COMPILE_RESULT AND _RESULT EQUAL 0 AND NOT FLAG_INDEX EQUAL -1) MESSAGE(STATUS "Detected CUDA Compute Capability ${_CUDA_COMPUTE_CAPABILITY}") @@ -500,7 +566,7 @@ IF (KOKKOS_ENABLE_CUDA) SET(KOKKOS_ARCH_VOLTA ON) ENDIF() - IF (KOKKOS_ARCH_AMPERE80) + IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86) SET(KOKKOS_ARCH_AMPERE ON) ENDIF() ENDIF() diff --git a/packages/kokkos/cmake/kokkos_compiler_id.cmake b/packages/kokkos/cmake/kokkos_compiler_id.cmake index e6600161f9fe..4434d6928f46 100644 --- a/packages/kokkos/cmake/kokkos_compiler_id.cmake +++ b/packages/kokkos/cmake/kokkos_compiler_id.cmake @@ -27,6 +27,12 @@ IF(Kokkos_ENABLE_CUDA) PATHS ${PROJECT_SOURCE_DIR} PATH_SUFFIXES bin) + FIND_PROGRAM(Kokkos_NVCC_WRAPPER + NAMES nvcc_wrapper + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin) + # check if compiler was set to nvcc_wrapper kokkos_internal_have_compiler_nvcc(${CMAKE_CXX_COMPILER}) # if launcher was found and nvcc_wrapper was not specified as @@ -37,7 +43,7 @@ IF(Kokkos_ENABLE_CUDA) # if the second argument matches the C++ compiler, it forwards the rest of the # args to nvcc_wrapper 
kokkos_internal_have_compiler_nvcc( - ${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER} -DKOKKOS_DEPENDENCE) + ${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER} -DKOKKOS_DEPENDENCE) SET(INTERNAL_USE_COMPILER_LAUNCHER true) ENDIF() ENDIF() @@ -55,32 +61,7 @@ IF(INTERNAL_HAVE_COMPILER_NVCC) SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) MESSAGE(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") IF(INTERNAL_USE_COMPILER_LAUNCHER) - IF(Kokkos_LAUNCH_COMPILER_INFO) - GET_FILENAME_COMPONENT(BASE_COMPILER_NAME ${CMAKE_CXX_COMPILER} NAME) - # does not have STATUS intentionally - MESSAGE("") - MESSAGE("Kokkos_LAUNCH_COMPILER_INFO (${Kokkos_COMPILE_LAUNCHER}):") - MESSAGE(" - Kokkos + CUDA backend requires the C++ files to be compiled as CUDA code.") - MESSAGE(" - kokkos_launch_compiler permits CMAKE_CXX_COMPILER to be set to a traditional C++ compiler when Kokkos_ENABLE_CUDA=ON") - MESSAGE(" by prefixing all the compile and link commands with the path to the script + CMAKE_CXX_COMPILER (${CMAKE_CXX_COMPILER}).") - MESSAGE(" - If any of the compile or link commands have CMAKE_CXX_COMPILER as the first argument, it replaces CMAKE_CXX_COMPILER with nvcc_wrapper.") - MESSAGE(" - If the compile or link command is not CMAKE_CXX_COMPILER, it just executes the command.") - MESSAGE(" - If using ccache, set CMAKE_CXX_COMPILER to nvcc_wrapper explicitly.") - MESSAGE(" - kokkos_compiler_launcher is available to downstream projects as well.") - MESSAGE(" - If CMAKE_CXX_COMPILER=nvcc_wrapper, all legacy behavior will be preserved during 'find_package(Kokkos)'") - MESSAGE(" - If CMAKE_CXX_COMPILER is not nvcc_wrapper, 'find_package(Kokkos)' will apply 'kokkos_compilation(GLOBAL)' unless separable compilation is enabled") - MESSAGE(" - This can be disabled via '-DKokkos_LAUNCH_COMPILER=OFF'") - MESSAGE(" - Use 'find_package(Kokkos COMPONENTS separable_compilation)' to enable 
separable compilation") - MESSAGE(" - Separable compilation allows you to control the scope of where the compiler transformation behavior (${BASE_COMPILER_NAME} -> nvcc_wrapper) is applied") - MESSAGE(" - The compiler transformation can be applied on a per-project, per-directory, per-target, and/or per-source-file basis") - MESSAGE(" - 'kokkos_compilation(PROJECT)' will apply the compiler transformation to all targets in a project/subproject") - MESSAGE(" - 'kokkos_compilation(TARGET [...])' will apply the compiler transformation to the specified target(s)") - MESSAGE(" - 'kokkos_compilation(SOURCE [...])' will apply the compiler transformation to the specified source file(s)") - MESSAGE(" - 'kokkos_compilation(DIRECTORY [...])' will apply the compiler transformation to the specified directories") - MESSAGE("") - ELSE() - MESSAGE(STATUS "kokkos_launch_compiler (${Kokkos_COMPILE_LAUNCHER}) is enabled... Set Kokkos_LAUNCH_COMPILER_INFO=ON for more info.") - ENDIF() + MESSAGE(STATUS "kokkos_launch_compiler (${Kokkos_COMPILE_LAUNCHER}) is enabled...") kokkos_compilation(GLOBAL) ENDIF() ENDIF() @@ -92,7 +73,11 @@ IF(Kokkos_ENABLE_HIP) OUTPUT_STRIP_TRAILING_WHITESPACE) STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} ) - SET(KOKKOS_CXX_COMPILER_ID HIP CACHE STRING INTERNAL FORCE) + + STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "HIP version" INTERNAL_COMPILER_VERSION_CONTAINS_HIP) + IF(INTERNAL_COMPILER_VERSION_CONTAINS_HIP GREATER -1) + SET(KOKKOS_CXX_COMPILER_ID HIPCC CACHE STRING INTERNAL FORCE) + ENDIF() STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) @@ -103,8 +88,7 @@ ENDIF() IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) # The Cray compiler reports as Clang to most versions of CMake EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - COMMAND grep Cray - COMMAND wc -l + COMMAND grep -c Cray OUTPUT_VARIABLE INTERNAL_HAVE_CRAY_COMPILER 
OUTPUT_STRIP_TRAILING_WHITESPACE) IF (INTERNAL_HAVE_CRAY_COMPILER) #not actually Clang @@ -112,8 +96,7 @@ IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) ENDIF() # The clang based Intel compiler reports as Clang to most versions of CMake EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - COMMAND grep icpx - COMMAND wc -l + COMMAND grep -c "DPC++\\|icpx" OUTPUT_VARIABLE INTERNAL_HAVE_INTEL_COMPILER OUTPUT_STRIP_TRAILING_WHITESPACE) IF (INTERNAL_HAVE_INTEL_COMPILER) #not actually Clang @@ -174,7 +157,7 @@ ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() SET(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE) -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL HIP) +ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 3.8.0) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() diff --git a/packages/kokkos/cmake/kokkos_corner_cases.cmake b/packages/kokkos/cmake/kokkos_corner_cases.cmake index 3962c4b16efb..a84ac2b63027 100644 --- a/packages/kokkos/cmake/kokkos_corner_cases.cmake +++ b/packages/kokkos/cmake/kokkos_corner_cases.cmake @@ -49,11 +49,14 @@ ENDIF() IF (KOKKOS_CXX_STANDARD STREQUAL 17) IF (KOKKOS_CXX_COMPILER_ID STREQUAL GNU AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 7) - MESSAGE(FATAL_ERROR "You have requested c++17 support for GCC ${KOKKOS_CXX_COMPILER_VERSION}. Although CMake has allowed this and GCC accepts -std=c++1z/c++17, GCC <= 6 does not properly support *this capture. Please reduce the C++ standard to 14 or upgrade the compiler if you do need C++17 support.") + MESSAGE(FATAL_ERROR "You have requested C++17 support for GCC ${KOKKOS_CXX_COMPILER_VERSION}. Although CMake has allowed this and GCC accepts -std=c++1z/c++17, GCC < 7 does not properly support *this capture. 
Please reduce the C++ standard to 14 or upgrade the compiler if you do need C++17 support.") ENDIF() IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 11) - MESSAGE(FATAL_ERROR "You have requested c++17 support for NVCC ${KOKKOS_CXX_COMPILER_VERSION}. NVCC only supports C++17 from version 11 on. Please reduce the C++ standard to 14 or upgrade the compiler if you need C++17 support.") + MESSAGE(FATAL_ERROR "You have requested C++17 support for NVCC ${KOKKOS_CXX_COMPILER_VERSION}. NVCC only supports C++17 from version 11 on. Please reduce the C++ standard to 14 or upgrade the compiler if you need C++17 support.") + ENDIF() + IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND KOKKOS_ENABLE_CUDA_CONSTEXPR) + MESSAGE(WARNING "You have requested -DKokkos_ENABLE_CUDA_CONSTEXPR=ON with C++17 support for NVCC ${KOKKOS_CXX_COMPILER_VERSION} which is known to trigger compiler bugs. See https://github.com/kokkos/kokkos/issues/3496") ENDIF() ENDIF() diff --git a/packages/kokkos/cmake/kokkos_enable_devices.cmake b/packages/kokkos/cmake/kokkos_enable_devices.cmake index 41ee10a8a05c..445dad47ce56 100644 --- a/packages/kokkos/cmake/kokkos_enable_devices.cmake +++ b/packages/kokkos/cmake/kokkos_enable_devices.cmake @@ -48,9 +48,6 @@ IF(KOKKOS_ENABLE_OPENMP) IF(KOKKOS_CLANG_IS_CRAY) SET(ClangOpenMPFlag -fopenmp) ENDIF() - IF(KOKKOS_CLANG_IS_INTEL) - SET(ClangOpenMPFlag -fiopenmp) - ENDIF() IF(KOKKOS_COMPILER_CLANG_MSVC) #for clang-cl expression /openmp yields an error, so directly add the specific Clang flag SET(ClangOpenMPFlag /clang:-fopenmp=libomp) @@ -64,6 +61,7 @@ IF(KOKKOS_ENABLE_OPENMP) COMPILER_SPECIFIC_FLAGS( COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Clang -Xcompiler ${ClangOpenMPFlag} + IntelClang -Xcompiler -fiopenmp PGI -Xcompiler -mp Cray NO-VALUE-SPECIFIED XL -Xcompiler -qsmp=omp @@ -72,6 +70,7 @@ IF(KOKKOS_ENABLE_OPENMP) ELSE() COMPILER_SPECIFIC_FLAGS( Clang ${ClangOpenMPFlag} + IntelClang -fiopenmp AppleClang -Xpreprocessor -fopenmp 
PGI -mp Cray NO-VALUE-SPECIFIED @@ -152,3 +151,11 @@ IF (KOKKOS_ENABLE_HIP) ENDIF() KOKKOS_DEVICE_OPTION(SYCL OFF DEVICE "Whether to build SYCL backend") + +## SYCL has extra setup requirements, turn on Kokkos_Setup_SYCL.hpp in macros +IF (KOKKOS_ENABLE_SYCL) + IF(KOKKOS_CXX_STANDARD LESS 17) + MESSAGE(FATAL_ERROR "SYCL backend requires C++17 or newer!") + ENDIF() + LIST(APPEND DEVICE_SETUP_LIST SYCL) +ENDIF() diff --git a/packages/kokkos/cmake/kokkos_enable_options.cmake b/packages/kokkos/cmake/kokkos_enable_options.cmake index 5df498f37354..95bce66c7bee 100644 --- a/packages/kokkos/cmake/kokkos_enable_options.cmake +++ b/packages/kokkos/cmake/kokkos_enable_options.cmake @@ -48,6 +48,7 @@ KOKKOS_ENABLE_OPTION(COMPILER_WARNINGS OFF "Whether to print all compiler war KOKKOS_ENABLE_OPTION(PROFILING_LOAD_PRINT OFF "Whether to print information about which profiling tools got loaded") KOKKOS_ENABLE_OPTION(TUNING OFF "Whether to create bindings for tuning tools") KOKKOS_ENABLE_OPTION(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops") +KOKKOS_ENABLE_OPTION(LAUNCH_COMPILER ON "Whether to potentially use the launch compiler") IF (KOKKOS_ENABLE_CUDA) SET(KOKKOS_COMPILER_CUDA_VERSION "${KOKKOS_COMPILER_VERSION_MAJOR}${KOKKOS_COMPILER_VERSION_MINOR}") @@ -68,6 +69,15 @@ ELSE() ENDIF() KOKKOS_ENABLE_OPTION(COMPLEX_ALIGN ${COMPLEX_ALIGN_DEFAULT} "Whether to align Kokkos::complex to 2*alignof(RealType)") +IF (KOKKOS_ENABLE_TESTS) + SET(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT ON) +ELSE() + SET(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT OFF) +ENDIF() +KOKKOS_ENABLE_OPTION(HEADER_SELF_CONTAINMENT_TESTS ${HEADER_SELF_CONTAINMENT_TESTS_DEFAULT} "Enable header self-containment unit tests") +IF (NOT KOKKOS_ENABLE_TESTS AND KOKKOS_ENABLE_HEADER_SELF_CONTAINMENT_TESTS) + MESSAGE(WARNING "Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS is ON but Kokkos_ENABLE_TESTS is OFF. 
Option will be ignored.") +ENDIF() IF (KOKKOS_ENABLE_CUDA AND (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)) SET(CUDA_CONSTEXPR_DEFAULT ON) @@ -76,14 +86,14 @@ ELSE() ENDIF() KOKKOS_ENABLE_OPTION(CUDA_CONSTEXPR ${CUDA_CONSTEXPR_DEFAULT} "Whether to activate experimental relaxed constexpr functions") +Kokkos_ENABLE_OPTION(UNSUPPORTED_ARCHS OFF "Whether to allow architectures in backends Kokkos doesn't optimize for") + FUNCTION(check_device_specific_options) CMAKE_PARSE_ARGUMENTS(SOME "" "DEVICE" "OPTIONS" ${ARGN}) IF(NOT KOKKOS_ENABLE_${SOME_DEVICE}) FOREACH(OPTION ${SOME_OPTIONS}) - IF(CMAKE_VERSION VERSION_GREATER_EQUAL 3.14) - IF(NOT DEFINED CACHE{Kokkos_ENABLE_${OPTION}} OR NOT DEFINED CACHE{Kokkos_ENABLE_${SOME_DEVICE}}) - MESSAGE(FATAL_ERROR "Internal logic error: option '${OPTION}' or device '${SOME_DEVICE}' not recognized.") - ENDIF() + IF(NOT DEFINED CACHE{Kokkos_ENABLE_${OPTION}} OR NOT DEFINED CACHE{Kokkos_ENABLE_${SOME_DEVICE}}) + MESSAGE(FATAL_ERROR "Internal logic error: option '${OPTION}' or device '${SOME_DEVICE}' not recognized.") ENDIF() IF(KOKKOS_ENABLE_${OPTION}) MESSAGE(WARNING "Kokkos_ENABLE_${OPTION} is ON but ${SOME_DEVICE} backend is not enabled. 
Option will be ignored.") diff --git a/packages/kokkos/cmake/kokkos_functions.cmake b/packages/kokkos/cmake/kokkos_functions.cmake index 2b17d648b44b..858322394d7a 100644 --- a/packages/kokkos/cmake/kokkos_functions.cmake +++ b/packages/kokkos/cmake/kokkos_functions.cmake @@ -169,9 +169,7 @@ MACRO(kokkos_export_imported_tpl NAME) ENDIF() SET(TPL_LINK_OPTIONS) - IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.13.0") - GET_TARGET_PROPERTY(TPL_LINK_OPTIONS ${NAME} INTERFACE_LINK_OPTIONS) - ENDIF() + GET_TARGET_PROPERTY(TPL_LINK_OPTIONS ${NAME} INTERFACE_LINK_OPTIONS) IF(TPL_LINK_OPTIONS) KOKKOS_APPEND_CONFIG_LINE("INTERFACE_LINK_OPTIONS ${TPL_LINK_OPTIONS}") ENDIF() @@ -230,9 +228,7 @@ MACRO(kokkos_import_tpl NAME) # I have still been getting errors about ROOT variables being ignored # I'm not sure if this is a scope issue - but make sure # the policy is set before we do any find_package calls - IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.12.0") - CMAKE_POLICY(SET CMP0074 NEW) - ENDIF() + CMAKE_POLICY(SET CMP0074 NEW) IF (KOKKOS_ENABLE_${NAME}) #Tack on a TPL here to make sure we avoid using anyone else's find @@ -314,7 +310,7 @@ MACRO(kokkos_create_imported_tpl NAME) CMAKE_PARSE_ARGUMENTS(TPL "INTERFACE" "LIBRARY" - "LINK_LIBRARIES;INCLUDES;COMPILE_OPTIONS;LINK_OPTIONS" + "LINK_LIBRARIES;INCLUDES;COMPILE_DEFINITIONS;COMPILE_OPTIONS;LINK_OPTIONS" ${ARGN}) @@ -334,6 +330,9 @@ MACRO(kokkos_create_imported_tpl NAME) IF(TPL_INCLUDES) TARGET_INCLUDE_DIRECTORIES(${NAME} INTERFACE ${TPL_INCLUDES}) ENDIF() + IF(TPL_COMPILE_DEFINITIONS) + TARGET_COMPILE_DEFINITIONS(${NAME} INTERFACE ${TPL_COMPILE_DEFINITIONS}) + ENDIF() IF(TPL_COMPILE_OPTIONS) TARGET_COMPILE_OPTIONS(${NAME} INTERFACE ${TPL_COMPILE_OPTIONS}) ENDIF() @@ -355,6 +354,10 @@ MACRO(kokkos_create_imported_tpl NAME) SET_TARGET_PROPERTIES(${NAME} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${TPL_INCLUDES}") ENDIF() + IF(TPL_COMPILE_DEFINITIONS) + SET_TARGET_PROPERTIES(${NAME} PROPERTIES + INTERFACE_COMPILE_DEFINITIONS 
"${TPL_COMPILE_DEFINITIONS}") + ENDIF() IF(TPL_COMPILE_OPTIONS) SET_TARGET_PROPERTIES(${NAME} PROPERTIES INTERFACE_COMPILE_OPTIONS "${TPL_COMPILE_OPTIONS}") @@ -770,7 +773,7 @@ FUNCTION(kokkos_link_tpl TARGET) ENDFUNCTION() FUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER) - SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang IntelClang GNU HIP Fujitsu) + SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang IntelClang GNU HIPCC Fujitsu) CMAKE_PARSE_ARGUMENTS( PARSE "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES" @@ -926,6 +929,9 @@ ENDFUNCTION() # DIRECTORY --> all files in directory # PROJECT --> all files/targets in a project/subproject # +# NOTE: this is VERY DIFFERENT than the version in KokkosConfigCommon.cmake.in. +# This version explicitly uses nvcc_wrapper. +# FUNCTION(kokkos_compilation) # check whether the compiler already supports building CUDA KOKKOS_CXX_COMPILER_CUDA_TEST(Kokkos_CXX_COMPILER_COMPILES_CUDA) @@ -947,10 +953,21 @@ FUNCTION(kokkos_compilation) MESSAGE(FATAL_ERROR "Kokkos could not find 'kokkos_launch_compiler'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'") ENDIF() + # find nvcc_wrapper + FIND_PROGRAM(Kokkos_NVCC_WRAPPER + NAMES nvcc_wrapper + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin) + + IF(NOT Kokkos_COMPILE_LAUNCHER) + MESSAGE(FATAL_ERROR "Kokkos could not find 'nvcc_wrapper'. 
Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/nvcc_wrapper'") + ENDIF() + IF(COMP_GLOBAL) # if global, don't bother setting others - SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") - SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") ELSE() FOREACH(_TYPE PROJECT DIRECTORY TARGET SOURCE) # make project/subproject scoping easy, e.g. KokkosCompilation(PROJECT) after project(...) @@ -961,8 +978,8 @@ FUNCTION(kokkos_compilation) # set the properties if defined IF(COMP_${_TYPE}) # MESSAGE(STATUS "Using nvcc_wrapper :: ${_TYPE} :: ${COMP_${_TYPE}}") - SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") - SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") ENDIF() ENDFOREACH() ENDIF() diff --git a/packages/kokkos/cmake/kokkos_test_cxx_std.cmake b/packages/kokkos/cmake/kokkos_test_cxx_std.cmake index 1d7da922eb6e..707fb000af52 100644 --- a/packages/kokkos/cmake/kokkos_test_cxx_std.cmake +++ b/packages/kokkos/cmake/kokkos_test_cxx_std.cmake @@ -86,6 +86,19 @@ ELSE() MESSAGE(FATAL_ERROR "Unknown C++ standard ${KOKKOS_CXX_STANDARD} - must be 14, 17, or 20") ENDIF() +# Enforce that we can compile a simple C++14 program + +TRY_COMPILE(CAN_COMPILE_CPP14 + ${KOKKOS_TOP_BUILD_DIR}/corner_cases + 
${KOKKOS_SOURCE_DIR}/cmake/compile_tests/cplusplus14.cpp + OUTPUT_VARIABLE ERROR_MESSAGE + CXX_STANDARD 14 +) +if (NOT CAN_COMPILE_CPP14) + UNSET(CAN_COMPILE_CPP14 CACHE) #make sure CMake always re-runs this + MESSAGE(FATAL_ERROR "C++${KOKKOS_CXX_STANDARD}-compliant compiler detected, but unable to compile C++14 or later program. Verify that ${CMAKE_CXX_COMPILER_ID}:${CMAKE_CXX_COMPILER_VERSION} is set up correctly (e.g., check that correct library headers are being used).\nFailing output:\n ${ERROR_MESSAGE}") +ENDIF() +UNSET(CAN_COMPILE_CPP14 CACHE) #make sure CMake always re-runs this # Enforce that extensions are turned off for nvcc_wrapper. diff --git a/packages/kokkos/cmake/kokkos_tpls.cmake b/packages/kokkos/cmake/kokkos_tpls.cmake index b58d3696ea9a..d8d044c9d753 100644 --- a/packages/kokkos/cmake/kokkos_tpls.cmake +++ b/packages/kokkos/cmake/kokkos_tpls.cmake @@ -1,5 +1,6 @@ KOKKOS_CFG_DEPENDS(TPLS OPTIONS) KOKKOS_CFG_DEPENDS(TPLS DEVICES) +KOKKOS_CFG_DEPENDS(TPLS COMPILER_ID) FUNCTION(KOKKOS_TPL_OPTION PKG DEFAULT) CMAKE_PARSE_ARGUMENTS(PARSED @@ -38,6 +39,12 @@ IF(KOKKOS_ENABLE_MEMKIND) ENDIF() KOKKOS_TPL_OPTION(CUDA ${Kokkos_ENABLE_CUDA} TRIBITS CUDA) KOKKOS_TPL_OPTION(LIBRT Off) +IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + SET(ROCM_DEFAULT ON) +ELSE() + SET(ROCM_DEFAULT OFF) +ENDIF() +KOKKOS_TPL_OPTION(ROCM ${ROCM_DEFAULT}) IF (WIN32) SET(LIBDL_DEFAULT Off) @@ -70,6 +77,7 @@ KOKKOS_IMPORT_TPL(LIBRT) KOKKOS_IMPORT_TPL(LIBDL) KOKKOS_IMPORT_TPL(MEMKIND) KOKKOS_IMPORT_TPL(PTHREAD INTERFACE) +KOKKOS_IMPORT_TPL(ROCM INTERFACE) #Convert list to newlines (which CMake doesn't always like in cache variables) STRING(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}") diff --git a/packages/kokkos/cmake/kokkos_tribits.cmake b/packages/kokkos/cmake/kokkos_tribits.cmake index 059fb192f051..afa036066afe 100644 --- a/packages/kokkos/cmake/kokkos_tribits.cmake +++ b/packages/kokkos/cmake/kokkos_tribits.cmake @@ -141,39 +141,54 @@ 
FUNCTION(KOKKOS_ADD_EXECUTABLE ROOT_NAME) ENDFUNCTION() FUNCTION(KOKKOS_ADD_EXECUTABLE_AND_TEST ROOT_NAME) -CMAKE_PARSE_ARGUMENTS(PARSE - "" - "" - "SOURCES;CATEGORIES;ARGS" - ${ARGN}) -VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE_AND_TEST ${PARSE_UNPARSED_ARGUMENTS}) - -IF (KOKKOS_HAS_TRILINOS) - IF(DEFINED PARSE_ARGS) - STRING(REPLACE ";" " " PARSE_ARGS "${PARSE_ARGS}") - ENDIF() - TRIBITS_ADD_EXECUTABLE_AND_TEST( - ${ROOT_NAME} - SOURCES ${PARSE_SOURCES} - TESTONLYLIBS kokkos_gtest - NUM_MPI_PROCS 1 - COMM serial mpi - ARGS ${PARSE_ARGS} - CATEGORIES ${PARSE_CATEGORIES} - SOURCES ${PARSE_SOURCES} - FAIL_REGULAR_EXPRESSION " FAILED " - ARGS ${PARSE_ARGS} - ) -ELSE() - KOKKOS_ADD_TEST_EXECUTABLE(${ROOT_NAME} - SOURCES ${PARSE_SOURCES} - ) - KOKKOS_ADD_TEST(NAME ${ROOT_NAME} - EXE ${ROOT_NAME} - FAIL_REGULAR_EXPRESSION " FAILED " - ARGS ${PARSE_ARGS} - ) -ENDIF() + CMAKE_PARSE_ARGUMENTS(PARSE + "" + "" + "SOURCES;CATEGORIES;ARGS" + ${ARGN}) + VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE_AND_TEST ${PARSE_UNPARSED_ARGUMENTS}) + + IF (KOKKOS_HAS_TRILINOS) + IF(DEFINED PARSE_ARGS) + STRING(REPLACE ";" " " PARSE_ARGS "${PARSE_ARGS}") + ENDIF() + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ROOT_NAME} + SOURCES ${PARSE_SOURCES} + TESTONLYLIBS kokkos_gtest + NUM_MPI_PROCS 1 + COMM serial mpi + ARGS ${PARSE_ARGS} + CATEGORIES ${PARSE_CATEGORIES} + SOURCES ${PARSE_SOURCES} + FAIL_REGULAR_EXPRESSION " FAILED " + ARGS ${PARSE_ARGS} + ) + ELSE() + KOKKOS_ADD_TEST_EXECUTABLE(${ROOT_NAME} + SOURCES ${PARSE_SOURCES} + ) + IF (PARSE_ARGS) + SET(TEST_NUMBER 0) + FOREACH (ARG_STR ${PARSE_ARGS}) + # This is passed as a single string blob to match TriBITS behavior + # We need this to be turned into a list + STRING(REPLACE " " ";" ARG_STR_LIST ${ARG_STR}) + LIST(APPEND TEST_NAME "${ROOT_NAME}${TEST_NUMBER}") + MATH(EXPR TEST_NUMBER "${TEST_NUMBER} + 1") + KOKKOS_ADD_TEST(NAME ${TEST_NAME} + EXE ${ROOT_NAME} + FAIL_REGULAR_EXPRESSION " FAILED " + ARGS ${ARG_STR_LIST} + ) + ENDFOREACH() + ELSE() + 
KOKKOS_ADD_TEST(NAME ${ROOT_NAME} + EXE ${ROOT_NAME} + FAIL_REGULAR_EXPRESSION " FAILED " + ) + ENDIF() + ENDIF() ENDFUNCTION() FUNCTION(KOKKOS_SET_EXE_PROPERTY ROOT_NAME) @@ -301,11 +316,26 @@ ENDMACRO() ## Includes generated header files, scripts such as nvcc_wrapper and hpcbind, ## as well as other files provided through plugins. MACRO(KOKKOS_INSTALL_ADDITIONAL_FILES) - # kokkos_launch_compiler is used by Kokkos to prefix compiler commands so that they forward to nvcc_wrapper + + # kokkos_launch_compiler is used by Kokkos to prefix compiler commands so that they forward to original kokkos compiler + # if nvcc_wrapper was not used as CMAKE_CXX_COMPILER, configure the original compiler into kokkos_launch_compiler + IF(NOT "${CMAKE_CXX_COMPILER}" MATCHES "nvcc_wrapper") + SET(NVCC_WRAPPER_DEFAULT_COMPILER "${CMAKE_CXX_COMPILER}") + ELSE() + IF(NOT "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}" STREQUAL "") + SET(NVCC_WRAPPER_DEFAULT_COMPILER "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}") + ENDIF() + ENDIF() + + CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/bin/kokkos_launch_compiler + ${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler + @ONLY) + INSTALL(PROGRAMS "${CMAKE_CURRENT_SOURCE_DIR}/bin/nvcc_wrapper" "${CMAKE_CURRENT_SOURCE_DIR}/bin/hpcbind" "${CMAKE_CURRENT_SOURCE_DIR}/bin/kokkos_launch_compiler" + "${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler" DESTINATION ${CMAKE_INSTALL_BINDIR}) INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h" @@ -313,7 +343,7 @@ MACRO(KOKKOS_INSTALL_ADDITIONAL_FILES) "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_SetupBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_DeclareBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_PostInclude.hpp" - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + DESTINATION ${KOKKOS_HEADER_DIR}) ENDMACRO() FUNCTION(KOKKOS_SET_LIBRARY_PROPERTIES LIBRARY_NAME) @@ -330,24 +360,12 @@ FUNCTION(KOKKOS_SET_LIBRARY_PROPERTIES LIBRARY_NAME) ${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_LINK_OPTIONS}> ) - 
ELSEIF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.13") + ELSE() #I can use link options #just assume CXX linkage TARGET_LINK_OPTIONS( ${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_OPTIONS} ) - ELSE() - #assume CXX linkage, we have no good way to check otherwise - IF (PARSE_PLAIN_STYLE) - TARGET_LINK_LIBRARIES( - ${LIBRARY_NAME} ${KOKKOS_LINK_OPTIONS} - ) - ELSE() - #well, have to do it the wrong way for now - TARGET_LINK_LIBRARIES( - ${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_OPTIONS} - ) - ENDIF() ENDIF() TARGET_COMPILE_OPTIONS( @@ -448,6 +466,13 @@ FUNCTION(KOKKOS_INTERNAL_ADD_LIBRARY LIBRARY_NAME) ${PARSE_SOURCES} ) + IF(PARSE_SHARED OR BUILD_SHARED_LIBS) + SET_TARGET_PROPERTIES(${LIBRARY_NAME} PROPERTIES + VERSION ${Kokkos_VERSION} + SOVERSION ${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR} + ) + ENDIF() + KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(${LIBRARY_NAME}) #In case we are building in-tree, add an alias name diff --git a/packages/kokkos/containers/src/CMakeLists.txt b/packages/kokkos/containers/src/CMakeLists.txt index 7000624b6bcf..98655896d4f3 100644 --- a/packages/kokkos/containers/src/CMakeLists.txt +++ b/packages/kokkos/containers/src/CMakeLists.txt @@ -26,8 +26,6 @@ KOKKOS_ADD_LIBRARY( HEADERS ${KOKKOS_CONTAINER_HEADERS} ) -SET_TARGET_PROPERTIES(kokkoscontainers PROPERTIES VERSION ${Kokkos_VERSION}) - KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscontainers ${KOKKOS_TOP_BUILD_DIR} ${CMAKE_CURRENT_BINARY_DIR} @@ -36,4 +34,3 @@ KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscontainers KOKKOS_LINK_INTERNAL_LIBRARY(kokkoscontainers kokkoscore) #----------------------------------------------------------------------------- - diff --git a/packages/kokkos/containers/src/Kokkos_DualView.hpp b/packages/kokkos/containers/src/Kokkos_DualView.hpp index 689f0eb2ed4e..45710d1f737c 100644 --- a/packages/kokkos/containers/src/Kokkos_DualView.hpp +++ b/packages/kokkos/containers/src/Kokkos_DualView.hpp @@ -91,6 +91,25 @@ namespace Kokkos { * behavior. 
Please see the documentation of Kokkos::View for * examples. The default suffices for most users. */ + +namespace Impl { + +#ifdef KOKKOS_ENABLE_CUDA + +inline const Kokkos::Cuda& get_cuda_space(const Kokkos::Cuda& in) { return in; } + +inline const Kokkos::Cuda& get_cuda_space() { + return *Kokkos::Impl::cuda_get_deep_copy_space(); +} + +template +inline const Kokkos::Cuda& get_cuda_space(const NonCudaExecSpace&) { + return get_cuda_space(); +} + +#endif // KOKKOS_ENABLE_CUDA + +} // namespace Impl template class DualView : public ViewTraits { @@ -295,6 +314,53 @@ class DualView : public ViewTraits { "DualView constructed with incompatible views"); } } + // does the DualView have only one device + struct impl_dualview_is_single_device { + enum : bool { + value = std::is_same::value + }; + }; + + // does the given device match the device of t_dev? + template + struct impl_device_matches_tdev_device { + enum : bool { + value = std::is_same::value + }; + }; + // does the given device match the device of t_host? + template + struct impl_device_matches_thost_device { + enum : bool { + value = std::is_same::value + }; + }; + + // does the given device match the execution space of t_host? + template + struct impl_device_matches_thost_exec { + enum : bool { + value = std::is_same::value + }; + }; + + // does the given device match the execution space of t_dev? + template + struct impl_device_matches_tdev_exec { + enum : bool { + value = std::is_same::value + }; + }; + + // does the given device's memory space match the memory space of t_dev? + template + struct impl_device_matches_tdev_memory_space { + enum : bool { + value = std::is_same::value + }; + }; //@} //! \name Methods for synchronizing, marking as modified, and getting Views. @@ -302,7 +368,7 @@ class DualView : public ViewTraits { /// \brief Return a View on a specific device \c Device. 
/// - /// Please don't be afraid of the if_c expression in the return + /// Please don't be afraid of the nested if_c expressions in the return /// value's type. That just tells the method what the return type /// should be: t_dev if the \c Device template parameter matches /// this DualView's device type, else t_host. @@ -323,10 +389,17 @@ class DualView : public ViewTraits { /// typename dual_view_type::t_host hostView = DV.view (); /// \endcode template - KOKKOS_INLINE_FUNCTION const typename Impl::if_c< - std::is_same::value, - t_dev, t_host>::type& + KOKKOS_INLINE_FUNCTION const typename std::conditional_t< + impl_device_matches_tdev_device::value, t_dev, + typename std::conditional_t< + impl_device_matches_thost_device::value, t_host, + typename std::conditional_t< + impl_device_matches_thost_exec::value, t_host, + typename std::conditional_t< + impl_device_matches_tdev_exec::value, t_dev, + typename std::conditional_t< + impl_device_matches_tdev_memory_space::value, + t_dev, t_host> > > > > view() const { constexpr bool device_is_memspace = std::is_same::value; @@ -463,6 +536,7 @@ class DualView : public ViewTraits { true); } } + /// \brief Update data on device or host only if data in the other /// space has been marked as modified. /// @@ -480,12 +554,9 @@ class DualView : public ViewTraits { /// the data in either View. You must manually mark modified data /// as modified, by calling the modify() method with the /// appropriate template parameter. - template - void sync(const typename std::enable_if< - (std::is_same::value) || - (std::is_same::value), - int>::type& = 0) { + // deliberately passing args by cref as they're used multiple times + template + void sync_impl(std::true_type, Args const&... 
args) { if (modified_flags.data() == nullptr) return; int dev = get_device_side(); @@ -497,12 +568,12 @@ class DualView : public ViewTraits { Kokkos::CudaUVMSpace>::value) { if (d_view.data() == h_view.data()) Kokkos::Impl::cuda_prefetch_pointer( - Kokkos::Cuda(), d_view.data(), + Impl::get_cuda_space(args...), d_view.data(), sizeof(typename t_dev::value_type) * d_view.span(), true); } #endif - deep_copy(d_view, h_view); + deep_copy(args..., d_view, h_view); modified_flags(0) = modified_flags(1) = 0; impl_report_device_sync(); } @@ -514,12 +585,12 @@ class DualView : public ViewTraits { Kokkos::CudaUVMSpace>::value) { if (d_view.data() == h_view.data()) Kokkos::Impl::cuda_prefetch_pointer( - Kokkos::Cuda(), d_view.data(), + Impl::get_cuda_space(args...), d_view.data(), sizeof(typename t_dev::value_type) * d_view.span(), false); } #endif - deep_copy(h_view, d_view); + deep_copy(args..., h_view, d_view); modified_flags(0) = modified_flags(1) = 0; impl_report_host_sync(); } @@ -533,10 +604,26 @@ class DualView : public ViewTraits { template void sync(const typename std::enable_if< - (!std::is_same::value) || + (std::is_same::value) || (std::is_same::value), int>::type& = 0) { + sync_impl(std::true_type{}); + } + + template + void sync(const ExecutionSpace& exec, + const typename std::enable_if< + (std::is_same::value) || + (std::is_same::value), + int>::type& = 0) { + sync_impl(std::true_type{}, exec); + } + + // deliberately passing args by cref as they're used multiple times + template + void sync_impl(std::false_type, Args const&...) 
{ if (modified_flags.data() == nullptr) return; int dev = get_device_side(); @@ -557,7 +644,27 @@ class DualView : public ViewTraits { } } - void sync_host() { + template + void sync(const typename std::enable_if< + (!std::is_same::value) || + (std::is_same::value), + int>::type& = 0) { + sync_impl(std::false_type{}); + } + template + void sync(const ExecutionSpace& exec, + const typename std::enable_if< + (!std::is_same::value) || + (std::is_same::value), + int>::type& = 0) { + sync_impl(std::false_type{}, exec); + } + + // deliberately passing args by cref as they're used multiple times + template + void sync_host_impl(Args const&... args) { if (!std::is_same::value) Impl::throw_runtime_exception( @@ -569,18 +676,26 @@ class DualView : public ViewTraits { Kokkos::CudaUVMSpace>::value) { if (d_view.data() == h_view.data()) Kokkos::Impl::cuda_prefetch_pointer( - Kokkos::Cuda(), d_view.data(), + Impl::get_cuda_space(args...), d_view.data(), sizeof(typename t_dev::value_type) * d_view.span(), false); } #endif - deep_copy(h_view, d_view); + deep_copy(args..., h_view, d_view); modified_flags(1) = modified_flags(0) = 0; impl_report_host_sync(); } } - void sync_device() { + template + void sync_host(const ExecSpace& exec) { + sync_host_impl(exec); + } + void sync_host() { sync_host_impl(); } + + // deliberately passing args by cref as they're used multiple times + template + void sync_device_impl(Args const&... 
args) { if (!std::is_same::value) Impl::throw_runtime_exception( @@ -592,17 +707,23 @@ class DualView : public ViewTraits { Kokkos::CudaUVMSpace>::value) { if (d_view.data() == h_view.data()) Kokkos::Impl::cuda_prefetch_pointer( - Kokkos::Cuda(), d_view.data(), + Impl::get_cuda_space(args...), d_view.data(), sizeof(typename t_dev::value_type) * d_view.span(), true); } #endif - deep_copy(d_view, h_view); + deep_copy(args..., d_view, h_view); modified_flags(1) = modified_flags(0) = 0; impl_report_device_sync(); } } + template + void sync_device(const ExecSpace& exec) { + sync_device_impl(exec); + } + void sync_device() { sync_device_impl(); } + template bool need_sync() const { if (modified_flags.data() == nullptr) return false; @@ -658,6 +779,7 @@ class DualView : public ViewTraits { template void modify() { if (modified_flags.data() == nullptr) return; + if (impl_dualview_is_single_device::value) return; int dev = get_device_side(); if (dev == 1) { // if Device is the same as DualView's device type @@ -690,6 +812,7 @@ class DualView : public ViewTraits { } inline void modify_host() { + if (impl_dualview_is_single_device::value) return; if (modified_flags.data() != nullptr) { modified_flags(0) = (modified_flags(1) > modified_flags(0) ? modified_flags(1) @@ -710,6 +833,7 @@ class DualView : public ViewTraits { } inline void modify_device() { + if (impl_dualview_is_single_device::value) return; if (modified_flags.data() != nullptr) { modified_flags(1) = (modified_flags(1) > modified_flags(0) ? 
modified_flags(1) diff --git a/packages/kokkos/containers/src/Kokkos_DynRankView.hpp b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp index c66d7a5f36ca..c6323fef9369 100644 --- a/packages/kokkos/containers/src/Kokkos_DynRankView.hpp +++ b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp @@ -245,13 +245,10 @@ KOKKOS_INLINE_FUNCTION bool dyn_rank_view_verify_operator_bounds( return (size_t(i) < map.extent(R)) && dyn_rank_view_verify_operator_bounds(rank, map, args...); } else if (i != 0) { - // FIXME_SYCL SYCL doesn't allow printf in kernels -#ifndef KOKKOS_ENABLE_SYCL - printf( + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "DynRankView Debug Bounds Checking Error: at rank %u\n Extra " "arguments beyond the rank must be zero \n", R); -#endif return (false) && dyn_rank_view_verify_operator_bounds(rank, map, args...); } else { @@ -575,37 +572,22 @@ class DynRankView : public ViewTraits { (is_layout_left || is_layout_right || is_layout_stride) }; - template ::accessible> - struct verify_space { - KOKKOS_FORCEINLINE_FUNCTION static void check() {} - }; - - template - struct verify_space { - KOKKOS_FORCEINLINE_FUNCTION static void check() { - Kokkos::abort( - "Kokkos::DynRankView ERROR: attempt to access inaccessible memory " - "space"); - }; - }; - // Bounds checking macros #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) // rank of the calling operator - included as first argument in ARG -#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(ARG) \ - DynRankView::template verify_space< \ - Kokkos::Impl::ActiveExecutionMemorySpace>::check(); \ - Kokkos::Impl::dyn_rank_view_verify_operator_bounds< \ - typename traits::memory_space> \ +#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(ARG) \ + Kokkos::Impl::verify_space::check(); \ + Kokkos::Impl::dyn_rank_view_verify_operator_bounds< \ + typename traits::memory_space> \ ARG; #else -#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(ARG) \ - DynRankView::template verify_space< \ - Kokkos::Impl::ActiveExecutionMemorySpace>::check(); +#define 
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(ARG) \ + Kokkos::Impl::verify_space::check(); #endif diff --git a/packages/kokkos/containers/src/Kokkos_DynamicView.hpp b/packages/kokkos/containers/src/Kokkos_DynamicView.hpp index 06bd55666199..cc949d4c556a 100644 --- a/packages/kokkos/containers/src/Kokkos_DynamicView.hpp +++ b/packages/kokkos/containers/src/Kokkos_DynamicView.hpp @@ -76,6 +76,12 @@ struct ChunkArraySpace { using memory_space = typename Kokkos::Experimental::HIPHostPinnedSpace; }; #endif +#ifdef KOKKOS_ENABLE_SYCL +template <> +struct ChunkArraySpace { + using memory_space = typename Kokkos::Experimental::SYCLSharedUSMSpace; +}; +#endif } // end namespace Impl /** \brief Dynamic views are restricted to rank-one and no layout. diff --git a/packages/kokkos/containers/src/Kokkos_OffsetView.hpp b/packages/kokkos/containers/src/Kokkos_OffsetView.hpp index 4fd084338ed7..0f21a08ba3ba 100644 --- a/packages/kokkos/containers/src/Kokkos_OffsetView.hpp +++ b/packages/kokkos/containers/src/Kokkos_OffsetView.hpp @@ -377,34 +377,20 @@ class OffsetView : public ViewTraits { std::is_same::value && (is_layout_left || is_layout_right || is_layout_stride); - template ::accessible> - struct verify_space { - KOKKOS_FORCEINLINE_FUNCTION static void check() {} - }; - - template - struct verify_space { - KOKKOS_FORCEINLINE_FUNCTION static void check() { - Kokkos::abort( - "Kokkos::View ERROR: attempt to access inaccessible memory space"); - }; - }; - #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) -#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG) \ - OffsetView::template verify_space< \ - Kokkos::Impl::ActiveExecutionMemorySpace>::check(); \ - Kokkos::Experimental::Impl::offsetview_verify_operator_bounds< \ - typename traits::memory_space> \ +#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG) \ + Kokkos::Impl::verify_space::check(); \ + Kokkos::Experimental::Impl::offsetview_verify_operator_bounds< \ + typename traits::memory_space> \ ARG; #else -#define 
KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG) \ - OffsetView::template verify_space< \ - Kokkos::Impl::ActiveExecutionMemorySpace>::check(); +#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG) \ + Kokkos::Impl::verify_space::check(); #endif public: diff --git a/packages/kokkos/containers/src/Kokkos_ScatterView.hpp b/packages/kokkos/containers/src/Kokkos_ScatterView.hpp index 2e3ad30d469c..dcd4cf73e5d7 100644 --- a/packages/kokkos/containers/src/Kokkos_ScatterView.hpp +++ b/packages/kokkos/containers/src/Kokkos_ScatterView.hpp @@ -834,8 +834,8 @@ class ScatterView::value, "ScatterView contribute destination has different layout"); static_assert( - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< - memory_space, typename dest_type::memory_space>::value, + Kokkos::Impl::SpaceAccessibility< + execution_space, typename dest_type::memory_space>::accessible, "ScatterView contribute destination memory space not accessible"); if (dest.data() == internal_view.data()) return; Kokkos::Impl::Experimental::ReduceDuplicates::value, "ScatterView deep_copy destination has different layout"); static_assert( - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< - memory_space, typename dest_type::memory_space>::value, + Kokkos::Impl::SpaceAccessibility< + execution_space, typename dest_type::memory_space>::accessible, "ScatterView deep_copy destination memory space not accessible"); bool is_equal = (dest.data() == internal_view.data()); size_t start = is_equal ? 
1 : 0; @@ -1290,8 +1290,8 @@ class ScatterView::value, "ScatterView deep_copy destination has different layout"); static_assert( - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< - memory_space, typename dest_type::memory_space>::value, + Kokkos::Impl::SpaceAccessibility< + execution_space, typename dest_type::memory_space>::accessible, "ScatterView deep_copy destination memory space not accessible"); auto extent = internal_view.extent(internal_view_type::rank - 1); bool is_equal = (dest.data() == internal_view.data()); @@ -1439,21 +1439,21 @@ template ::array_layout, typename ViewTraits::device_type, Op, - typename Kokkos::Impl::if_c< + std::conditional_t< std::is_same::value, typename Kokkos::Impl::Experimental::DefaultDuplication< typename ViewTraits::execution_space>::type, - Duplication>::type, - typename Kokkos::Impl::if_c< + Duplication>, + std::conditional_t< std::is_same::value, typename Kokkos::Impl::Experimental::DefaultContribution< typename ViewTraits::execution_space, - typename Kokkos::Impl::if_c< + typename std::conditional_t< std::is_same::value, typename Kokkos::Impl::Experimental::DefaultDuplication< typename ViewTraits::execution_space>::type, - Duplication>::type>::type, - Contribution>::type> + Duplication>>::type, + Contribution>> create_scatter_view(View const& original_view) { return original_view; // implicit ScatterView constructor call } diff --git a/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp index d2affda93aff..edb0e7261da9 100644 --- a/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp +++ b/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp @@ -264,26 +264,24 @@ class UnorderedMap { private: enum : size_type { invalid_index = ~static_cast(0) }; - using impl_value_type = - typename Impl::if_c::type; + using impl_value_type = std::conditional_t; - using key_type_view = typename Impl::if_c< + using key_type_view = std::conditional_t< is_insertable_map, View, 
- View > >::type; + View > >; - using value_type_view = - typename Impl::if_c, - View > >::type; + using value_type_view = std::conditional_t< + is_insertable_map || is_modifiable_map, + View, + View > >; - using size_type_view = typename Impl::if_c< + using size_type_view = std::conditional_t< is_insertable_map, View, - View > >::type; + View > >; using bitset_type = - typename Impl::if_c, - ConstBitset >::type; + std::conditional_t, + ConstBitset >; enum { modified_idx = 0, erasable_idx = 1, failed_insert_idx = 2 }; enum { num_scalars = 3 }; @@ -540,10 +538,7 @@ class UnorderedMap { // Previously claimed an unused entry that was not inserted. // Release this unused entry immediately. if (!m_available_indexes.reset(new_index)) { - // FIXME_SYCL SYCL doesn't allow printf in kernels -#ifndef KOKKOS_ENABLE_SYCL - printf("Unable to free existing\n"); -#endif + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Unable to free existing\n"); } } @@ -659,8 +654,8 @@ class UnorderedMap { /// /// 'const value_type' via Cuda texture fetch must return by value. KOKKOS_FORCEINLINE_FUNCTION - typename Impl::if_c<(is_set || has_const_value), impl_value_type, - impl_value_type &>::type + std::conditional_t<(is_set || has_const_value), impl_value_type, + impl_value_type &> value_at(size_type i) const { return m_values[is_set ? 0 : (i < capacity() ? i : capacity())]; } diff --git a/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp b/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp index 6e450598d1eb..6047e60f3dd0 100644 --- a/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp +++ b/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp @@ -57,10 +57,22 @@ namespace Kokkos { namespace Impl { +KOKKOS_FORCEINLINE_FUNCTION +unsigned rotate_left(unsigned i, int r) { + constexpr int size = static_cast(sizeof(unsigned) * CHAR_BIT); + return r ? 
((i << r) | (i >> (size - r))) : i; +} + KOKKOS_FORCEINLINE_FUNCTION unsigned rotate_right(unsigned i, int r) { - enum { size = static_cast(sizeof(unsigned) * CHAR_BIT) }; + constexpr int size = static_cast(sizeof(unsigned) * CHAR_BIT); + // FIXME_SYCL llvm.fshr.i32 missing + // (https://github.com/intel/llvm/issues/3308) +#ifdef __SYCL_DEVICE_ONLY__ + return rotate_left(i, size - r); +#else return r ? ((i >> r) | (i << (size - r))) : i; +#endif } template diff --git a/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp b/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp index b06ab0846c9a..d7c4a5d1ffdf 100644 --- a/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp +++ b/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp @@ -250,8 +250,8 @@ struct UnorderedMapPrint { uint32_t list = m_map.m_hash_lists(i); for (size_type curr = list, ii = 0; curr != invalid_index; curr = m_map.m_next_index[curr], ++ii) { - printf("%d[%d]: %d->%d\n", list, ii, m_map.key_at(curr), - m_map.value_at(curr)); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d[%d]: %d->%d\n", list, ii, + m_map.key_at(curr), m_map.value_at(curr)); } } }; diff --git a/packages/kokkos/containers/unit_tests/CMakeLists.txt b/packages/kokkos/containers/unit_tests/CMakeLists.txt index c84c5f6d5ec3..947d222c273d 100644 --- a/packages/kokkos/containers/unit_tests/CMakeLists.txt +++ b/packages/kokkos/containers/unit_tests/CMakeLists.txt @@ -2,6 +2,7 @@ KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) +KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL) # Because there is always an exception to the rule @@ -41,11 +42,6 @@ foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL) configure_file(${dir}/dummy.cpp ${file}) list(APPEND 
UnitTestSources ${file}) endforeach() - list(REMOVE_ITEM UnitTestSources - ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_Bitset.cpp - ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_ScatterView.cpp - ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_UnorderedMap.cpp - ) KOKKOS_ADD_EXECUTABLE_AND_TEST(UnitTest_${Tag} SOURCES ${UnitTestSources}) endif() endforeach() diff --git a/packages/kokkos/containers/unit_tests/Makefile b/packages/kokkos/containers/unit_tests/Makefile index f42b9b751907..82669fe1ab75 100644 --- a/packages/kokkos/containers/unit_tests/Makefile +++ b/packages/kokkos/containers/unit_tests/Makefile @@ -26,7 +26,7 @@ override LDFLAGS += -lpthread include $(KOKKOS_PATH)/Makefile.kokkos -KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/containers/unit_tests +KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/containers/unit_tests -I${KOKKOS_PATH}/core/unit_test/category_files TEST_TARGETS = TARGETS = diff --git a/packages/kokkos/containers/unit_tests/TestDualView.hpp b/packages/kokkos/containers/unit_tests/TestDualView.hpp index 531caf0f85ce..3eee85ed10bd 100644 --- a/packages/kokkos/containers/unit_tests/TestDualView.hpp +++ b/packages/kokkos/containers/unit_tests/TestDualView.hpp @@ -114,6 +114,8 @@ struct test_dualview_combinations { a.template modify(); a.template sync(); + a.template sync( + Kokkos::DefaultExecutionSpace{}); a.h_view(5, 1) = 3; a.h_view(6, 1) = 4; @@ -122,11 +124,15 @@ struct test_dualview_combinations { ViewType b = Kokkos::subview(a, std::pair(6, 9), std::pair(0, 1)); a.template sync(); + a.template sync( + Kokkos::DefaultExecutionSpace{}); b.template modify(); Kokkos::deep_copy(b.d_view, 2); a.template sync(); + a.template sync( + Kokkos::DefaultExecutionSpace{}); Scalar count = 0; for (unsigned int i = 0; i < a.d_view.extent(0); i++) for (unsigned int j = 0; j < a.d_view.extent(1); j++) @@ -180,6 +186,7 @@ struct test_dual_view_deep_copy { } else { a.modify_device(); a.sync_host(); + a.sync_host(Kokkos::DefaultExecutionSpace{}); } // Check 
device view is initialized as expected @@ -208,6 +215,7 @@ struct test_dual_view_deep_copy { b.template sync(); } else { b.sync_host(); + b.sync_host(Kokkos::DefaultExecutionSpace{}); } // Perform same checks on b as done on a @@ -302,6 +310,7 @@ struct test_dualview_resize { ASSERT_EQ(a.extent(1), m / factor); a.sync_device(); + a.sync_device(Kokkos::DefaultExecutionSpace{}); // Check device view is initialized as expected a_d_sum = 0; @@ -404,19 +413,14 @@ void test_dualview_resize() { Impl::test_dualview_resize(); } -// FIXME_SYCL requires MDRange policy -#ifndef KOKKOS_ENABLE_SYCL TEST(TEST_CATEGORY, dualview_combination) { test_dualview_combinations(10, true); } -#endif TEST(TEST_CATEGORY, dualview_alloc) { test_dualview_alloc(10); } -// FIXME_SYCL requires MDRange policy -#ifndef KOKKOS_ENABLE_SYCL TEST(TEST_CATEGORY, dualview_combinations_without_init) { test_dualview_combinations(10, false); } @@ -433,8 +437,133 @@ TEST(TEST_CATEGORY, dualview_realloc) { TEST(TEST_CATEGORY, dualview_resize) { test_dualview_resize(); } + +namespace { +/** + * + * The following tests are a response to + * https://github.com/kokkos/kokkos/issues/3850 + * and + * https://github.com/kokkos/kokkos/pull/3857 + * + * DualViews were returning incorrect view types and taking + * inappropriate actions based on the templated view methods. + * + * Specifically, template view methods were always returning + * a device view if the memory space was UVM and a Kokkos::Device was passed. + * Sync/modify methods completely broke down So these tests exist to make sure + * that we keep the semantics of UVM DualViews intact. 
+ */ +// modify if we have other UVM enabled backends +#ifdef KOKKOS_ENABLE_CUDA // OR other UVM builds +#define UVM_ENABLED_BUILD +#endif + +#ifdef UVM_ENABLED_BUILD +template +struct UVMSpaceFor; +#endif + +#ifdef KOKKOS_ENABLE_CUDA // specific to CUDA +template <> +struct UVMSpaceFor { + using type = Kokkos::CudaUVMSpace; +}; +#endif + +#ifdef UVM_ENABLED_BUILD +template <> +struct UVMSpaceFor { + using type = typename UVMSpaceFor::type; +}; +#else +template +struct UVMSpaceFor { + using type = typename ExecSpace::memory_space; +}; #endif +using ExecSpace = Kokkos::DefaultExecutionSpace; +using MemSpace = typename UVMSpaceFor::type; +using DeviceType = Kokkos::Device; + +using DualViewType = Kokkos::DualView; +using d_device = DeviceType; +using h_device = Kokkos::Device< + Kokkos::DefaultHostExecutionSpace, + typename UVMSpaceFor::type>; + +TEST(TEST_CATEGORY, dualview_device_correct_kokkos_device) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + auto v_d = dv.template view(); + using vdt = decltype(v_d); + using vdt_d = vdt::device_type; + using vdt_d_e = vdt_d::execution_space; + ASSERT_STREQ(vdt_d_e::name(), Kokkos::DefaultExecutionSpace::name()); +} +TEST(TEST_CATEGORY, dualview_host_correct_kokkos_device) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + auto v_h = dv.template view(); + using vht = decltype(v_h); + using vht_d = vht::device_type; + using vht_d_e = vht_d::execution_space; + ASSERT_STREQ(vht_d_e::name(), Kokkos::DefaultHostExecutionSpace::name()); +} + +TEST(TEST_CATEGORY, dualview_host_modify_template_device_sync) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + dv.modify_host(); + dv.template sync(); + EXPECT_TRUE(!dv.need_sync_device()); + EXPECT_TRUE(!dv.need_sync_host()); + dv.clear_sync_state(); +} + +TEST(TEST_CATEGORY, dualview_host_modify_template_device_execspace_sync) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + dv.modify_host(); + dv.template sync(); + 
EXPECT_TRUE(!dv.need_sync_device()); + EXPECT_TRUE(!dv.need_sync_host()); + dv.clear_sync_state(); +} + +TEST(TEST_CATEGORY, dualview_device_modify_template_host_sync) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + dv.modify_device(); + dv.template sync(); + EXPECT_TRUE(!dv.need_sync_device()); + EXPECT_TRUE(!dv.need_sync_host()); + dv.clear_sync_state(); +} +TEST(TEST_CATEGORY, dualview_device_modify_template_host_execspace_sync) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + dv.modify_device(); + dv.template sync(); + EXPECT_TRUE(!dv.need_sync_device()); + EXPECT_TRUE(!dv.need_sync_host()); + dv.clear_sync_state(); +} + +TEST(TEST_CATEGORY, + dualview_template_views_return_correct_executionspace_views) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + using hvt = decltype(dv.view()); + using dvt = decltype(dv.view()); + ASSERT_STREQ(Kokkos::DefaultExecutionSpace::name(), + dvt::device_type::execution_space::name()); + ASSERT_STREQ(Kokkos::DefaultHostExecutionSpace::name(), + hvt::device_type::execution_space::name()); +} + +} // anonymous namespace } // namespace Test #endif // KOKKOS_TEST_DUALVIEW_HPP diff --git a/packages/kokkos/containers/unit_tests/TestDynamicView.hpp b/packages/kokkos/containers/unit_tests/TestDynamicView.hpp index 4b9f99441724..f018793dd6f3 100644 --- a/packages/kokkos/containers/unit_tests/TestDynamicView.hpp +++ b/packages/kokkos/containers/unit_tests/TestDynamicView.hpp @@ -243,8 +243,6 @@ struct TestDynamicView { } }; -// FIXME_SYCL needs resize_serial -#ifndef KOKKOS_ENABLE_SYCL TEST(TEST_CATEGORY, dynamic_view) { using TestDynView = TestDynamicView; @@ -252,7 +250,6 @@ TEST(TEST_CATEGORY, dynamic_view) { TestDynView::run(100000 + 100 * i); } } -#endif } // namespace Test diff --git a/packages/kokkos/containers/unit_tests/TestHIP_Category.hpp b/packages/kokkos/containers/unit_tests/TestHIP_Category.hpp deleted file mode 100644 index c2d60d18148b..000000000000 --- 
a/packages/kokkos/containers/unit_tests/TestHIP_Category.hpp +++ /dev/null @@ -1,51 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_TEST_HIP_HPP -#define KOKKOS_TEST_HIP_HPP - -#define TEST_CATEGORY hip -#define TEST_EXECSPACE Kokkos::Experimental::HIP - -#endif diff --git a/packages/kokkos/containers/unit_tests/TestHPX_Category.hpp b/packages/kokkos/containers/unit_tests/TestHPX_Category.hpp deleted file mode 100644 index 64fc7c0757ba..000000000000 --- a/packages/kokkos/containers/unit_tests/TestHPX_Category.hpp +++ /dev/null @@ -1,51 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_TEST_HPX_HPP -#define KOKKOS_TEST_HPX_HPP - -#define TEST_CATEGORY hpx -#define TEST_EXECSPACE Kokkos::Experimental::HPX - -#endif diff --git a/packages/kokkos/containers/unit_tests/TestOffsetView.hpp b/packages/kokkos/containers/unit_tests/TestOffsetView.hpp index 802813b13b81..9ddc226e291f 100644 --- a/packages/kokkos/containers/unit_tests/TestOffsetView.hpp +++ b/packages/kokkos/containers/unit_tests/TestOffsetView.hpp @@ -130,8 +130,6 @@ void test_offsetview_construction() { } } - // FIXME_SYCL requires MDRange policy -#ifndef KOKKOS_ENABLE_SYCL const int ovmin0 = ov.begin(0); const int ovend0 = ov.end(0); const int ovmin1 = ov.begin(1); @@ -178,7 +176,6 @@ void test_offsetview_construction() { } ASSERT_EQ(OVResult, answer) << "Bad data found in OffsetView"; -#endif #endif { @@ -215,8 +212,6 @@ void test_offsetview_construction() { point3_type{{extent0, extent1, extent2}}); #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) - // FIXME_SYCL requires MDRange policy -#ifdef KOKKOS_ENABLE_SYCL int view3DSum = 0; Kokkos::parallel_reduce( rangePolicy3DZero, @@ -239,7 +234,6 @@ void test_offsetview_construction() { ASSERT_EQ(view3DSum, offsetView3DSum) << "construction of OffsetView from View and begins array broken."; -#endif #endif } view_type 
viewFromOV = ov.view(); @@ -266,8 +260,6 @@ void test_offsetview_construction() { Kokkos::deep_copy(aView, ov); #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) - // FIXME_SYCL requires MDRange policy -#ifndef KOKKOS_ENABLE_SYCL int sum = 0; Kokkos::parallel_reduce( rangePolicy2D, @@ -277,7 +269,6 @@ void test_offsetview_construction() { sum); ASSERT_EQ(sum, 0) << "deep_copy(view, offsetView) broken."; -#endif #endif } @@ -288,8 +279,6 @@ void test_offsetview_construction() { Kokkos::deep_copy(ov, aView); #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) - // FIXME_SYCL requires MDRange policy -#ifndef KOKKOS_ENABLE_SYCL int sum = 0; Kokkos::parallel_reduce( rangePolicy2D, @@ -299,7 +288,6 @@ void test_offsetview_construction() { sum); ASSERT_EQ(sum, 0) << "deep_copy(offsetView, view) broken."; -#endif #endif } } @@ -471,8 +459,6 @@ void test_offsetview_subview() { ASSERT_EQ(offsetSubview.end(1), 9); #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) - // FIXME_SYCL requires MDRange policy -#ifndef KOKKOS_ENABLE_SYCL using range_type = Kokkos::MDRangePolicy, Kokkos::IndexType >; using point_type = typename range_type::point_type; @@ -498,7 +484,6 @@ void test_offsetview_subview() { sum); ASSERT_EQ(sum, 6 * (e0 - b0) * (e1 - b1)); -#endif #endif } @@ -701,12 +686,9 @@ void test_offsetview_offsets_rank3() { } #endif -// FIXME_SYCL needs MDRangePolicy -#ifndef KOKKOS_ENABLE_SYCL TEST(TEST_CATEGORY, offsetview_construction) { test_offsetview_construction(); } -#endif TEST(TEST_CATEGORY, offsetview_unmanaged_construction) { test_offsetview_unmanaged_construction(); diff --git a/packages/kokkos/containers/unit_tests/TestOpenMP_Category.hpp b/packages/kokkos/containers/unit_tests/TestOpenMP_Category.hpp deleted file mode 100644 index a0169d170294..000000000000 --- a/packages/kokkos/containers/unit_tests/TestOpenMP_Category.hpp +++ /dev/null @@ -1,51 +0,0 @@ -/* -//@HEADER -// 
************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_TEST_OPENMP_HPP -#define KOKKOS_TEST_OPENMP_HPP - -#define TEST_CATEGORY openmp -#define TEST_EXECSPACE Kokkos::OpenMP - -#endif diff --git a/packages/kokkos/containers/unit_tests/TestSYCL_Category.hpp b/packages/kokkos/containers/unit_tests/TestSYCL_Category.hpp deleted file mode 100644 index 51fd3fc91118..000000000000 --- a/packages/kokkos/containers/unit_tests/TestSYCL_Category.hpp +++ /dev/null @@ -1,51 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_TEST_SYCL_HPP -#define KOKKOS_TEST_SYCL_HPP - -#define TEST_CATEGORY sycl -#define TEST_EXECSPACE Kokkos::Experimental::SYCL - -#endif diff --git a/packages/kokkos/containers/unit_tests/TestSerial_Category.hpp b/packages/kokkos/containers/unit_tests/TestSerial_Category.hpp deleted file mode 100644 index 2aa09a315ae0..000000000000 --- a/packages/kokkos/containers/unit_tests/TestSerial_Category.hpp +++ /dev/null @@ -1,51 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. 
-// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_TEST_SERIAL_HPP -#define KOKKOS_TEST_SERIAL_HPP - -#define TEST_CATEGORY serial -#define TEST_EXECSPACE Kokkos::Serial - -#endif diff --git a/packages/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp b/packages/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp index 8bb267ce5d97..a9a178f95e7b 100644 --- a/packages/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp +++ b/packages/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp @@ -285,10 +285,7 @@ void run_test_graph4() { TEST(TEST_CATEGORY, staticcrsgraph) { TestStaticCrsGraph::run_test_graph(); - // FIXME_SYCL requires MDRangePolicy -#ifndef KOKKOS_ENABLE_SYCL TestStaticCrsGraph::run_test_graph2(); -#endif TestStaticCrsGraph::run_test_graph3(1, 0); TestStaticCrsGraph::run_test_graph3(1, 1000); TestStaticCrsGraph::run_test_graph3(1, 10000); diff --git a/packages/kokkos/containers/unit_tests/TestThreads_Category.hpp b/packages/kokkos/containers/unit_tests/TestThreads_Category.hpp deleted file mode 100644 index 74a2b0da362e..000000000000 --- a/packages/kokkos/containers/unit_tests/TestThreads_Category.hpp +++ /dev/null @@ -1,51 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. 
Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_TEST_THREADS_HPP -#define KOKKOS_TEST_THREADS_HPP - -#define TEST_CATEGORY threads -#define TEST_EXECSPACE Kokkos::Threads - -#endif diff --git a/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp b/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp index d39e0061c747..4413cfbc80e3 100644 --- a/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp +++ b/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp @@ -163,7 +163,8 @@ struct TestFind { KOKKOS_INLINE_FUNCTION void operator()(typename execution_space::size_type i, value_type &errors) const { - const bool expect_to_find_i = (i < m_max_key); + const bool expect_to_find_i = + (i < typename execution_space::size_type(m_max_key)); const bool exists = m_map.exists(i); @@ -293,10 +294,11 @@ void test_deep_copy(uint32_t num_nodes) { } } -// FIXME_HIP wrong result in CI but works locally -#ifndef KOKKOS_ENABLE_HIP +// FIXME_SYCL wrong results on Nvidia GPUs but correct on Host and Intel GPUs +// FIXME_HIP // WORKAROUND MSVC -#ifndef _WIN32 +#if !(defined(KOKKOS_ENABLE_HIP) && (HIP_VERSION < 401)) && \ + !defined(_WIN32) && !defined(KOKKOS_ENABLE_SYCL) TEST(TEST_CATEGORY, UnorderedMap_insert) { for (int i = 0; i < 500; ++i) { test_insert(100000, 90000, 100, true); @@ -304,7 +306,6 @@ TEST(TEST_CATEGORY, UnorderedMap_insert) { } } #endif -#endif TEST(TEST_CATEGORY, UnorderedMap_failed_insert) { for (int i = 0; i < 1000; ++i) test_failed_insert(10000); diff --git a/packages/kokkos/core/perf_test/CMakeLists.txt b/packages/kokkos/core/perf_test/CMakeLists.txt index b7b817c91097..9ff4b6006da8 100644 --- a/packages/kokkos/core/perf_test/CMakeLists.txt +++ b/packages/kokkos/core/perf_test/CMakeLists.txt @@ -9,6 +9,14 @@ # that in TriBITS KokkosAlgorithms can be disabled... 
#INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/../../algorithms/src") +# FIXME_OPENMPTARGET - the NVIDIA HPC compiler nvc++ in the OpenMPTarget backend does not pass the perf_tests. +IF (KOKKOS_ENABLE_OPENMPTARGET + AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI + OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)) + RETURN() +ENDIF() + + SET(SOURCES PerfTestMain.cpp PerfTestGramSchmidt.cpp @@ -68,8 +76,7 @@ KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) # This test currently times out for MSVC -# FIXME_SYCL these tests don't compile yet (require parallel_for). -IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC" AND NOT Kokkos_ENABLE_SYCL) +IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") KOKKOS_ADD_EXECUTABLE_AND_TEST( PerfTestExec SOURCES ${SOURCES} @@ -77,13 +84,11 @@ IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC" AND NOT Kokkos_ENABLE_SYCL) ) ENDIF() -# FIXME_SYCL -IF(NOT Kokkos_ENABLE_SYCL) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - PerformanceTest_Atomic - SOURCES test_atomic.cpp - CATEGORIES PERFORMANCE - ) +KOKKOS_ADD_EXECUTABLE_AND_TEST( + PerformanceTest_Atomic + SOURCES test_atomic.cpp + CATEGORIES PERFORMANCE +) IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA) KOKKOS_ADD_EXECUTABLE_AND_TEST( @@ -98,7 +103,6 @@ KOKKOS_ADD_EXECUTABLE_AND_TEST( SOURCES test_mempool.cpp CATEGORIES PERFORMANCE ) -ENDIF() IF(NOT Kokkos_ENABLE_OPENMPTARGET) # FIXME OPENMPTARGET needs tasking diff --git a/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp b/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp index 70186283c1a7..dee21fd7a575 100644 --- a/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp +++ b/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp @@ -69,7 +69,7 @@ struct InvNorm2 : public Kokkos::DotSingle { KOKKOS_INLINE_FUNCTION void final(value_type& result) const { - result = std::sqrt(result); + result = Kokkos::Experimental::sqrt(result); Rjj() = result; inv() = 
(0 < result) ? 1.0 / result : 0; } @@ -145,7 +145,7 @@ struct ModifiedGramSchmidt { // Q(:,j) *= ( 1 / R(j,j) ); => Q(:,j) *= tmp ; Kokkos::scale(tmp, Qj); - for (size_t k = j + 1; k < count; ++k) { + for (size_type k = j + 1; k < count; ++k) { const vector_type Qk = Kokkos::subview(Q_, Kokkos::ALL(), k); const value_view Rjk = Kokkos::subview(R_, j, k); @@ -165,7 +165,7 @@ struct ModifiedGramSchmidt { //-------------------------------------------------------------------------- - static double test(const size_t length, const size_t count, + static double test(const size_type length, const size_type count, const size_t iter = 1) { multivector_type Q_("Q", length, count); multivector_type R_("R", count, count); diff --git a/packages/kokkos/core/src/CMakeLists.txt b/packages/kokkos/core/src/CMakeLists.txt index e0590a78a4bc..2ab098980572 100644 --- a/packages/kokkos/core/src/CMakeLists.txt +++ b/packages/kokkos/core/src/CMakeLists.txt @@ -72,8 +72,6 @@ KOKKOS_ADD_LIBRARY( ADD_BUILD_OPTIONS # core should be given all the necessary compiler/linker flags ) -SET_TARGET_PROPERTIES(kokkoscore PROPERTIES VERSION ${Kokkos_VERSION}) - KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscore ${KOKKOS_TOP_BUILD_DIR} ${CMAKE_CURRENT_BINARY_DIR} @@ -87,3 +85,4 @@ KOKKOS_LINK_TPL(kokkoscore PUBLIC HPX) KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBDL) KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBRT) KOKKOS_LINK_TPL(kokkoscore PUBLIC PTHREAD) +KOKKOS_LINK_TPL(kokkoscore PUBLIC ROCM) diff --git a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp index 4a30c914f080..916f109758de 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -45,6 +45,10 @@ #include #ifdef KOKKOS_ENABLE_CUDA +#include +#include +#include + #include #include #include @@ -52,10 +56,6 @@ #include #include -#include -#include -#include - //#include #include #include @@ -65,6 +65,22 @@ 
/*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/ +cudaStream_t Kokkos::Impl::cuda_get_deep_copy_stream() { + static cudaStream_t s = nullptr; + if (s == nullptr) { + cudaStreamCreate(&s); + } + return s; +} + +const std::unique_ptr &Kokkos::Impl::cuda_get_deep_copy_space( + bool initialize) { + static std::unique_ptr space = nullptr; + if (!space && initialize) + space = std::make_unique(Kokkos::Impl::cuda_get_deep_copy_stream()); + return space; +} + namespace Kokkos { namespace Impl { @@ -72,13 +88,6 @@ namespace { static std::atomic num_uvm_allocations(0); -cudaStream_t get_deep_copy_stream() { - static cudaStream_t s = nullptr; - if (s == nullptr) { - cudaStreamCreate(&s); - } - return s; -} } // namespace DeepCopy::DeepCopy(void *dst, const void *src, @@ -115,7 +124,7 @@ DeepCopy::DeepCopy(const Cuda &instance, void *dst, } void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) { - cudaStream_t s = get_deep_copy_stream(); + cudaStream_t s = cuda_get_deep_copy_stream(); CUDA_SAFE_CALL(cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, s)); cudaStreamSynchronize(s); } @@ -128,14 +137,14 @@ void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) { namespace Kokkos { -void CudaSpace::access_error() { +KOKKOS_DEPRECATED void CudaSpace::access_error() { const std::string msg( "Kokkos::CudaSpace::access_error attempt to execute Cuda function from " "non-Cuda space"); Kokkos::Impl::throw_runtime_exception(msg); } -void CudaSpace::access_error(const void *const) { +KOKKOS_DEPRECATED void CudaSpace::access_error(const void *const) { const std::string msg( "Kokkos::CudaSpace::access_error attempt to execute Cuda function from " "non-Cuda space"); @@ -459,79 +468,6 @@ SharedAllocationRecord::attach_texture_object( return tex_obj; } -//============================================================================== -// {{{1 - -std::string 
SharedAllocationRecord::get_label() const { - SharedAllocationHeader header; - - Kokkos::Impl::DeepCopy( - &header, RecordBase::head(), sizeof(SharedAllocationHeader)); - - return std::string(header.m_label); -} - -std::string SharedAllocationRecord::get_label() - const { - return std::string(RecordBase::head()->m_label); -} - -std::string -SharedAllocationRecord::get_label() const { - return std::string(RecordBase::head()->m_label); -} - -// end SharedAllocationRecord::get_label() }}}1 -//============================================================================== - -//============================================================================== -// {{{1 - -SharedAllocationRecord - *SharedAllocationRecord::allocate( - const Kokkos::CudaSpace &arg_space, const std::string &arg_label, - const size_t arg_alloc_size) { - return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size); -} - -SharedAllocationRecord - *SharedAllocationRecord::allocate( - const Kokkos::CudaUVMSpace &arg_space, const std::string &arg_label, - const size_t arg_alloc_size) { - return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size); -} - -SharedAllocationRecord - *SharedAllocationRecord::allocate( - const Kokkos::CudaHostPinnedSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size) { - return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size); -} - -// end SharedAllocationRecord allocate() }}}1 -//============================================================================== - -//============================================================================== -// {{{1 - -void SharedAllocationRecord::deallocate( - SharedAllocationRecord *arg_rec) { - delete static_cast(arg_rec); -} - -void SharedAllocationRecord::deallocate( - SharedAllocationRecord *arg_rec) { - delete static_cast(arg_rec); -} - -void SharedAllocationRecord::deallocate( - SharedAllocationRecord *arg_rec) { - delete static_cast(arg_rec); -} - -// end 
SharedAllocationRecord deallocate }}}1 -//============================================================================== - //============================================================================== // {{{1 @@ -580,7 +516,7 @@ SharedAllocationRecord::SharedAllocationRecord( const SharedAllocationRecord::function_type arg_dealloc) // Pass through allocated [ SharedAllocationHeader , user_memory ] // Pass through deallocation function - : SharedAllocationRecord( + : base_t( #ifdef KOKKOS_ENABLE_DEBUG &SharedAllocationRecord::s_root_record, #endif @@ -592,13 +528,7 @@ SharedAllocationRecord::SharedAllocationRecord( SharedAllocationHeader header; - // Fill in the Header information - header.m_record = static_cast *>(this); - - strncpy(header.m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length); - // Set last element zero, in case c_str is too long - header.m_label[SharedAllocationHeader::maximum_label_length - 1] = (char)0; + this->base_t::_fill_host_accessible_header_info(header, arg_label); // Copy to device memory Kokkos::Impl::DeepCopy(RecordBase::m_alloc_ptr, &header, @@ -611,7 +541,7 @@ SharedAllocationRecord::SharedAllocationRecord( const SharedAllocationRecord::function_type arg_dealloc) // Pass through allocated [ SharedAllocationHeader , user_memory ] // Pass through deallocation function - : SharedAllocationRecord( + : base_t( #ifdef KOKKOS_ENABLE_DEBUG &SharedAllocationRecord::s_root_record, #endif @@ -620,16 +550,8 @@ SharedAllocationRecord::SharedAllocationRecord( sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc), m_tex_obj(0), m_space(arg_space) { - // Fill in the Header information, directly accessible via UVM - - RecordBase::m_alloc_ptr->m_record = this; - - strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length); - - // Set last element zero, in case c_str is too long - RecordBase::m_alloc_ptr - ->m_label[SharedAllocationHeader::maximum_label_length - 1] = 
(char)0; + this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, + arg_label); } SharedAllocationRecord:: @@ -639,7 +561,7 @@ SharedAllocationRecord:: const SharedAllocationRecord::function_type arg_dealloc) // Pass through allocated [ SharedAllocationHeader , user_memory ] // Pass through deallocation function - : SharedAllocationRecord( + : base_t( #ifdef KOKKOS_ENABLE_DEBUG &SharedAllocationRecord::s_root_record, @@ -648,319 +570,13 @@ SharedAllocationRecord:: arg_alloc_size), sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc), m_space(arg_space) { - // Fill in the Header information, directly accessible on the host - - RecordBase::m_alloc_ptr->m_record = this; - - strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length); - // Set last element zero, in case c_str is too long - RecordBase::m_alloc_ptr - ->m_label[SharedAllocationHeader::maximum_label_length - 1] = (char)0; + this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, + arg_label); } // end SharedAllocationRecord constructors }}}1 //============================================================================== -//============================================================================== -// {{{1 - -void *SharedAllocationRecord::allocate_tracked( - const Kokkos::CudaSpace &arg_space, const std::string &arg_alloc_label, - const size_t arg_alloc_size) { - if (!arg_alloc_size) return nullptr; - - SharedAllocationRecord *const r = - allocate(arg_space, arg_alloc_label, arg_alloc_size); - - RecordBase::increment(r); - - return r->data(); -} - -void SharedAllocationRecord::deallocate_tracked( - void *const arg_alloc_ptr) { - if (arg_alloc_ptr != nullptr) { - SharedAllocationRecord *const r = get_record(arg_alloc_ptr); - - RecordBase::decrement(r); - } -} - -void *SharedAllocationRecord::reallocate_tracked( - void *const arg_alloc_ptr, const size_t arg_alloc_size) { - SharedAllocationRecord *const r_old = 
get_record(arg_alloc_ptr); - SharedAllocationRecord *const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - Kokkos::Impl::DeepCopy( - r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); -} - -void *SharedAllocationRecord::allocate_tracked( - const Kokkos::CudaUVMSpace &arg_space, const std::string &arg_alloc_label, - const size_t arg_alloc_size) { - if (!arg_alloc_size) return nullptr; - - SharedAllocationRecord *const r = - allocate(arg_space, arg_alloc_label, arg_alloc_size); - - RecordBase::increment(r); - - return r->data(); -} - -void SharedAllocationRecord::deallocate_tracked( - void *const arg_alloc_ptr) { - if (arg_alloc_ptr != nullptr) { - SharedAllocationRecord *const r = get_record(arg_alloc_ptr); - - RecordBase::decrement(r); - } -} - -void *SharedAllocationRecord::reallocate_tracked( - void *const arg_alloc_ptr, const size_t arg_alloc_size) { - SharedAllocationRecord *const r_old = get_record(arg_alloc_ptr); - SharedAllocationRecord *const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - Kokkos::Impl::DeepCopy( - r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); -} - -void * -SharedAllocationRecord::allocate_tracked( - const Kokkos::CudaHostPinnedSpace &arg_space, - const std::string &arg_alloc_label, const size_t arg_alloc_size) { - if (!arg_alloc_size) return nullptr; - - SharedAllocationRecord *const r = - allocate(arg_space, arg_alloc_label, arg_alloc_size); - - RecordBase::increment(r); - - return r->data(); -} - -void SharedAllocationRecord::deallocate_tracked(void *const - arg_alloc_ptr) { - if (arg_alloc_ptr != nullptr) { - SharedAllocationRecord *const r = get_record(arg_alloc_ptr); - - RecordBase::decrement(r); - } -} - -void * 
-SharedAllocationRecord::reallocate_tracked( - void *const arg_alloc_ptr, const size_t arg_alloc_size) { - SharedAllocationRecord *const r_old = get_record(arg_alloc_ptr); - SharedAllocationRecord *const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - Kokkos::Impl::DeepCopy( - r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); -} - -// end SharedAllocationRecored::(re|de|)allocate_tracked }}}1 -//============================================================================== - -//============================================================================== -// {{{1 - -SharedAllocationRecord * -SharedAllocationRecord::get_record(void *alloc_ptr) { - using RecordCuda = SharedAllocationRecord; - - using Header = SharedAllocationHeader; - - // Copy the header from the allocation - Header head; - - Header const *const head_cuda = - alloc_ptr ? Header::get_header(alloc_ptr) : nullptr; - - if (alloc_ptr) { - Kokkos::Impl::DeepCopy( - &head, head_cuda, sizeof(SharedAllocationHeader)); - } - - RecordCuda *const record = - alloc_ptr ? static_cast(head.m_record) : nullptr; - - if (!alloc_ptr || record->m_alloc_ptr != head_cuda) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , " - "void >::get_record ERROR")); - } - - return record; -} - -SharedAllocationRecord *SharedAllocationRecord< - Kokkos::CudaUVMSpace, void>::get_record(void *alloc_ptr) { - using Header = SharedAllocationHeader; - using RecordCuda = SharedAllocationRecord; - - Header *const h = - alloc_ptr ? reinterpret_cast
(alloc_ptr) - 1 : nullptr; - - if (!alloc_ptr || h->m_record->m_alloc_ptr != h) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::SharedAllocationRecord< " - "Kokkos::CudaUVMSpace , void >::get_record ERROR")); - } - - return static_cast(h->m_record); -} - -SharedAllocationRecord - *SharedAllocationRecord::get_record( - void *alloc_ptr) { - using Header = SharedAllocationHeader; - using RecordCuda = SharedAllocationRecord; - - Header *const h = - alloc_ptr ? reinterpret_cast
(alloc_ptr) - 1 : nullptr; - - if (!alloc_ptr || h->m_record->m_alloc_ptr != h) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::SharedAllocationRecord< " - "Kokkos::CudaHostPinnedSpace , void >::get_record ERROR")); - } - - return static_cast(h->m_record); -} - -// end SharedAllocationRecord::get_record() }}}1 -//============================================================================== - -//============================================================================== -// {{{1 - -// Iterate records to print orphaned memory ... -void SharedAllocationRecord::print_records( - std::ostream &s, const Kokkos::CudaSpace &, bool detail) { - (void)s; - (void)detail; -#ifdef KOKKOS_ENABLE_DEBUG - SharedAllocationRecord *r = &s_root_record; - - char buffer[256]; - - SharedAllocationHeader head; - - if (detail) { - do { - if (r->m_alloc_ptr) { - Kokkos::Impl::DeepCopy( - &head, r->m_alloc_ptr, sizeof(SharedAllocationHeader)); - } else { - head.m_label[0] = 0; - } - - // Formatting dependent on sizeof(uintptr_t) - const char *format_string; - - if (sizeof(uintptr_t) == sizeof(unsigned long)) { - format_string = - "Cuda addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx " - "+ %.8ld ] count(%d) dealloc(0x%.12lx) %s\n"; - } else if (sizeof(uintptr_t) == sizeof(unsigned long long)) { - format_string = - "Cuda addr( 0x%.12llx ) list( 0x%.12llx 0x%.12llx ) extent[ " - "0x%.12llx + %.8ld ] count(%d) dealloc(0x%.12llx) %s\n"; - } - - snprintf(buffer, 256, format_string, reinterpret_cast(r), - reinterpret_cast(r->m_prev), - reinterpret_cast(r->m_next), - reinterpret_cast(r->m_alloc_ptr), r->m_alloc_size, - r->m_count, reinterpret_cast(r->m_dealloc), - head.m_label); - s << buffer; - r = r->m_next; - } while (r != &s_root_record); - } else { - do { - if (r->m_alloc_ptr) { - Kokkos::Impl::DeepCopy( - &head, r->m_alloc_ptr, sizeof(SharedAllocationHeader)); - - // Formatting dependent on sizeof(uintptr_t) - const char *format_string; - - if 
(sizeof(uintptr_t) == sizeof(unsigned long)) { - format_string = "Cuda [ 0x%.12lx + %ld ] %s\n"; - } else if (sizeof(uintptr_t) == sizeof(unsigned long long)) { - format_string = "Cuda [ 0x%.12llx + %ld ] %s\n"; - } - - snprintf(buffer, 256, format_string, - reinterpret_cast(r->data()), r->size(), - head.m_label); - } else { - snprintf(buffer, 256, "Cuda [ 0 + 0 ]\n"); - } - s << buffer; - r = r->m_next; - } while (r != &s_root_record); - } -#else - Kokkos::Impl::throw_runtime_exception( - "SharedAllocationHeader::print_records only works with " - "KOKKOS_ENABLE_DEBUG enabled"); -#endif -} - -void SharedAllocationRecord::print_records( - std::ostream &s, const Kokkos::CudaUVMSpace &, bool detail) { - (void)s; - (void)detail; -#ifdef KOKKOS_ENABLE_DEBUG - SharedAllocationRecord::print_host_accessible_records( - s, "CudaUVM", &s_root_record, detail); -#else - Kokkos::Impl::throw_runtime_exception( - "SharedAllocationHeader::print_records only works with " - "KOKKOS_ENABLE_DEBUG enabled"); -#endif -} - -void SharedAllocationRecord::print_records( - std::ostream &s, const Kokkos::CudaHostPinnedSpace &, bool detail) { - (void)s; - (void)detail; -#ifdef KOKKOS_ENABLE_DEBUG - SharedAllocationRecord::print_host_accessible_records( - s, "CudaHostPinned", &s_root_record, detail); -#else - Kokkos::Impl::throw_runtime_exception( - "SharedAllocationHeader::print_records only works with " - "KOKKOS_ENABLE_DEBUG enabled"); -#endif -} - -// end SharedAllocationRecord::print_records() }}}1 -//============================================================================== - void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes, bool to_device) { if ((ptr == nullptr) || (bytes == 0)) return; @@ -984,6 +600,29 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes, } // namespace Impl } // namespace Kokkos + +//============================================================================== +// {{{1 + +#include + +namespace Kokkos { 
+namespace Impl { + +// To avoid additional compilation cost for something that's (mostly?) not +// performance sensitive, we explicity instantiate these CRTP base classes here, +// where we have access to the associated *_timpl.hpp header files. +template class SharedAllocationRecordCommon; +template class HostInaccessibleSharedAllocationRecordCommon; +template class SharedAllocationRecordCommon; +template class SharedAllocationRecordCommon; + +} // end namespace Impl +} // end namespace Kokkos + +// end Explicit instantiations of CRTP Base classes }}}1 +//============================================================================== + #else void KOKKOS_CORE_SRC_CUDA_CUDASPACE_PREVENT_LINK_ERROR() {} #endif // KOKKOS_ENABLE_CUDA diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp index 0d6d3bdb3ac5..0f4259072d97 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp @@ -140,7 +140,7 @@ inline int cuda_deduce_block_size(bool early_termination, } } - if (early_termination && blocks_per_sm != 0) break; + if (early_termination && opt_block_size != 0) break; } return opt_block_size; @@ -222,7 +222,8 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) { case 52: case 61: return 96; case 70: - case 80: return 8; + case 80: + case 86: return 8; case 75: return 32; default: Kokkos::Impl::throw_runtime_exception( diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp index a9a62380e5a4..ec9c434fe663 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp @@ -175,30 +175,42 @@ class half_t { return cast_from_half(*this); } + /** + * Conversion constructors. 
+ * + * Support implicit conversions from impl_type, float, double -> half_t + * Mixed precision expressions require upcasting which is done in the + * "// Binary Arithmetic" operator overloads below. + * + * Support implicit conversions from integral types -> half_t. + * Expressions involving half_t with integral types require downcasting + * the integral types to half_t. Existing operator overloads can handle this + * with the addition of the below implicit conversion constructors. + */ KOKKOS_FUNCTION half_t(impl_type rhs) : val(rhs) {} KOKKOS_FUNCTION - explicit half_t(float rhs) : val(cast_to_half(rhs).val) {} + half_t(float rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION - explicit half_t(bool rhs) : val(cast_to_half(rhs).val) {} + half_t(double rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION - explicit half_t(double rhs) : val(cast_to_half(rhs).val) {} + explicit half_t(bool rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION - explicit half_t(short rhs) : val(cast_to_half(rhs).val) {} + half_t(short rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION - explicit half_t(int rhs) : val(cast_to_half(rhs).val) {} + half_t(int rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION - explicit half_t(long rhs) : val(cast_to_half(rhs).val) {} + half_t(long rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION - explicit half_t(long long rhs) : val(cast_to_half(rhs).val) {} + half_t(long long rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION - explicit half_t(unsigned short rhs) : val(cast_to_half(rhs).val) {} + half_t(unsigned short rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION - explicit half_t(unsigned int rhs) : val(cast_to_half(rhs).val) {} + half_t(unsigned int rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION - explicit half_t(unsigned long rhs) : val(cast_to_half(rhs).val) {} + half_t(unsigned long rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION - explicit half_t(unsigned long long rhs) : val(cast_to_half(rhs).val) {} + 
half_t(unsigned long long rhs) : val(cast_to_half(rhs).val) {} // Unary operators KOKKOS_FUNCTION @@ -243,7 +255,7 @@ class half_t { #else float tmp = __half2float(val); --tmp; - val = __float2half(tmp); + val = __float2half(tmp); #endif return *this; } @@ -276,88 +288,317 @@ class half_t { return *this; } + template + KOKKOS_FUNCTION void operator=(T rhs) volatile { + val = cast_to_half(rhs).val; + } + // Compound operators KOKKOS_FUNCTION half_t& operator+=(half_t rhs) { #ifdef __CUDA_ARCH__ val += rhs.val; #else - val = __float2half(__half2float(val) + __half2float(rhs.val)); + val = __float2half(__half2float(val) + __half2float(rhs.val)); #endif return *this; } + KOKKOS_FUNCTION + volatile half_t& operator+=(half_t rhs) volatile { +#ifdef __CUDA_ARCH__ + // Cuda 10 supports __half volatile stores but not volatile arithmetic + // operands. Cast away volatile-ness of val for arithmetic but not for store + // location. + val = const_cast(val) + rhs.val; +#else + // Use non-volatile val_ref to suppress: + // "warning: implicit dereference will not access object of type ‘volatile + // __half’ in statement" + auto val_ref = const_cast(val); + val_ref = __float2half(__half2float(const_cast(val)) + + __half2float(rhs.val)); +#endif + return *this; + } + + // Compund operators: upcast overloads for += + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator+=(T& lhs, half_t rhs) { + lhs += static_cast(rhs); + return lhs; + } + + KOKKOS_FUNCTION + half_t& operator+=(float rhs) { + float result = static_cast(val) + rhs; + val = static_cast(result); + return *this; + } + + KOKKOS_FUNCTION + half_t& operator+=(double rhs) { + double result = static_cast(val) + rhs; + val = static_cast(result); + return *this; + } + KOKKOS_FUNCTION half_t& operator-=(half_t rhs) { #ifdef __CUDA_ARCH__ val -= rhs.val; #else - val = __float2half(__half2float(val) - __half2float(rhs.val)); + val = __float2half(__half2float(val) - 
__half2float(rhs.val)); #endif return *this; } + KOKKOS_FUNCTION + volatile half_t& operator-=(half_t rhs) volatile { +#ifdef __CUDA_ARCH__ + // Cuda 10 supports __half volatile stores but not volatile arithmetic + // operands. Cast away volatile-ness of val for arithmetic but not for store + // location. + val = const_cast(val) - rhs.val; +#else + // Use non-volatile val_ref to suppress: + // "warning: implicit dereference will not access object of type ‘volatile + // __half’ in statement" + auto val_ref = const_cast(val); + val_ref = __float2half(__half2float(const_cast(val)) - + __half2float(rhs.val)); +#endif + return *this; + } + + // Compund operators: upcast overloads for -= + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator-=(T& lhs, half_t rhs) { + lhs -= static_cast(rhs); + return lhs; + } + + KOKKOS_FUNCTION + half_t& operator-=(float rhs) { + float result = static_cast(val) - rhs; + val = static_cast(result); + return *this; + } + + KOKKOS_FUNCTION + half_t& operator-=(double rhs) { + double result = static_cast(val) - rhs; + val = static_cast(result); + return *this; + } + KOKKOS_FUNCTION half_t& operator*=(half_t rhs) { #ifdef __CUDA_ARCH__ val *= rhs.val; #else - val = __float2half(__half2float(val) * __half2float(rhs.val)); + val = __float2half(__half2float(val) * __half2float(rhs.val)); #endif return *this; } + KOKKOS_FUNCTION + volatile half_t& operator*=(half_t rhs) volatile { +#ifdef __CUDA_ARCH__ + // Cuda 10 supports __half volatile stores but not volatile arithmetic + // operands. Cast away volatile-ness of val for arithmetic but not for store + // location. 
+ val = const_cast(val) * rhs.val; +#else + // Use non-volatile val_ref to suppress: + // "warning: implicit dereference will not access object of type ‘volatile + // __half’ in statement" + auto val_ref = const_cast(val); + val_ref = __float2half(__half2float(const_cast(val)) * + __half2float(rhs.val)); +#endif + return *this; + } + + // Compund operators: upcast overloads for *= + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator*=(T& lhs, half_t rhs) { + lhs *= static_cast(rhs); + return lhs; + } + + KOKKOS_FUNCTION + half_t& operator*=(float rhs) { + float result = static_cast(val) * rhs; + val = static_cast(result); + return *this; + } + + KOKKOS_FUNCTION + half_t& operator*=(double rhs) { + double result = static_cast(val) * rhs; + val = static_cast(result); + return *this; + } + KOKKOS_FUNCTION half_t& operator/=(half_t rhs) { #ifdef __CUDA_ARCH__ val /= rhs.val; #else - val = __float2half(__half2float(val) / __half2float(rhs.val)); + val = __float2half(__half2float(val) / __half2float(rhs.val)); #endif return *this; } + KOKKOS_FUNCTION + volatile half_t& operator/=(half_t rhs) volatile { +#ifdef __CUDA_ARCH__ + // Cuda 10 supports __half volatile stores but not volatile arithmetic + // operands. Cast away volatile-ness of val for arithmetic but not for store + // location. 
+ val = const_cast(val) / rhs.val; +#else + // Use non-volatile val_ref to suppress: + // "warning: implicit dereference will not access object of type ‘volatile + // __half’ in statement" + auto val_ref = const_cast(val); + val_ref = __float2half(__half2float(const_cast(val)) / + __half2float(rhs.val)); +#endif + return *this; + } + + // Compund operators: upcast overloads for /= + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator/=(T& lhs, half_t rhs) { + lhs /= static_cast(rhs); + return lhs; + } + + KOKKOS_FUNCTION + half_t& operator/=(float rhs) { + float result = static_cast(val) / rhs; + val = static_cast(result); + return *this; + } + + KOKKOS_FUNCTION + half_t& operator/=(double rhs) { + double result = static_cast(val) / rhs; + val = static_cast(result); + return *this; + } + // Binary Arithmetic KOKKOS_FUNCTION half_t friend operator+(half_t lhs, half_t rhs) { #ifdef __CUDA_ARCH__ lhs.val += rhs.val; #else - lhs.val = __float2half(__half2float(lhs.val) + __half2float(rhs.val)); + lhs.val = __float2half(__half2float(lhs.val) + __half2float(rhs.val)); #endif return lhs; } + // Binary Arithmetic upcast operators for + + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator+(half_t lhs, T rhs) { + return T(lhs) + rhs; + } + + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator+(T lhs, half_t rhs) { + return lhs + T(rhs); + } + KOKKOS_FUNCTION half_t friend operator-(half_t lhs, half_t rhs) { #ifdef __CUDA_ARCH__ lhs.val -= rhs.val; #else - lhs.val = __float2half(__half2float(lhs.val) - __half2float(rhs.val)); + lhs.val = __float2half(__half2float(lhs.val) - __half2float(rhs.val)); #endif return lhs; } + // Binary Arithmetic upcast operators for - + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator-(half_t lhs, T rhs) { + 
return T(lhs) - rhs; + } + + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator-(T lhs, half_t rhs) { + return lhs - T(rhs); + } + KOKKOS_FUNCTION half_t friend operator*(half_t lhs, half_t rhs) { #ifdef __CUDA_ARCH__ lhs.val *= rhs.val; #else - lhs.val = __float2half(__half2float(lhs.val) * __half2float(rhs.val)); + lhs.val = __float2half(__half2float(lhs.val) * __half2float(rhs.val)); #endif return lhs; } + // Binary Arithmetic upcast operators for * + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator*(half_t lhs, T rhs) { + return T(lhs) * rhs; + } + + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator*(T lhs, half_t rhs) { + return lhs * T(rhs); + } + KOKKOS_FUNCTION half_t friend operator/(half_t lhs, half_t rhs) { #ifdef __CUDA_ARCH__ lhs.val /= rhs.val; #else - lhs.val = __float2half(__half2float(lhs.val) / __half2float(rhs.val)); + lhs.val = __float2half(__half2float(lhs.val) / __half2float(rhs.val)); #endif return lhs; } + // Binary Arithmetic upcast operators for / + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator/(half_t lhs, T rhs) { + return T(lhs) / rhs; + } + + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator/(T lhs, half_t rhs) { + return lhs / T(rhs); + } + // Logical operators KOKKOS_FUNCTION bool operator!() const { diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp index b8e816345873..016cb6cdcbdd 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -54,6 +54,7 @@ #include #include #include +#include #include #include @@ -248,11 +249,11 @@ void CudaInternal::print_configuration(std::ostream 
&s) const { const CudaInternalDevices &dev_info = CudaInternalDevices::singleton(); #if defined(KOKKOS_ENABLE_CUDA) - s << "macro KOKKOS_ENABLE_CUDA : defined" << std::endl; + s << "macro KOKKOS_ENABLE_CUDA : defined\n"; #endif #if defined(CUDA_VERSION) s << "macro CUDA_VERSION = " << CUDA_VERSION << " = version " - << CUDA_VERSION / 1000 << "." << (CUDA_VERSION % 1000) / 10 << std::endl; + << CUDA_VERSION / 1000 << "." << (CUDA_VERSION % 1000) / 10 << '\n'; #endif for (int i = 0; i < dev_info.m_cudaDevCount; ++i) { @@ -274,7 +275,6 @@ CudaInternal::~CudaInternal() { m_scratchConcurrentBitset) { std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()" << std::endl; - std::cerr.flush(); } m_cudaDev = -1; @@ -358,8 +358,7 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) { if (m_cudaArch == 0) { std::stringstream ss; - ss << "Kokkos::Cuda::initialize ERROR: likely mismatch of architecture" - << std::endl; + ss << "Kokkos::Cuda::initialize ERROR: likely mismatch of architecture\n"; std::string msg = ss.str(); Kokkos::abort(msg.c_str()); } @@ -373,7 +372,7 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) { "compute capability " << compiled_major << "." << compiled_minor << " on device with compute capability " << cudaProp.major << "." - << cudaProp.minor << " is not supported by CUDA!" 
<< std::endl; + << cudaProp.minor << " is not supported by CUDA!\n"; std::string msg = ss.str(); Kokkos::abort(msg.c_str()); } @@ -458,7 +457,7 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) { Kokkos::Impl::SharedAllocationRecord; Record *const r = - Record::allocate(Kokkos::CudaSpace(), "InternalScratchBitset", + Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchBitset", sizeof(uint32_t) * buffer_bound); Record::increment(r); @@ -492,17 +491,11 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) { #ifdef KOKKOS_ENABLE_CUDA_UVM if (Kokkos::show_warnings() && !cuda_launch_blocking()) { - std::cerr << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into " - "UVMSpace by default" - << std::endl; - std::cerr << " without setting " - "CUDA_LAUNCH_BLOCKING=1." - << std::endl; - std::cerr << " The code must call " - "Cuda().fence() after each kernel" - << std::endl; - std::cerr << " or will likely crash when " - "accessing data on the host." + std::cerr << R"warning( +Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default + without setting CUDA_LAUNCH_BLOCKING=1. + The code must call Cuda().fence() after each kernel + or will likely crash when accessing data on the host.)warning" << std::endl; } @@ -520,19 +513,13 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) { if (Kokkos::show_warnings() && (!visible_devices_one && !force_device_alloc)) { - std::cerr << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into " - "UVMSpace by default" + std::cerr << R"warning( +Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default + without setting CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 or + setting CUDA_VISIBLE_DEVICES. 
+ This could on multi GPU systems lead to severe performance" + penalties.)warning" << std::endl; - std::cerr << " without setting " - "CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 or " - << std::endl; - std::cerr - << " setting CUDA_VISIBLE_DEVICES." - << std::endl; - std::cerr << " This could on multi GPU " - "systems lead to severe performance" - << std::endl; - std::cerr << " penalties." << std::endl; } #endif @@ -575,7 +562,7 @@ Cuda::size_type *CudaInternal::scratch_flags(const Cuda::size_type size) const { if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags)); Record *const r = - Record::allocate(Kokkos::CudaSpace(), "InternalScratchFlags", + Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchFlags", (sizeof(ScratchGrain) * m_scratchFlagsCount)); Record::increment(r); @@ -600,7 +587,7 @@ Cuda::size_type *CudaInternal::scratch_space(const Cuda::size_type size) const { if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace)); Record *const r = - Record::allocate(Kokkos::CudaSpace(), "InternalScratchSpace", + Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchSpace", (sizeof(ScratchGrain) * m_scratchSpaceCount)); Record::increment(r); @@ -624,7 +611,7 @@ Cuda::size_type *CudaInternal::scratch_unified( Record::decrement(Record::get_record(m_scratchUnified)); Record *const r = Record::allocate( - Kokkos::CudaHostPinnedSpace(), "InternalScratchUnified", + Kokkos::CudaHostPinnedSpace(), "Kokkos::InternalScratchUnified", (sizeof(ScratchGrain) * m_scratchUnifiedCount)); Record::increment(r); @@ -646,8 +633,9 @@ Cuda::size_type *CudaInternal::scratch_functor( if (m_scratchFunctor) Record::decrement(Record::get_record(m_scratchFunctor)); - Record *const r = Record::allocate( - Kokkos::CudaSpace(), "InternalScratchFunctor", m_scratchFunctorSize); + Record *const r = + Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchFunctor", + m_scratchFunctorSize); Record::increment(r); @@ -662,7 +650,7 @@ void 
*CudaInternal::resize_team_scratch_space(std::int64_t bytes, if (m_team_scratch_current_size == 0) { m_team_scratch_current_size = bytes; m_team_scratch_ptr = Kokkos::kokkos_malloc( - "CudaSpace::ScratchMemory", m_team_scratch_current_size); + "Kokkos::CudaSpace::TeamScratchMemory", m_team_scratch_current_size); } if ((bytes > m_team_scratch_current_size) || ((bytes < m_team_scratch_current_size) && (force_shrink))) { @@ -676,6 +664,9 @@ void *CudaInternal::resize_team_scratch_space(std::int64_t bytes, //---------------------------------------------------------------------------- void CudaInternal::finalize() { + // skip if finalize() has already been called + if (was_finalized) return; + was_finalized = true; if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) { // Only finalize this if we're the singleton @@ -719,6 +710,11 @@ void CudaInternal::finalize() { if (this == &singleton()) { cudaFreeHost(constantMemHostStaging); cudaEventDestroy(constantMemReusable); + auto &deep_copy_space = + Kokkos::Impl::cuda_get_deep_copy_space(/*initialize*/ false); + if (deep_copy_space) + deep_copy_space->impl_internal_space_instance()->finalize(); + cudaStreamDestroy(cuda_get_deep_copy_stream()); } } @@ -821,62 +817,23 @@ Cuda::size_type Cuda::device_arch() { void Cuda::impl_finalize() { Impl::CudaInternal::singleton().finalize(); } Cuda::Cuda() - : m_space_instance(&Impl::CudaInternal::singleton()), m_counter(nullptr) { + : m_space_instance(&Impl::CudaInternal::singleton(), + [](Impl::CudaInternal *) {}) { Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor"); } Cuda::Cuda(cudaStream_t stream) - : m_space_instance(new Impl::CudaInternal), m_counter(new int(1)) { + : m_space_instance(new Impl::CudaInternal, [](Impl::CudaInternal *ptr) { + ptr->finalize(); + delete ptr; + }) { Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor"); m_space_instance->initialize(Impl::CudaInternal::singleton().m_cudaDev, stream); } 
-KOKKOS_FUNCTION Cuda::Cuda(Cuda &&other) noexcept { - m_space_instance = other.m_space_instance; - other.m_space_instance = nullptr; - m_counter = other.m_counter; - other.m_counter = nullptr; -} - -KOKKOS_FUNCTION Cuda::Cuda(const Cuda &other) - : m_space_instance(other.m_space_instance), m_counter(other.m_counter) { -#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA - if (m_counter) Kokkos::atomic_add(m_counter, 1); -#endif -} - -KOKKOS_FUNCTION Cuda &Cuda::operator=(Cuda &&other) noexcept { - m_space_instance = other.m_space_instance; - other.m_space_instance = nullptr; - m_counter = other.m_counter; - other.m_counter = nullptr; - return *this; -} - -KOKKOS_FUNCTION Cuda &Cuda::operator=(const Cuda &other) { - m_space_instance = other.m_space_instance; - m_counter = other.m_counter; -#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA - if (m_counter) Kokkos::atomic_add(m_counter, 1); -#endif - return *this; -} - -KOKKOS_FUNCTION Cuda::~Cuda() noexcept { -#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA - if (m_counter == nullptr) return; - int const count = Kokkos::atomic_fetch_sub(m_counter, 1); - if (count == 1) { - delete m_counter; - m_space_instance->finalize(); - delete m_space_instance; - } -#endif -} - void Cuda::print_configuration(std::ostream &s, const bool) { Impl::CudaInternal::singleton().print_configuration(s); } @@ -924,54 +881,53 @@ void CudaSpaceInitializer::fence() { Kokkos::Cuda::impl_static_fence(); } void CudaSpaceInitializer::print_configuration(std::ostream &msg, const bool detail) { - msg << "Device Execution Space:" << std::endl; - msg << " KOKKOS_ENABLE_CUDA: "; - msg << "yes" << std::endl; + msg << "Device Execution Space:\n"; + msg << " KOKKOS_ENABLE_CUDA: yes\n"; - msg << "Cuda Atomics:" << std::endl; + msg << "Cuda Atomics:\n"; msg << " KOKKOS_ENABLE_CUDA_ATOMICS: "; #ifdef KOKKOS_ENABLE_CUDA_ATOMICS - msg << "yes" << std::endl; + msg << "yes\n"; #else - msg << "no" << std::endl; + msg << "no\n"; #endif - msg << "Cuda Options:" << 
std::endl; + msg << "Cuda Options:\n"; msg << " KOKKOS_ENABLE_CUDA_LAMBDA: "; #ifdef KOKKOS_ENABLE_CUDA_LAMBDA - msg << "yes" << std::endl; + msg << "yes\n"; #else - msg << "no" << std::endl; + msg << "no\n"; #endif msg << " KOKKOS_ENABLE_CUDA_LDG_INTRINSIC: "; #ifdef KOKKOS_ENABLE_CUDA_LDG_INTRINSIC - msg << "yes" << std::endl; + msg << "yes\n"; #else - msg << "no" << std::endl; + msg << "no\n"; #endif msg << " KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE: "; #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE - msg << "yes" << std::endl; + msg << "yes\n"; #else - msg << "no" << std::endl; + msg << "no\n"; #endif msg << " KOKKOS_ENABLE_CUDA_UVM: "; #ifdef KOKKOS_ENABLE_CUDA_UVM - msg << "yes" << std::endl; + msg << "yes\n"; #else - msg << "no" << std::endl; + msg << "no\n"; #endif msg << " KOKKOS_ENABLE_CUSPARSE: "; #ifdef KOKKOS_ENABLE_CUSPARSE - msg << "yes" << std::endl; + msg << "yes\n"; #else - msg << "no" << std::endl; + msg << "no\n"; #endif msg << " KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: "; #ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA - msg << "yes" << std::endl; + msg << "yes\n"; #else - msg << "no" << std::endl; + msg << "no\n"; #endif msg << "\nCuda Runtime Configuration:" << std::endl; diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp index 13773d70c5a8..aaec2c29260a 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -17,30 +17,24 @@ namespace Kokkos { namespace Impl { struct CudaTraits { - enum : CudaSpace::size_type { WarpSize = 32 /* 0x0020 */ }; - enum : CudaSpace::size_type { - WarpIndexMask = 0x001f /* Mask for warpindex */ - }; - enum : CudaSpace::size_type { - WarpIndexShift = 5 /* WarpSize == 1 << WarpShift */ - }; - - enum : CudaSpace::size_type { - ConstantMemoryUsage = 0x008000 /* 32k bytes */ - }; - enum : CudaSpace::size_type { - ConstantMemoryCache = 0x002000 /* 8k bytes */ - }; - enum : 
CudaSpace::size_type { - KernelArgumentLimit = 0x001000 /* 4k bytes */ - }; - enum : CudaSpace::size_type { - MaxHierarchicalParallelism = 1024 /* team_size * vector_length */ - }; + static constexpr CudaSpace::size_type WarpSize = 32 /* 0x0020 */; + static constexpr CudaSpace::size_type WarpIndexMask = + 0x001f; /* Mask for warpindex */ + static constexpr CudaSpace::size_type WarpIndexShift = + 5; /* WarpSize == 1 << WarpShift */ + + static constexpr CudaSpace::size_type ConstantMemoryUsage = + 0x008000; /* 32k bytes */ + static constexpr CudaSpace::size_type ConstantMemoryCache = + 0x002000; /* 8k bytes */ + static constexpr CudaSpace::size_type KernelArgumentLimit = + 0x001000; /* 4k bytes */ + static constexpr CudaSpace::size_type MaxHierarchicalParallelism = + 1024; /* team_size * vector_length */ using ConstantGlobalBufferType = unsigned long[ConstantMemoryUsage / sizeof(unsigned long)]; - enum { ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */ }; + static constexpr int ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */; KOKKOS_INLINE_FUNCTION static CudaSpace::size_type warp_count( CudaSpace::size_type i) { diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp index 39404e0bf38f..d892a893b330 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -158,6 +158,9 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) { } } +// This function needs to be template on DriverType and LaunchBounds +// so that the static bool is unique for each type combo +// KernelFuncPtr does not necessarily contain that type information. 
template inline void configure_shmem_preference(KernelFuncPtr const& func, bool prefer_shmem) { @@ -355,8 +358,7 @@ struct CudaParallelLaunchKernelInvoker< if (!Impl::is_empty_launch(grid, block)) { Impl::check_shmem_request(cuda_instance, shmem); - Impl::configure_shmem_preference( + Impl::configure_shmem_preference( base_t::get_kernel_func(), prefer_shmem); void const* args[] = {&driver}; @@ -449,8 +451,7 @@ struct CudaParallelLaunchKernelInvoker< if (!Impl::is_empty_launch(grid, block)) { Impl::check_shmem_request(cuda_instance, shmem); - Impl::configure_shmem_preference( + Impl::configure_shmem_preference( base_t::get_kernel_func(), prefer_shmem); auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); @@ -627,9 +628,8 @@ struct CudaParallelLaunchImpl< get_cuda_func_attributes(), block, shmem, prefer_shmem); Impl::configure_shmem_preference< - DriverType, Kokkos::LaunchBounds, - decltype(base_t::get_kernel_func())>(base_t::get_kernel_func(), - prefer_shmem); + DriverType, Kokkos::LaunchBounds>( + base_t::get_kernel_func(), prefer_shmem); KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp new file mode 100644 index 000000000000..12b7f70a9749 --- /dev/null +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp @@ -0,0 +1,37 @@ +#ifndef KOKKOS_CUDA_MDRANGEPOLICY_HPP_ +#define KOKKOS_CUDA_MDRANGEPOLICY_HPP_ + +#include + +namespace Kokkos { + +template <> +struct default_outer_direction { + using type = Iterate; + static constexpr Iterate value = Iterate::Left; +}; + +template <> +struct default_inner_direction { + using type = Iterate; + static constexpr Iterate value = Iterate::Left; +}; + +namespace Impl { + +// Settings for MDRangePolicy +template <> +inline TileSizeProperties get_tile_size_properties( + const Kokkos::Cuda& space) { + TileSizeProperties properties; + properties.max_threads = + 
space.impl_internal_space_instance()->m_maxThreadsPerSM; + properties.default_largest_tile_size = 16; + properties.default_tile_size = 2; + properties.max_total_tile_size = 512; + return properties; +} + +} // Namespace Impl +} // Namespace Kokkos +#endif diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp index 131d18098096..2834e6f3de01 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp @@ -60,6 +60,7 @@ #include #include #include +#include #include #include @@ -67,6 +68,7 @@ #include #include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -474,7 +476,7 @@ class ParallelFor, Kokkos::Cuda> { Policy const& get_policy() const { return m_policy; } - inline __device__ void operator()(void) const { + inline __device__ void operator()() const { const Member work_stride = blockDim.y * gridDim.x; const Member work_end = m_policy.end(); @@ -537,9 +539,23 @@ class ParallelFor, Kokkos::Cuda> { const Policy m_rp; public: + template + static int max_tile_size_product(const Policy& pol, const Functor&) { + cudaFuncAttributes attr = + CudaParallelLaunch::get_cuda_func_attributes(); + auto const& prop = pol.space().cuda_device_prop(); + // Limits due to registers/SM, MDRange doesn't have + // shared memory constraints + int const regs_per_sm = prop.regsPerMultiprocessor; + int const regs_per_thread = attr.numRegs; + int const max_threads_per_sm = regs_per_sm / regs_per_thread; + return std::min( + max_threads_per_sm, + static_cast(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism)); + } Policy const& get_policy() const { return m_rp; } - - inline __device__ void operator()(void) const { + inline __device__ void operator()() const { Kokkos::Impl::DeviceIterateTile(m_rp, m_functor) .exec_range(); @@ -689,7 
+705,7 @@ class ParallelFor, public: Policy const& get_policy() const { return m_policy; } - __device__ inline void operator()(void) const { + __device__ inline void operator()() const { // Iterate this block through the league int64_t threadid = 0; if (m_scratch_size[1] > 0) { @@ -1248,8 +1264,21 @@ class ParallelReduce, ReducerType, using DummySHMEMReductionType = int; public: + template + static int max_tile_size_product(const Policy& pol, const Functor&) { + cudaFuncAttributes attr = + CudaParallelLaunch::get_cuda_func_attributes(); + auto const& prop = pol.space().cuda_device_prop(); + // Limits due do registers/SM + int const regs_per_sm = prop.regsPerMultiprocessor; + int const regs_per_thread = attr.numRegs; + int const max_threads_per_sm = regs_per_sm / regs_per_thread; + return std::min( + max_threads_per_sm, + static_cast(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism)); + } Policy const& get_policy() const { return m_policy; } - inline __device__ void exec_range(reference_type update) const { Kokkos::Impl::Reduce::DeviceIterateTile, ReducerType, .exec_range(); } - inline __device__ void operator()(void) const { + inline __device__ void operator()() const { /* run(Kokkos::Impl::if_c::select(1,1.0) ); } @@ -2074,7 +2103,7 @@ class ParallelScan, Kokkos::Cuda> { //---------------------------------------- - __device__ inline void initial(void) const { + __device__ inline void initial() const { const integral_nonzero_constant word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); @@ -2110,7 +2139,7 @@ class ParallelScan, Kokkos::Cuda> { //---------------------------------------- - __device__ inline void final(void) const { + __device__ inline void final() const { const integral_nonzero_constant word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); @@ -2195,7 +2224,7 @@ class ParallelScan, Kokkos::Cuda> { //---------------------------------------- - __device__ inline void operator()(void) const { + __device__ inline void 
operator()() const { #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION if (m_run_serial) { typename ValueTraits::value_type value; @@ -2364,7 +2393,7 @@ class ParallelScanWithTotal, //---------------------------------------- - __device__ inline void initial(void) const { + __device__ inline void initial() const { const integral_nonzero_constant word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); @@ -2400,7 +2429,7 @@ class ParallelScanWithTotal, //---------------------------------------- - __device__ inline void final(void) const { + __device__ inline void final() const { const integral_nonzero_constant word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); @@ -2487,7 +2516,7 @@ class ParallelScanWithTotal, //---------------------------------------- - __device__ inline void operator()(void) const { + __device__ inline void operator()() const { #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION if (m_run_serial) { typename ValueTraits::value_type value; diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp index 4b472f5d4fd8..e7806390155d 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp @@ -661,13 +661,14 @@ KOKKOS_INLINE_FUNCTION thread, count); } -template -KOKKOS_INLINE_FUNCTION - Impl::ThreadVectorRangeBoundariesStruct - ThreadVectorRange(const Impl::CudaTeamMember& thread, iType arg_begin, - iType arg_end) { +template +KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct< + typename std::common_type::type, Impl::CudaTeamMember> +ThreadVectorRange(const Impl::CudaTeamMember& thread, iType1 arg_begin, + iType2 arg_end) { + using iType = typename std::common_type::type; return Impl::ThreadVectorRangeBoundariesStruct( - thread, arg_begin, arg_end); + thread, iType(arg_begin), iType(arg_end)); } KOKKOS_INLINE_FUNCTION @@ -983,7 +984,7 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( 
//---------------------------------------------------------------------------- -/** \brief Intra-thread vector parallel exclusive prefix sum. +/** \brief Intra-thread vector parallel scan with reducer. * * Executes closure(iType i, ValueType & val, bool final) for each i=[0..N) * @@ -991,25 +992,25 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( * thread and a scan operation is performed. * The last call to closure has final == true. */ -template -KOKKOS_INLINE_FUNCTION void parallel_scan( - const Impl::ThreadVectorRangeBoundariesStruct& - loop_boundaries, - const Closure& closure) { +template +KOKKOS_INLINE_FUNCTION + typename std::enable_if::value>::type + parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::CudaTeamMember>& loop_boundaries, + const Closure& closure, const ReducerType& reducer) { (void)loop_boundaries; (void)closure; + (void)reducer; #ifdef __CUDA_ARCH__ - // Extract value_type from closure - - using value_type = typename Kokkos::Impl::FunctorAnalysis< - Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type; + using value_type = typename ReducerType::value_type; + value_type accum; + reducer.init(accum); + const value_type identity = accum; // Loop through boundaries by vector-length chunks // must scan at each iteration - value_type accum = 0; - // All thread "lanes" must loop the same number of times. // Determine an loop end for all thread "lanes." // Requires: @@ -1026,44 +1027,68 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( const int end = loop_boundaries.end + (rem ? blockDim.x - rem : 0); for (int i = threadIdx.x; i < end; i += blockDim.x) { - value_type val = 0; + value_type val = identity; - // First acquire per-lane contributions: - if (i < loop_boundaries.end) closure(i, val, false); + // First acquire per-lane contributions. 
+ // This sets i's val to i-1's contribution + // to make the latter in_place_shfl_up an + // exclusive scan -- the final accumulation + // of i's val will be included in the second + // closure call later. + if (i < loop_boundaries.end && threadIdx.x > 0) closure(i - 1, val, false); - value_type sval = val; - - // Bottom up inclusive scan in triangular pattern + // Bottom up exclusive scan in triangular pattern // where each CUDA thread is the root of a reduction tree // from the zeroth "lane" to itself. // [t] += [t-1] if t >= 1 // [t] += [t-2] if t >= 2 // [t] += [t-4] if t >= 4 // ... - + // This differs from the non-reducer overload, where an inclusive scan was + // implemented, because in general the binary operator cannot be inverted + // and we would not be able to remove the inclusive contribution by + // inversion. for (int j = 1; j < (int)blockDim.x; j <<= 1) { - value_type tmp = 0; - Impl::in_place_shfl_up(tmp, sval, j, blockDim.x, active_mask); + value_type tmp = identity; + Impl::in_place_shfl_up(tmp, val, j, blockDim.x, active_mask); if (j <= (int)threadIdx.x) { - sval += tmp; + reducer.join(val, tmp); } } - // Include accumulation and remove value for exclusive scan: - val = accum + sval - val; + // Include accumulation + reducer.join(val, accum); - // Provide exclusive scan value: + // Update i's contribution into the val + // and add it to accum for next round if (i < loop_boundaries.end) closure(i, val, true); - - // Accumulate the last value in the inclusive scan: - Impl::in_place_shfl(sval, sval, mask, blockDim.x, active_mask); - - accum += sval; + Impl::in_place_shfl(accum, val, mask, blockDim.x, active_mask); } #endif } +//---------------------------------------------------------------------------- + +/** \brief Intra-thread vector parallel exclusive prefix sum. 
+ * + * Executes closure(iType i, ValueType & val, bool final) for each i=[0..N) + * + * The range [0..N) is mapped to all vector lanes in the + * thread and a scan operation is performed. + * The last call to closure has final == true. + */ +template +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::ThreadVectorRangeBoundariesStruct& + loop_boundaries, + const Closure& closure) { + using value_type = typename Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type; + value_type dummy; + parallel_scan(loop_boundaries, closure, Kokkos::Sum(dummy)); +} + } // namespace Kokkos namespace Kokkos { diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp index f24abb377dae..c55956ede966 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp @@ -139,7 +139,7 @@ struct CudaLDGFetch { template KOKKOS_INLINE_FUNCTION ValueType operator[](const iType& i) const { -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) && (350 <= _CUDA_ARCH__) AliasType v = __ldg(reinterpret_cast(&m_ptr[i])); return *(reinterpret_cast(&v)); #else diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp index 05876a9f0226..fc52e4151452 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp @@ -46,6 +46,7 @@ #define KOKKOS_CUDA_WORKGRAPHPOLICY_HPP #include +#include namespace Kokkos { namespace Impl { diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp index 89135b6c45b9..9278d1bdc9ef 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp @@ -75,17 +75,6 @@ void 
hipOccupancy(int *numBlocks, int blockSize, int sharedmem) { hipOccupancy( numBlocks, blockSize, sharedmem); } -template -struct HIPGetMaxBlockSize; - -template -int hip_get_max_block_size(typename DriverType::functor_type const &f, - size_t const vector_length, - size_t const shmem_extra_block, - size_t const shmem_extra_thread) { - return HIPGetMaxBlockSize::get_block_size( - f, vector_length, shmem_extra_block, shmem_extra_thread); -} template int hip_internal_get_block_size(const F &condition_check, @@ -131,10 +120,6 @@ int hip_internal_get_block_size(const F &condition_check, int opt_block_size = (blocks_per_sm >= min_blocks_per_sm) ? block_size : min_blocks_per_sm; int opt_threads_per_sm = threads_per_sm; - // printf("BlockSizeMax: %i Shmem: %i %i %i %i Regs: %i %i Blocks: %i %i - // Achieved: %i %i Opt: %i %i\n",block_size, - // shmem_per_sm,max_shmem_per_block,functor_shmem,total_shmem, - // regs_per_sm,regs_per_wavefront,max_blocks_shmem,max_blocks_regs,blocks_per_sm,threads_per_sm,opt_block_size,opt_threads_per_sm); block_size -= HIPTraits::WarpSize; while (condition_check(blocks_per_sm) && (block_size >= HIPTraits::WarpSize)) { @@ -160,10 +145,6 @@ int hip_internal_get_block_size(const F &condition_check, opt_threads_per_sm = threads_per_sm; } } - // printf("BlockSizeMax: %i Shmem: %i %i %i %i Regs: %i %i Blocks: %i %i - // Achieved: %i %i Opt: %i %i\n",block_size, - // shmem_per_sm,max_shmem_per_block,functor_shmem,total_shmem, - // regs_per_sm,regs_per_wavefront,max_blocks_shmem,max_blocks_regs,blocks_per_sm,threads_per_sm,opt_block_size,opt_threads_per_sm); block_size -= HIPTraits::WarpSize; } return opt_block_size; @@ -178,62 +159,6 @@ int hip_get_max_block_size(const HIPInternal *hip_instance, [](int x) { return x == 0; }, hip_instance, attr, f, vector_length, shmem_block, shmem_thread); } -template -struct HIPGetMaxBlockSize { - static int get_block_size(typename DriverType::functor_type const &f, - size_t const vector_length, - size_t const 
shmem_extra_block, - size_t const shmem_extra_thread) { - int numBlocks = 0; - int blockSize = LaunchBounds::maxTperB == 0 ? 1024 : LaunchBounds::maxTperB; - int sharedmem = - shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) + - ::Kokkos::Impl::FunctorTeamShmemSize< - typename DriverType::functor_type>::value(f, blockSize / - vector_length); - - hipOccupancy(&numBlocks, blockSize, sharedmem); - - if (numBlocks > 0) return blockSize; - while (blockSize > HIPTraits::WarpSize && numBlocks == 0) { - blockSize /= 2; - sharedmem = - shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) + - ::Kokkos::Impl::FunctorTeamShmemSize< - typename DriverType::functor_type>::value(f, blockSize / - vector_length); - - hipOccupancy(&numBlocks, blockSize, sharedmem); - } - int blockSizeUpperBound = blockSize * 2; - while (blockSize < blockSizeUpperBound && numBlocks > 0) { - blockSize += HIPTraits::WarpSize; - sharedmem = - shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) + - ::Kokkos::Impl::FunctorTeamShmemSize< - typename DriverType::functor_type>::value(f, blockSize / - vector_length); - - hipOccupancy(&numBlocks, blockSize, sharedmem); - } - return blockSize - HIPTraits::WarpSize; - } -}; - -template -struct HIPGetOptBlockSize; - -template -int hip_get_opt_block_size(typename DriverType::functor_type const &f, - size_t const vector_length, - size_t const shmem_extra_block, - size_t const shmem_extra_thread) { - return HIPGetOptBlockSize< - DriverType, LaunchBounds, - (HIPTraits::ConstantMemoryUseThreshold < - sizeof(DriverType))>::get_block_size(f, vector_length, shmem_extra_block, - shmem_extra_thread); -} template int hip_get_opt_block_size(HIPInternal const *hip_instance, @@ -245,157 +170,6 @@ int hip_get_opt_block_size(HIPInternal const *hip_instance, shmem_block, shmem_thread); } -// FIXME_HIP the code is identical to the false struct except for -// hip_parallel_launch_constant_memory -template -struct HIPGetOptBlockSize, 
true> { - static int get_block_size(typename DriverType::functor_type const &f, - size_t const vector_length, - size_t const shmem_extra_block, - size_t const shmem_extra_thread) { - int blockSize = HIPTraits::WarpSize / 2; - int numBlocks; - int sharedmem; - int maxOccupancy = 0; - int bestBlockSize = 0; - - while (blockSize < HIPTraits::MaxThreadsPerBlock) { - blockSize *= 2; - - // calculate the occupancy with that optBlockSize and check whether its - // larger than the largest one found so far - sharedmem = - shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) + - ::Kokkos::Impl::FunctorTeamShmemSize< - typename DriverType::functor_type>::value(f, blockSize / - vector_length); - hipOccupancy(&numBlocks, blockSize, sharedmem); - if (maxOccupancy < numBlocks * blockSize) { - maxOccupancy = numBlocks * blockSize; - bestBlockSize = blockSize; - } - } - return bestBlockSize; - } -}; - -template -struct HIPGetOptBlockSize, false> { - static int get_block_size(const typename DriverType::functor_type &f, - const size_t vector_length, - const size_t shmem_extra_block, - const size_t shmem_extra_thread) { - int blockSize = HIPTraits::WarpSize / 2; - int numBlocks; - int sharedmem; - int maxOccupancy = 0; - int bestBlockSize = 0; - - while (blockSize < HIPTraits::MaxThreadsPerBlock) { - blockSize *= 2; - sharedmem = - shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) + - ::Kokkos::Impl::FunctorTeamShmemSize< - typename DriverType::functor_type>::value(f, blockSize / - vector_length); - - hipOccupancy(&numBlocks, blockSize, sharedmem); - - if (maxOccupancy < numBlocks * blockSize) { - maxOccupancy = numBlocks * blockSize; - bestBlockSize = blockSize; - } - } - return bestBlockSize; - } -}; - -// FIXME_HIP the code is identical to the false struct except for -// hip_parallel_launch_constant_memory -template -struct HIPGetOptBlockSize< - DriverType, Kokkos::LaunchBounds, - true> { - static int get_block_size(const typename 
DriverType::functor_type &f, - const size_t vector_length, - const size_t shmem_extra_block, - const size_t shmem_extra_thread) { - int blockSize = HIPTraits::WarpSize / 2; - int numBlocks; - int sharedmem; - int maxOccupancy = 0; - int bestBlockSize = 0; - int max_threads_per_block = - std::min(MaxThreadsPerBlock, - hip_internal_maximum_warp_count() * HIPTraits::WarpSize); - - while (blockSize < max_threads_per_block) { - blockSize *= 2; - - // calculate the occupancy with that optBlockSize and check whether its - // larger than the largest one found so far - sharedmem = - shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) + - ::Kokkos::Impl::FunctorTeamShmemSize< - typename DriverType::functor_type>::value(f, blockSize / - vector_length); - hipOccupancy( - &numBlocks, blockSize, sharedmem); - if (numBlocks >= static_cast(MinBlocksPerSM) && - blockSize <= static_cast(MaxThreadsPerBlock)) { - if (maxOccupancy < numBlocks * blockSize) { - maxOccupancy = numBlocks * blockSize; - bestBlockSize = blockSize; - } - } - } - if (maxOccupancy > 0) return bestBlockSize; - return -1; - } -}; - -template -struct HIPGetOptBlockSize< - DriverType, Kokkos::LaunchBounds, - false> { - static int get_block_size(const typename DriverType::functor_type &f, - const size_t vector_length, - const size_t shmem_extra_block, - const size_t shmem_extra_thread) { - int blockSize = HIPTraits::WarpSize / 2; - int numBlocks; - int sharedmem; - int maxOccupancy = 0; - int bestBlockSize = 0; - int max_threads_per_block = - std::min(MaxThreadsPerBlock, - hip_internal_maximum_warp_count() * HIPTraits::WarpSize); - - while (blockSize < max_threads_per_block) { - blockSize *= 2; - sharedmem = - shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) + - ::Kokkos::Impl::FunctorTeamShmemSize< - typename DriverType::functor_type>::value(f, blockSize / - vector_length); - - hipOccupancy( - &numBlocks, blockSize, sharedmem); - if (numBlocks >= int(MinBlocksPerSM) && - 
blockSize <= int(MaxThreadsPerBlock)) { - if (maxOccupancy < numBlocks * blockSize) { - maxOccupancy = numBlocks * blockSize; - bestBlockSize = blockSize; - } - } - } - if (maxOccupancy > 0) return bestBlockSize; - return -1; - } -}; - } // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp index 45512038acaf..18ef10e22cd3 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -164,6 +164,8 @@ HIPInternal &HIPInternal::singleton() { void HIPInternal::fence() const { HIP_SAFE_CALL(hipStreamSynchronize(m_stream)); + // can reset our cycle id now as well + m_cycleId = 0; } void HIPInternal::initialize(int hip_device_id, hipStream_t stream) { @@ -256,7 +258,7 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) { void>; Record *const r = Record::allocate(Kokkos::Experimental::HIPSpace(), - "InternalScratchBitset", + "Kokkos::InternalScratchBitset", sizeof(uint32_t) * buffer_bound); Record::increment(r); @@ -303,8 +305,10 @@ Kokkos::Experimental::HIP::size_type *HIPInternal::scratch_space( Kokkos::Impl::SharedAllocationRecord; - static Record *const r = Record::allocate( - Kokkos::Experimental::HIPSpace(), "InternalScratchSpace", + if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace)); + + Record *const r = Record::allocate( + Kokkos::Experimental::HIPSpace(), "Kokkos::InternalScratchSpace", (sizeScratchGrain * m_scratchSpaceCount)); Record::increment(r); @@ -325,8 +329,10 @@ Kokkos::Experimental::HIP::size_type *HIPInternal::scratch_flags( Kokkos::Impl::SharedAllocationRecord; + if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags)); + Record *const r = Record::allocate( - Kokkos::Experimental::HIPSpace(), "InternalScratchFlags", + Kokkos::Experimental::HIPSpace(), "Kokkos::InternalScratchFlags", 
(sizeScratchGrain * m_scratchFlagsCount)); Record::increment(r); @@ -345,7 +351,7 @@ void *HIPInternal::resize_team_scratch_space(std::int64_t bytes, if (m_team_scratch_current_size == 0) { m_team_scratch_current_size = bytes; m_team_scratch_ptr = Kokkos::kokkos_malloc( - "HIPSpace::ScratchMemory", m_team_scratch_current_size); + "Kokkos::HIPSpace::TeamScratchMemory", m_team_scratch_current_size); } if ((bytes > m_team_scratch_current_size) || ((bytes < m_team_scratch_current_size) && (force_shrink))) { @@ -388,6 +394,40 @@ void HIPInternal::finalize() { m_team_scratch_current_size = 0; m_team_scratch_ptr = nullptr; } + if (nullptr != d_driverWorkArray) { + HIP_SAFE_CALL(hipHostFree(d_driverWorkArray)); + d_driverWorkArray = nullptr; + } +} + +char *HIPInternal::get_next_driver(size_t driverTypeSize) const { + std::lock_guard const lock(m_mutexWorkArray); + if (d_driverWorkArray == nullptr) { + HIP_SAFE_CALL( + hipHostMalloc(&d_driverWorkArray, + m_maxDriverCycles * m_maxDriverTypeSize * sizeof(char), + hipHostMallocNonCoherent)); + } + if (driverTypeSize > m_maxDriverTypeSize) { + // fence handles the cycle id reset for us + fence(); + HIP_SAFE_CALL(hipHostFree(d_driverWorkArray)); + m_maxDriverTypeSize = driverTypeSize; + if (m_maxDriverTypeSize % 128 != 0) + m_maxDriverTypeSize = + m_maxDriverTypeSize + 128 - m_maxDriverTypeSize % 128; + HIP_SAFE_CALL( + hipHostMalloc(&d_driverWorkArray, + m_maxDriverCycles * m_maxDriverTypeSize * sizeof(char), + hipHostMallocNonCoherent)); + } else { + m_cycleId = (m_cycleId + 1) % m_maxDriverCycles; + if (m_cycleId == 0) { + // ensure any outstanding kernels are completed before we wrap around + fence(); + } + } + return &d_driverWorkArray[m_maxDriverTypeSize * m_cycleId]; } //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp index 07ec8625e693..f4f88628e313 100644 --- 
a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -49,6 +49,8 @@ #include +#include + namespace Kokkos { namespace Experimental { namespace Impl { @@ -83,33 +85,46 @@ class HIPInternal { public: using size_type = ::Kokkos::Experimental::HIP::size_type; - int m_hipDev; - int m_hipArch; - unsigned m_multiProcCount; - unsigned m_maxWarpCount; - unsigned m_maxBlock; - unsigned m_maxBlocksPerSM; - unsigned m_maxSharedWords; + int m_hipDev = -1; + int m_hipArch = -1; + unsigned m_multiProcCount = 0; + unsigned m_maxWarpCount = 0; + unsigned m_maxBlock = 0; + unsigned m_maxBlocksPerSM = 0; + unsigned m_maxSharedWords = 0; int m_regsPerSM; - int m_shmemPerSM; - int m_maxShmemPerBlock; - int m_maxThreadsPerSM; + int m_shmemPerSM = 0; + int m_maxShmemPerBlock = 0; + int m_maxThreadsPerSM = 0; + + // array of DriverTypes to be allocated in host-pinned memory for async + // kernel launches + mutable char *d_driverWorkArray = nullptr; + // number of kernel launches that can be in-flight w/o synchronization + const int m_maxDriverCycles = 100; + // max size of a DriverType [bytes] + mutable size_t m_maxDriverTypeSize = 1024 * 10; + // the current index in the driverWorkArray + mutable int m_cycleId = 0; + // mutex to access d_driverWorkArray + mutable std::mutex m_mutexWorkArray; // Scratch Spaces for Reductions - size_type m_scratchSpaceCount; - size_type m_scratchFlagsCount; + size_type m_scratchSpaceCount = 0; + size_type m_scratchFlagsCount = 0; - size_type *m_scratchSpace; - size_type *m_scratchFlags; + size_type *m_scratchSpace = nullptr; + size_type *m_scratchFlags = nullptr; uint32_t *m_scratchConcurrentBitset = nullptr; hipDeviceProp_t m_deviceProp; - hipStream_t m_stream; + hipStream_t m_stream = nullptr; // Team Scratch Level 1 Space - mutable int64_t m_team_scratch_current_size; - mutable void *m_team_scratch_ptr; + mutable int64_t m_team_scratch_current_size = 0; + mutable void *m_team_scratch_ptr = 
nullptr; + mutable std::mutex m_team_scratch_mutex; bool was_finalized = false; @@ -117,9 +132,7 @@ class HIPInternal { int verify_is_initialized(const char *const label) const; - int is_initialized() const { - return m_hipDev >= 0; - } // 0 != m_scratchSpace && 0 != m_scratchFlags ; } + int is_initialized() const { return m_hipDev >= 0; } void initialize(int hip_device_id, hipStream_t stream = nullptr); void finalize(); @@ -128,25 +141,12 @@ class HIPInternal { void fence() const; + // returns the next driver type pointer in our work array + char *get_next_driver(size_t driverTypeSize) const; + ~HIPInternal(); - HIPInternal() - : m_hipDev(-1), - m_hipArch(-1), - m_multiProcCount(0), - m_maxWarpCount(0), - m_maxBlock(0), - m_maxSharedWords(0), - m_shmemPerSM(0), - m_maxShmemPerBlock(0), - m_maxThreadsPerSM(0), - m_scratchSpaceCount(0), - m_scratchFlagsCount(0), - m_scratchSpace(nullptr), - m_scratchFlags(nullptr), - m_stream(nullptr), - m_team_scratch_current_size(0), - m_team_scratch_ptr(nullptr) {} + HIPInternal() = default; // Resizing of reduction related scratch spaces size_type *scratch_space(const size_type size); diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index 3e972c7346b8..f774423b378b 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -49,9 +49,9 @@ #if defined(__HIPCC__) -#include #include #include +#include // Must use global variable on the device with HIP-Clang #ifdef __HIP__ @@ -127,93 +127,87 @@ struct HIPDispatchProperties { HIPLaunchMechanism launch_mechanism = l; }; -template , - HIPLaunchMechanism LaunchMechanism = HIPLaunchMechanism::LocalMemory> -struct HIPParallelLaunch; +template +struct HIPParallelLaunchKernelFunc; -template -struct HIPParallelLaunch< +struct HIPParallelLaunchKernelFunc< DriverType, Kokkos::LaunchBounds, HIPLaunchMechanism::LocalMemory> { - inline 
HIPParallelLaunch(const DriverType &driver, const dim3 &grid, - const dim3 &block, const int shmem, - const HIPInternal *hip_instance, - const bool /*prefer_shmem*/) { - if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) { - if (hip_instance->m_maxShmemPerBlock < shmem) { - Kokkos::Impl::throw_runtime_exception( - "HIPParallelLaunch FAILED: shared memory request is too large"); - } - - KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE(); - - // FIXME_HIP -- there is currently an error copying (some) structs - // by value to the device in HIP-Clang / VDI - // As a workaround, we can malloc the DriverType and explictly copy over. - // To remove once solved in HIP - DriverType *d_driver; - HIP_SAFE_CALL(hipMalloc(&d_driver, sizeof(DriverType))); - HIP_SAFE_CALL(hipMemcpyAsync(d_driver, &driver, sizeof(DriverType), - hipMemcpyHostToDevice, - hip_instance->m_stream)); - hip_parallel_launch_local_memory - <<m_stream>>>(d_driver); + static auto get_kernel_func() { + return hip_parallel_launch_local_memory; + } +}; -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - HIP_SAFE_CALL(hipGetLastError()); - hip_instance->fence(); -#endif - HIP_SAFE_CALL(hipFree(d_driver)); - } +template +struct HIPParallelLaunchKernelFunc, + HIPLaunchMechanism::LocalMemory> { + static auto get_kernel_func() { + return hip_parallel_launch_local_memory; } +}; - static hipFuncAttributes get_hip_func_attributes() { - static hipFuncAttributes attr = []() { - hipFuncAttributes attr; - HIP_SAFE_CALL(hipFuncGetAttributes( - &attr, - reinterpret_cast( - hip_parallel_launch_local_memory))); - return attr; - }(); - return attr; +template +struct HIPParallelLaunchKernelInvoker; + +template +struct HIPParallelLaunchKernelInvoker + : HIPParallelLaunchKernelFunc { + using base_t = HIPParallelLaunchKernelFunc; + + static void invoke_kernel(DriverType const *driver, dim3 const &grid, + dim3 const &block, int shmem, + HIPInternal const *hip_instance) { + (base_t::get_kernel_func())<<m_stream>>>( + driver); } }; 
-template -struct HIPParallelLaunch, - HIPLaunchMechanism::LocalMemory> { - inline HIPParallelLaunch(const DriverType &driver, const dim3 &grid, - const dim3 &block, const int shmem, - const HIPInternal *hip_instance, - const bool /*prefer_shmem*/) { +template , + HIPLaunchMechanism LaunchMechanism = HIPLaunchMechanism::LocalMemory> +struct HIPParallelLaunch; + +template +struct HIPParallelLaunch< + DriverType, Kokkos::LaunchBounds, + HIPLaunchMechanism::LocalMemory> + : HIPParallelLaunchKernelInvoker< + DriverType, Kokkos::LaunchBounds, + HIPLaunchMechanism::LocalMemory> { + using base_t = HIPParallelLaunchKernelInvoker< + DriverType, Kokkos::LaunchBounds, + HIPLaunchMechanism::LocalMemory>; + + HIPParallelLaunch(const DriverType &driver, const dim3 &grid, + const dim3 &block, const int shmem, + const HIPInternal *hip_instance, + const bool /*prefer_shmem*/) { if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) { if (hip_instance->m_maxShmemPerBlock < shmem) { - Kokkos::Impl::throw_runtime_exception(std::string( - "HIPParallelLaunch FAILED: shared memory request is too large")); + Kokkos::Impl::throw_runtime_exception( + "HIPParallelLaunch FAILED: shared memory request is too large"); } KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE(); // Invoke the driver function on the device - - // FIXME_HIP -- see note about struct copy by value above - DriverType *d_driver; - HIP_SAFE_CALL(hipMalloc(&d_driver, sizeof(DriverType))); - HIP_SAFE_CALL(hipMemcpyAsync(d_driver, &driver, sizeof(DriverType), - hipMemcpyHostToDevice, - hip_instance->m_stream)); - hip_parallel_launch_local_memory - <<m_stream>>>(d_driver); + DriverType *d_driver = reinterpret_cast( + hip_instance->get_next_driver(sizeof(DriverType))); + std::memcpy((void *)d_driver, (void *)&driver, sizeof(DriverType)); + base_t::invoke_kernel(d_driver, grid, block, shmem, hip_instance); #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) HIP_SAFE_CALL(hipGetLastError()); hip_instance->fence(); #endif - 
HIP_SAFE_CALL(hipFree(d_driver)); } } @@ -221,8 +215,7 @@ struct HIPParallelLaunch, static hipFuncAttributes attr = []() { hipFuncAttributes attr; HIP_SAFE_CALL(hipFuncGetAttributes( - &attr, reinterpret_cast( - hip_parallel_launch_local_memory))); + &attr, reinterpret_cast(base_t::get_kernel_func()))); return attr; }(); return attr; diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp new file mode 100644 index 000000000000..ce1aff9586d2 --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp @@ -0,0 +1,37 @@ +#ifndef KOKKOS_HIP_MDRANGEPOLICY_HPP_ +#define KOKKOS_HIP_MDRANGEPOLICY_HPP_ + +#include + +namespace Kokkos { + +template <> +struct default_outer_direction { + using type = Iterate; + static constexpr Iterate value = Iterate::Left; +}; + +template <> +struct default_inner_direction { + using type = Iterate; + static constexpr Iterate value = Iterate::Left; +}; + +namespace Impl { + +// Settings for MDRangePolicy +template <> +inline TileSizeProperties get_tile_size_properties( + const Kokkos::Experimental::HIP& space) { + TileSizeProperties properties; + properties.max_threads = + space.impl_internal_space_instance()->m_maxThreadsPerSM; + properties.default_largest_tile_size = 16; + properties.default_tile_size = 4; + properties.max_total_tile_size = 1024; + return properties; +} + +} // Namespace Impl +} // Namespace Kokkos +#endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp index 6b831ff7a3dd..35e7d6fb853a 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp @@ -49,6 +49,7 @@ #include #include #include +#include #include namespace Kokkos { @@ -72,7 +73,7 @@ class ParallelFor, ParallelFor& operator=(ParallelFor const&) = delete; public: - inline __device__ void operator()(void) 
const { + inline __device__ void operator()() const { Kokkos::Impl::DeviceIterateTile(m_policy, m_functor) @@ -175,6 +176,25 @@ class ParallelFor, ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) : m_functor(arg_functor), m_policy(arg_policy) {} + + template + static int max_tile_size_product(const Policy& pol, const Functor&) { + using closure_type = + ParallelFor, + Kokkos::Experimental::HIP>; + hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch< + closure_type, LaunchBounds>::get_hip_func_attributes(); + auto const& prop = pol.space().hip_device_prop(); + // Limits due to registers/SM, MDRange doesn't have + // shared memory constraints + int const regs_per_sm = prop.regsPerMultiprocessor; + int const regs_per_thread = attr.numRegs; + int const max_threads_per_sm = regs_per_sm / regs_per_thread; + return std::min( + max_threads_per_sm, + static_cast( + Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock)); + } }; // ParallelReduce @@ -231,7 +251,7 @@ class ParallelReduce, ReducerType, DeviceIteratePattern(m_policy, m_functor, update).exec_range(); } - inline __device__ void operator()(void) const { + inline __device__ void operator()() const { const integral_nonzero_constant word_count(ValueTraits::value_size( @@ -291,13 +311,19 @@ class ParallelReduce, ReducerType, ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock; int shmem_size = ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem< false, FunctorType, WorkTag>(f, n); + using closure_type = Impl::ParallelReduce; + hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< + closure_type, LaunchBounds>::get_hip_func_attributes(); while ( (n && (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < shmem_size)) || - (n > static_cast( - ::Kokkos::Experimental::Impl::hip_get_max_block_size< - ParallelReduce, LaunchBounds>(f, 1, shmem_size, 0)))) { + (n > + static_cast( + 
::Kokkos::Experimental::Impl::hip_get_max_block_size( + m_policy.space().impl_internal_space_instance(), attr, f, 1, + shmem_size, 0)))) { n >>= 1; shmem_size = ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem< false, FunctorType, WorkTag>(f, n); @@ -391,6 +417,23 @@ class ParallelReduce, ReducerType, memory_space>::accessible), m_scratch_space(nullptr), m_scratch_flags(nullptr) {} + template + static int max_tile_size_product(const Policy& pol, const Functor&) { + using closure_type = + ParallelReduce, + ReducerType, Kokkos::Experimental::HIP>; + hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch< + closure_type, LaunchBounds>::get_hip_func_attributes(); + auto const& prop = pol.space().hip_device_prop(); + // Limits due to registers/SM + int const regs_per_sm = prop.regsPerMultiprocessor; + int const regs_per_thread = attr.numRegs; + int const max_threads_per_sm = regs_per_sm / regs_per_thread; + return std::min( + max_threads_per_sm, + static_cast( + Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock)); + } }; } // namespace Impl } // namespace Kokkos diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp index 5607f1c91a5d..7d2825eeb4c6 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp @@ -92,7 +92,7 @@ class ParallelFor, public: using functor_type = FunctorType; - inline __device__ void operator()(void) const { + inline __device__ void operator()() const { const Member work_stride = blockDim.y * gridDim.x; const Member work_end = m_policy.end(); @@ -174,11 +174,14 @@ class ParallelReduce, ReducerType, size_type* m_scratch_space = nullptr; size_type* m_scratch_flags = nullptr; - // FIXME_HIP_PERFORMANCE Need a rule to choose when to use shared memory and - // when to use shuffle +#if HIP_VERSION < 401 static bool constexpr UseShflReduction = ((sizeof(value_type) >
2 * sizeof(double)) && static_cast(ValueTraits::StaticValueSize)); +#else + static bool constexpr UseShflReduction = + static_cast(ValueTraits::StaticValueSize); +#endif private: struct ShflReductionTag {}; @@ -330,13 +333,19 @@ class ParallelReduce, ReducerType, int shmem_size = hip_single_inter_block_reduce_scan_shmem( f, n); + using closure_type = Impl::ParallelReduce; + hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< + closure_type, LaunchBounds>::get_hip_func_attributes(); while ( (n && (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < shmem_size)) || - (n > static_cast( - Kokkos::Experimental::Impl::hip_get_max_block_size< - ParallelReduce, LaunchBounds>(f, 1, shmem_size, 0)))) { + (n > + static_cast( + ::Kokkos::Experimental::Impl::hip_get_max_block_size( + m_policy.space().impl_internal_space_instance(), attr, f, 1, + shmem_size, 0)))) { n >>= 1; shmem_size = hip_single_inter_block_reduce_scan_shmem( @@ -493,7 +502,7 @@ class ParallelScanHIPBase { //---------------------------------------- - __device__ inline void initial(void) const { + __device__ inline void initial() const { const integral_nonzero_constant word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); @@ -529,7 +538,7 @@ class ParallelScanHIPBase { //---------------------------------------- - __device__ inline void final(void) const { + __device__ inline void final() const { const integral_nonzero_constant word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); @@ -606,7 +615,7 @@ class ParallelScanHIPBase { public: //---------------------------------------- - __device__ inline void operator()(void) const { + __device__ inline void operator()() const { if (!m_final) { initial(); } else { diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp index 5da83d289e2f..96c3ff2a7510 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp +++ 
b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp @@ -433,6 +433,9 @@ class ParallelFor, int m_shmem_size; void* m_scratch_ptr[2]; int m_scratch_size[2]; + // Only let one ParallelFor/Reduce modify the team scratch memory. The + // constructor acquires the mutex which is released in the destructor. + std::unique_lock m_scratch_lock; template __device__ inline @@ -449,7 +452,7 @@ class ParallelFor, } public: - __device__ inline void operator()(void) const { + __device__ inline void operator()() const { // Iterate this block through the league int64_t threadid = 0; if (m_scratch_size[1] > 0) { @@ -513,7 +516,10 @@ class ParallelFor, m_policy(arg_policy), m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()) { + m_vector_size(arg_policy.impl_vector_length()), + m_scratch_lock(m_policy.space() + .impl_internal_space_instance() + ->m_team_scratch_mutex) { hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< ParallelFor, launch_bounds>::get_hip_func_attributes(); m_team_size = @@ -640,6 +646,9 @@ class ParallelReduce, const size_type m_league_size; int m_team_size; const size_type m_vector_size; + // Only let one ParallelFor/Reduce modify the team scratch memory. The + // constructor acquires the mutex which is released in the destructor. 
+ std::unique_lock m_scratch_lock; template __device__ inline @@ -877,7 +886,10 @@ class ParallelReduce, m_scratch_ptr{nullptr, nullptr}, m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()) { + m_vector_size(arg_policy.impl_vector_length()), + m_scratch_lock(m_policy.space() + .impl_internal_space_instance() + ->m_team_scratch_mutex) { hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch< ParallelReduce, launch_bounds>::get_hip_func_attributes(); m_team_size = @@ -976,7 +988,10 @@ class ParallelReduce, m_scratch_ptr{nullptr, nullptr}, m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()) { + m_vector_size(arg_policy.impl_vector_length()), + m_scratch_lock(m_policy.space() + .impl_internal_space_instance() + ->m_team_scratch_mutex) { hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch< ParallelReduce, launch_bounds>::get_hip_func_attributes(); m_team_size = diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp index 00cef28f826d..15ca089d1474 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp @@ -42,12 +42,6 @@ //@HEADER */ -#include -#include -#include -#include -#include -#include #include #include @@ -57,6 +51,13 @@ #include #include +#include +#include +#include +#include +#include +#include + /*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/ namespace Kokkos { @@ -172,14 +173,14 @@ void DeepCopyAsyncHIP(void* dst, void const* src, size_t n) { namespace Kokkos { -void Experimental::HIPSpace::access_error() { +KOKKOS_DEPRECATED void Experimental::HIPSpace::access_error() { const std::string msg( "Kokkos::Experimental::HIPSpace::access_error attempt 
to execute " "Experimental::HIP function from non-HIP space"); Kokkos::Impl::throw_runtime_exception(msg); } -void Experimental::HIPSpace::access_error(const void* const) { +KOKKOS_DEPRECATED void Experimental::HIPSpace::access_error(const void* const) { const std::string msg( "Kokkos::Experimental::HIPSpace::access_error attempt to execute " "Experimental::HIP function from non-HIP space"); @@ -326,45 +327,6 @@ SharedAllocationRecord SharedAllocationRecord< Kokkos::Experimental::HIPHostPinnedSpace, void>::s_root_record; #endif -std::string SharedAllocationRecord::get_label() const { - SharedAllocationHeader header; - - Kokkos::Impl::DeepCopy( - &header, RecordBase::head(), sizeof(SharedAllocationHeader)); - - return std::string(header.m_label); -} - -std::string SharedAllocationRecord::get_label() const { - return std::string(RecordBase::head()->m_label); -} - -SharedAllocationRecord* -SharedAllocationRecord::allocate( - const Kokkos::Experimental::HIPSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size) { - return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size); -} - -SharedAllocationRecord* -SharedAllocationRecord:: - allocate(const Kokkos::Experimental::HIPHostPinnedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size) { - return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size); -} - -void SharedAllocationRecord::deallocate( - SharedAllocationRecord* arg_rec) { - delete static_cast(arg_rec); -} - -void SharedAllocationRecord:: - deallocate(SharedAllocationRecord* arg_rec) { - delete static_cast(arg_rec); -} - SharedAllocationRecord::~SharedAllocationRecord() { const char* label = nullptr; @@ -393,7 +355,7 @@ SharedAllocationRecord:: const SharedAllocationRecord::function_type arg_dealloc) // Pass through allocated [ SharedAllocationHeader , user_memory ] // Pass through deallocation function - : SharedAllocationRecord( + : base_t( #ifdef KOKKOS_ENABLE_DEBUG 
&SharedAllocationRecord::s_root_record, @@ -405,13 +367,7 @@ SharedAllocationRecord:: SharedAllocationHeader header; - // Fill in the Header information - header.m_record = static_cast*>(this); - - strncpy(header.m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length); - // Set last element zero, in case c_str is too long - header.m_label[SharedAllocationHeader::maximum_label_length - 1] = (char)0; + this->base_t::_fill_host_accessible_header_info(header, arg_label); // Copy to device memory Kokkos::Impl::DeepCopy( @@ -425,7 +381,7 @@ SharedAllocationRecord:: const SharedAllocationRecord::function_type arg_dealloc) // Pass through allocated [ SharedAllocationHeader , user_memory ] // Pass through deallocation function - : SharedAllocationRecord( + : base_t( #ifdef KOKKOS_ENABLE_DEBUG &SharedAllocationRecord::s_root_record, @@ -435,223 +391,8 @@ SharedAllocationRecord:: sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc), m_space(arg_space) { // Fill in the Header information, directly accessible via host pinned memory - - RecordBase::m_alloc_ptr->m_record = this; - - strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length); - // Set last element zero, in case c_str is too long - RecordBase::m_alloc_ptr - ->m_label[SharedAllocationHeader::maximum_label_length - 1] = (char)0; -} - -//---------------------------------------------------------------------------- - -void* SharedAllocationRecord:: - allocate_tracked(const Kokkos::Experimental::HIPSpace& arg_space, - const std::string& arg_alloc_label, - const size_t arg_alloc_size) { - if (!arg_alloc_size) return nullptr; - - SharedAllocationRecord* const r = - allocate(arg_space, arg_alloc_label, arg_alloc_size); - - RecordBase::increment(r); - - return r->data(); -} - -void SharedAllocationRecord::deallocate_tracked(void* const - arg_alloc_ptr) { - if (arg_alloc_ptr != nullptr) { - SharedAllocationRecord* const r = 
get_record(arg_alloc_ptr); - - RecordBase::decrement(r); - } -} - -void* SharedAllocationRecord:: - reallocate_tracked(void* const arg_alloc_ptr, const size_t arg_alloc_size) { - SharedAllocationRecord* const r_old = get_record(arg_alloc_ptr); - SharedAllocationRecord* const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - Kokkos::Impl::DeepCopy( - r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); -} - -void* SharedAllocationRecord:: - allocate_tracked(const Kokkos::Experimental::HIPHostPinnedSpace& arg_space, - const std::string& arg_alloc_label, - const size_t arg_alloc_size) { - if (!arg_alloc_size) return nullptr; - - SharedAllocationRecord* const r = - allocate(arg_space, arg_alloc_label, arg_alloc_size); - - RecordBase::increment(r); - - return r->data(); -} - -void SharedAllocationRecord::deallocate_tracked(void* const - arg_alloc_ptr) { - if (arg_alloc_ptr) { - SharedAllocationRecord* const r = get_record(arg_alloc_ptr); - - RecordBase::decrement(r); - } -} - -void* SharedAllocationRecord:: - reallocate_tracked(void* const arg_alloc_ptr, const size_t arg_alloc_size) { - SharedAllocationRecord* const r_old = get_record(arg_alloc_ptr); - SharedAllocationRecord* const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - using HIPHostPinnedSpace = Kokkos::Experimental::HIPHostPinnedSpace; - Kokkos::Impl::DeepCopy( - r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); -} - -//---------------------------------------------------------------------------- - -SharedAllocationRecord* -SharedAllocationRecord::get_record( - void* alloc_ptr) { - using Header = SharedAllocationHeader; - using RecordHIP = - SharedAllocationRecord; - - // Copy the header from the allocation - Header head; - - Header 
const* const head_hip = - alloc_ptr ? Header::get_header(alloc_ptr) : nullptr; - - if (alloc_ptr) { - Kokkos::Impl::DeepCopy( - &head, head_hip, sizeof(SharedAllocationHeader)); - } - - RecordHIP* const record = - alloc_ptr ? static_cast(head.m_record) : nullptr; - - if (!alloc_ptr || record->m_alloc_ptr != head_hip) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HIPSpace " - ", void >::get_record ERROR")); - } - - return record; -} - -SharedAllocationRecord* -SharedAllocationRecord::get_record(void* alloc_ptr) { - using Header = SharedAllocationHeader; - using RecordHIP = - SharedAllocationRecord; - - Header* const h = - alloc_ptr ? reinterpret_cast(alloc_ptr) - 1 : nullptr; - - if (!alloc_ptr || h->m_record->m_alloc_ptr != h) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::SharedAllocationRecord< " - "Kokkos::Experimental::HIPHostPinnedSpace , void >::get_record ERROR")); - } - - return static_cast(h->m_record); -} - -// Iterate records to print orphaned memory ... 
-void SharedAllocationRecord:: - print_records(std::ostream& s, const Kokkos::Experimental::HIPSpace&, - bool detail) { -#ifdef KOKKOS_ENABLE_DEBUG - SharedAllocationRecord* r = &s_root_record; - - char buffer[256]; - - SharedAllocationHeader head; - - if (detail) { - do { - if (r->m_alloc_ptr) { - Kokkos::Impl::DeepCopy( - &head, r->m_alloc_ptr, sizeof(SharedAllocationHeader)); - } else { - head.m_label[0] = 0; - } - - // Formatting dependent on sizeof(uintptr_t) - const char* format_string; - - if (sizeof(uintptr_t) == sizeof(unsigned long)) { - format_string = - "HIP addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + " - "%.8ld ] count(%d) dealloc(0x%.12lx) %s\n"; - } else if (sizeof(uintptr_t) == sizeof(unsigned long long)) { - format_string = - "HIP addr( 0x%.12llx ) list( 0x%.12llx 0x%.12llx ) extent[ " - "0x%.12llx + %.8ld ] count(%d) dealloc(0x%.12llx) %s\n"; - } - - snprintf(buffer, 256, format_string, reinterpret_cast(r), - reinterpret_cast(r->m_prev), - reinterpret_cast(r->m_next), - reinterpret_cast(r->m_alloc_ptr), r->m_alloc_size, - r->m_count, reinterpret_cast(r->m_dealloc), - head.m_label); - s << buffer; - r = r->m_next; - } while (r != &s_root_record); - } else { - do { - if (r->m_alloc_ptr) { - Kokkos::Impl::DeepCopy( - &head, r->m_alloc_ptr, sizeof(SharedAllocationHeader)); - - // Formatting dependent on sizeof(uintptr_t) - const char* format_string; - - if (sizeof(uintptr_t) == sizeof(unsigned long)) { - format_string = "HIP [ 0x%.12lx + %ld ] %s\n"; - } else if (sizeof(uintptr_t) == sizeof(unsigned long long)) { - format_string = "HIP [ 0x%.12llx + %ld ] %s\n"; - } - - snprintf(buffer, 256, format_string, - reinterpret_cast(r->data()), r->size(), - head.m_label); - } else { - snprintf(buffer, 256, "HIP [ 0 + 0 ]\n"); - } - s << buffer; - r = r->m_next; - } while (r != &s_root_record); - } -#else - (void)s; - (void)detail; - throw_runtime_exception( - "Kokkos::Impl::SharedAllocationRecord::print_records" - " only works with 
KOKKOS_ENABLE_DEBUG enabled"); -#endif + this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, + arg_label); } } // namespace Impl @@ -680,63 +421,22 @@ void HIP::impl_initialize(const HIP::SelectDevice config) { void HIP::impl_finalize() { Impl::HIPInternal::singleton().finalize(); } HIP::HIP() - : m_space_instance(&Impl::HIPInternal::singleton()), m_counter(nullptr) { + : m_space_instance(&Impl::HIPInternal::singleton(), + [](Impl::HIPInternal*) {}) { Impl::HIPInternal::singleton().verify_is_initialized( "HIP instance constructor"); } HIP::HIP(hipStream_t const stream) - : m_space_instance(new Impl::HIPInternal), m_counter(new int(1)) { + : m_space_instance(new Impl::HIPInternal, [](Impl::HIPInternal* ptr) { + ptr->finalize(); + delete ptr; + }) { Impl::HIPInternal::singleton().verify_is_initialized( "HIP instance constructor"); m_space_instance->initialize(Impl::HIPInternal::singleton().m_hipDev, stream); } -KOKKOS_FUNCTION HIP::HIP(HIP&& other) noexcept { - m_space_instance = other.m_space_instance; - other.m_space_instance = nullptr; - m_counter = other.m_counter; - other.m_counter = nullptr; -} - -KOKKOS_FUNCTION HIP::HIP(HIP const& other) - : m_space_instance(other.m_space_instance), m_counter(other.m_counter) { -#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HIP_GPU - if (m_counter) Kokkos::atomic_add(m_counter, 1); -#endif -} - -KOKKOS_FUNCTION HIP& HIP::operator=(HIP&& other) noexcept { - m_space_instance = other.m_space_instance; - other.m_space_instance = nullptr; - m_counter = other.m_counter; - other.m_counter = nullptr; - - return *this; -} - -KOKKOS_FUNCTION HIP& HIP::operator=(HIP const& other) { - m_space_instance = other.m_space_instance; - m_counter = other.m_counter; -#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HIP_GPU - if (m_counter) Kokkos::atomic_add(m_counter, 1); -#endif - - return *this; -} - -KOKKOS_FUNCTION HIP::~HIP() noexcept { -#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HIP_GPU - if (m_counter == nullptr) 
return; - int const count = Kokkos::atomic_fetch_sub(m_counter, 1); - if (count == 1) { - delete m_counter; - m_space_instance->finalize(); - delete m_space_instance; - } -#endif -} - void HIP::print_configuration(std::ostream& s, const bool) { Impl::HIPInternal::singleton().print_configuration(s); } @@ -810,3 +510,26 @@ void HIPSpaceInitializer::print_configuration(std::ostream& msg, } // namespace Impl } // namespace Kokkos + +//============================================================================== +// {{{1 + +#include + +namespace Kokkos { +namespace Impl { + +// To avoid additional compilation cost for something that's (mostly?) not +// performance sensitive, we explicity instantiate these CRTP base classes here, +// where we have access to the associated *_timpl.hpp header files. +template class HostInaccessibleSharedAllocationRecordCommon< + Kokkos::Experimental::HIPSpace>; +template class SharedAllocationRecordCommon; +template class SharedAllocationRecordCommon< + Kokkos::Experimental::HIPHostPinnedSpace>; + +} // end namespace Impl +} // end namespace Kokkos + +// end Explicit instantiations of CRTP Base classes }}}1 +//============================================================================== diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp index 7571510c31fa..fe52886ced7c 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp @@ -644,13 +644,14 @@ KOKKOS_INLINE_FUNCTION thread, count); } -template -KOKKOS_INLINE_FUNCTION - Impl::ThreadVectorRangeBoundariesStruct - ThreadVectorRange(const Impl::HIPTeamMember& thread, iType arg_begin, - iType arg_end) { +template +KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct< + typename std::common_type::type, Impl::HIPTeamMember> +ThreadVectorRange(const Impl::HIPTeamMember& thread, iType1 arg_begin, + iType2 arg_end) { + using iType = typename std::common_type::type; 
return Impl::ThreadVectorRangeBoundariesStruct( - thread, arg_begin, arg_end); + thread, iType(arg_begin), iType(arg_end)); } KOKKOS_INLINE_FUNCTION @@ -961,7 +962,7 @@ KOKKOS_INLINE_FUNCTION //---------------------------------------------------------------------------- -/** \brief Intra-thread vector parallel exclusive prefix sum. +/** \brief Intra-thread vector parallel scan with reducer. * * Executes closure(iType i, ValueType & val, bool final) for each i=[0..N) * @@ -969,22 +970,21 @@ KOKKOS_INLINE_FUNCTION * thread and a scan operation is performed. * The last call to closure has final == true. */ -template -KOKKOS_INLINE_FUNCTION void parallel_scan( - const Impl::ThreadVectorRangeBoundariesStruct& - loop_boundaries, - const Closure& closure) { +template +KOKKOS_INLINE_FUNCTION + typename std::enable_if::value>::type + parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::HIPTeamMember>& loop_boundaries, + const Closure& closure, const ReducerType& reducer) { #ifdef __HIP_DEVICE_COMPILE__ - // Extract value_type from closure - - using value_type = typename Kokkos::Impl::FunctorAnalysis< - Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type; + using value_type = typename ReducerType::value_type; + value_type accum; + reducer.init(accum); + const value_type identity = accum; // Loop through boundaries by vector-length chunks // must scan at each iteration - value_type accum = 0; - // All thread "lanes" must loop the same number of times. // Determine an loop end for all thread "lanes." // Requires: @@ -997,47 +997,72 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( const int end = loop_boundaries.end + (rem ? blockDim.x - rem : 0); for (int i = threadIdx.x; i < end; i += blockDim.x) { - value_type val = 0; - - // First acquire per-lane contributions: - if (i < loop_boundaries.end) closure(i, val, false); + value_type val = identity; - value_type sval = val; + // First acquire per-lane contributions. 
+ // This sets i's val to i-1's contribution + // to make the latter in_place_shfl_up an + // exclusive scan -- the final accumulation + // of i's val will be included in the second + // closure call later. + if (i < loop_boundaries.end && threadIdx.x > 0) closure(i - 1, val, false); - // Bottom up inclusive scan in triangular pattern + // Bottom up exclusive scan in triangular pattern // where each HIP thread is the root of a reduction tree // from the zeroth "lane" to itself. // [t] += [t-1] if t >= 1 // [t] += [t-2] if t >= 2 // [t] += [t-4] if t >= 4 // ... - + // This differs from the non-reducer overload, where an inclusive scan was + // implemented, because in general the binary operator cannot be inverted + // and we would not be able to remove the inclusive contribution by + // inversion. for (int j = 1; j < static_cast(blockDim.x); j <<= 1) { - value_type tmp = 0; - ::Kokkos::Experimental::Impl::in_place_shfl_up(tmp, sval, j, blockDim.x); + value_type tmp = identity; + ::Kokkos::Experimental::Impl::in_place_shfl_up(tmp, val, j, blockDim.x); if (j <= static_cast(threadIdx.x)) { - sval += tmp; + reducer.join(val, tmp); } } - // Include accumulation and remove value for exclusive scan: - val = accum + sval - val; + // Include accumulation + reducer.join(val, accum); - // Provide exclusive scan value: + // Update i's contribution into the val + // and add it to accum for next round if (i < loop_boundaries.end) closure(i, val, true); - - // Accumulate the last value in the inclusive scan: - ::Kokkos::Experimental::Impl::in_place_shfl(sval, sval, blockDim.x - 1, + ::Kokkos::Experimental::Impl::in_place_shfl(accum, val, blockDim.x - 1, blockDim.x); - - accum += sval; } #else (void)loop_boundaries; (void)closure; + (void)reducer; #endif } +//---------------------------------------------------------------------------- + +/** \brief Intra-thread vector parallel exclusive prefix sum. 
+ * + * Executes closure(iType i, ValueType & val, bool final) for each i=[0..N) + * + * The range [0..N) is mapped to all vector lanes in the + * thread and a scan operation is performed. + * The last call to closure has final == true. + */ +template +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::ThreadVectorRangeBoundariesStruct& + loop_boundaries, + const Closure& closure) { + using value_type = typename Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type; + value_type dummy; + parallel_scan(loop_boundaries, closure, Kokkos::Sum(dummy)); +} + } // namespace Kokkos namespace Kokkos { diff --git a/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp index 140376425c29..b7d8e62f6960 100644 --- a/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp +++ b/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp @@ -48,17 +48,11 @@ #include #include - +#include #include #include -#include #include -#if defined(KOKKOS_ENABLE_CUDA) || \ - (defined(__HIPCC__) && defined(KOKKOS_ENABLE_HIP)) -#include -#endif - namespace Kokkos { // ------------------------------------------------------------------ // @@ -74,22 +68,14 @@ enum class Iterate template struct default_outer_direction { - using type = Iterate; -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) - static constexpr Iterate value = Iterate::Left; -#else + using type = Iterate; static constexpr Iterate value = Iterate::Right; -#endif }; template struct default_inner_direction { - using type = Iterate; -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) - static constexpr Iterate value = Iterate::Left; -#else + using type = Iterate; static constexpr Iterate value = Iterate::Right; -#endif }; // Iteration Pattern @@ -179,6 +165,25 @@ constexpr NVCC_WONT_LET_ME_CALL_YOU_Array to_array_potentially_narrowing( } return a; } + +struct TileSizeProperties { + int max_threads; + int 
default_largest_tile_size; + int default_tile_size; + int max_total_tile_size; +}; + +template +TileSizeProperties get_tile_size_properties(const ExecutionSpace&) { + // Host settings + TileSizeProperties properties; + properties.max_threads = std::numeric_limits::max(); + properties.default_largest_tile_size = 0; + properties.default_tile_size = 2; + properties.max_total_tile_size = std::numeric_limits::max(); + return properties; +} + } // namespace Impl // multi-dimensional iteration pattern @@ -208,7 +213,7 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { using launch_bounds = typename traits::launch_bounds; using member_type = typename range_policy::member_type; - enum { rank = static_cast(iteration_pattern::rank) }; + static constexpr int rank = iteration_pattern::rank; using index_type = typename traits::index_type; using array_index_type = std::int64_t; @@ -231,37 +236,20 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { point_type m_tile_end = {}; index_type m_num_tiles = 1; index_type m_prod_tile_dims = 1; + bool m_tune_tile_size = false; - /* - // NDE enum impl definition alternative - replace static constexpr int ? - enum { outer_direction = static_cast ( - (iteration_pattern::outer_direction != Iterate::Default) - ? iteration_pattern::outer_direction - : default_outer_direction< typename traits::execution_space>::value ) }; - - enum { inner_direction = static_cast ( - iteration_pattern::inner_direction != Iterate::Default - ? iteration_pattern::inner_direction - : default_inner_direction< typename traits::execution_space>::value ) }; - - enum { Right = static_cast( Iterate::Right ) }; - enum { Left = static_cast( Iterate::Left ) }; - */ - // static constexpr int rank = iteration_pattern::rank; - - static constexpr int outer_direction = static_cast( + static constexpr auto outer_direction = (iteration_pattern::outer_direction != Iterate::Default) ? 
iteration_pattern::outer_direction - : default_outer_direction::value); + : default_outer_direction::value; - static constexpr int inner_direction = static_cast( + static constexpr auto inner_direction = iteration_pattern::inner_direction != Iterate::Default ? iteration_pattern::inner_direction - : default_inner_direction::value); + : default_inner_direction::value; - // Ugly ugly workaround intel 14 not handling scoped enum correctly - static constexpr int Right = static_cast(Iterate::Right); - static constexpr int Left = static_cast(Iterate::Left); + static constexpr auto Right = Iterate::Right; + static constexpr auto Left = Iterate::Left; KOKKOS_INLINE_FUNCTION const typename traits::execution_space& space() const { return m_space; @@ -320,7 +308,7 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { point_type const& lower, point_type const& upper, tile_type const& tile = tile_type{}) : m_space(work_space), m_lower(lower), m_upper(upper), m_tile(tile) { - init(); + init_helper(Impl::get_tile_size_properties(work_space)); } template { m_tile(p.m_tile), m_tile_end(p.m_tile_end), m_num_tiles(p.m_num_tiles), - m_prod_tile_dims(p.m_prod_tile_dims) {} + m_prod_tile_dims(p.m_prod_tile_dims), + m_tune_tile_size(p.m_tune_tile_size) {} + + void impl_change_tile_size(const point_type& tile) { + m_tile = tile; + init_helper(Impl::get_tile_size_properties(m_space)); + } + bool impl_tune_tile_size() const { return m_tune_tile_size; } private: - void init() { - // Host - if (true -#if defined(KOKKOS_ENABLE_CUDA) - && !std::is_same::value -#endif -#if defined(KOKKOS_ENABLE_HIP) - && !std::is_same::value -#endif - ) { - index_type span; - for (int i = 0; i < rank; ++i) { - span = m_upper[i] - m_lower[i]; - if (m_tile[i] <= 0) { - if (((int)inner_direction == (int)Right && (i < rank - 1)) || - ((int)inner_direction == (int)Left && (i > 0))) { - m_tile[i] = 2; - } else { - m_tile[i] = (span == 0 ? 
1 : span); - } - } - m_tile_end[i] = - static_cast((span + m_tile[i] - 1) / m_tile[i]); - m_num_tiles *= m_tile_end[i]; - m_prod_tile_dims *= m_tile[i]; - } + void init_helper(Impl::TileSizeProperties properties) { + m_prod_tile_dims = 1; + int increment = 1; + int rank_start = 0; + int rank_end = rank; + if (inner_direction == Iterate::Right) { + increment = -1; + rank_start = rank - 1; + rank_end = -1; } -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) - else // Cuda or HIP - { - index_type span; - int increment = 1; - int rank_start = 0; - int rank_end = rank; - if ((int)inner_direction == (int)Right) { - increment = -1; - rank_start = rank - 1; - rank_end = -1; - } - bool is_cuda_exec_space = -#if defined(KOKKOS_ENABLE_CUDA) - std::is_same::value; -#else - false; -#endif - for (int i = rank_start; i != rank_end; i += increment) { - span = m_upper[i] - m_lower[i]; - if (m_tile[i] <= 0) { - // TODO: determine what is a good default tile size for Cuda and HIP - // may be rank dependent - if (((int)inner_direction == (int)Right && (i < rank - 1)) || - ((int)inner_direction == (int)Left && (i > 0))) { - if (m_prod_tile_dims < 256) { - m_tile[i] = (is_cuda_exec_space) ? 
2 : 4; - } else { - m_tile[i] = 1; - } + for (int i = rank_start; i != rank_end; i += increment) { + const index_type length = m_upper[i] - m_lower[i]; + if (m_tile[i] <= 0) { + m_tune_tile_size = true; + if ((inner_direction == Iterate::Right && (i < rank - 1)) || + (inner_direction == Iterate::Left && (i > 0))) { + if (m_prod_tile_dims * properties.default_tile_size < + static_cast(properties.max_total_tile_size)) { + m_tile[i] = properties.default_tile_size; } else { - m_tile[i] = 16; + m_tile[i] = 1; } - } - m_tile_end[i] = - static_cast((span + m_tile[i] - 1) / m_tile[i]); - m_num_tiles *= m_tile_end[i]; - m_prod_tile_dims *= m_tile[i]; - } - if (m_prod_tile_dims > - 1024) { // Match Cuda restriction for ParallelReduce; 1024,1024,64 - // max per dim (Kepler), but product num_threads < 1024 - if (is_cuda_exec_space) { - printf(" Tile dimensions exceed Cuda limits\n"); - Kokkos::abort( - "Cuda ExecSpace Error: MDRange tile dims exceed maximum number " - "of threads per block - choose smaller tile dims"); } else { - printf(" Tile dimensions exceed HIP limits\n"); - Kokkos::abort( - "HIP ExecSpace Error: MDRange tile dims exceed maximum number of " - "threads per block - choose smaller tile dims"); + m_tile[i] = properties.default_largest_tile_size == 0 + ? 
std::max(length, 1) + : properties.default_largest_tile_size; } } + m_tile_end[i] = + static_cast((length + m_tile[i] - 1) / m_tile[i]); + m_num_tiles *= m_tile_end[i]; + m_prod_tile_dims *= m_tile[i]; + } + if (m_prod_tile_dims > static_cast(properties.max_threads)) { + printf(" Product of tile dimensions exceed maximum limit: %d\n", + static_cast(properties.max_threads)); + Kokkos::abort( + "ExecSpace Error: MDRange tile dims exceed maximum number " + "of threads per block - choose smaller tile dims"); } -#endif } }; diff --git a/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp b/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp index 8e226a078d1a..fb94049d7ad7 100644 --- a/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp +++ b/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp @@ -104,20 +104,6 @@ struct MemorySpaceAccess { enum : bool { deepcopy = true }; }; -template -struct VerifyExecutionCanAccessMemorySpace { - enum { value = 1 }; - KOKKOS_INLINE_FUNCTION static void verify(void) {} - KOKKOS_INLINE_FUNCTION static void verify(const void *) {} -}; - -template -struct VerifyExecutionCanAccessMemorySpace { - enum { value = 1 }; - KOKKOS_INLINE_FUNCTION static void verify(void) {} - KOKKOS_INLINE_FUNCTION static void verify(const void *) {} -}; - } // namespace Impl } // namespace Kokkos diff --git a/packages/kokkos/core/src/Kokkos_Complex.hpp b/packages/kokkos/core/src/Kokkos_Complex.hpp index fb2925a066f5..6578723fc8e5 100644 --- a/packages/kokkos/core/src/Kokkos_Complex.hpp +++ b/packages/kokkos/core/src/Kokkos_Complex.hpp @@ -45,14 +45,13 @@ #define KOKKOS_COMPLEX_HPP #include +#include #include +#include #include +#include #include -#ifdef KOKKOS_ENABLE_SYCL -#include -#endif - namespace Kokkos { /// \class complex @@ -220,10 +219,11 @@ class // Conditional noexcept, just in case RType throws on divide-by-zero KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator/=( const complex& y) noexcept(noexcept(RealType{} / RealType{})) { + using 
Kokkos::Experimental::fabs; // Scale (by the "1-norm" of y) to avoid unwarranted overflow. // If the real part is +/-Inf and the imaginary part is -/+Inf, // this won't change the result. - const RealType s = std::fabs(y.real()) + std::fabs(y.imag()); + const RealType s = fabs(y.real()) + fabs(y.imag()); // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0. // In that case, the relation x/y == (x/s) / (y/s) doesn't hold, @@ -248,10 +248,11 @@ class KOKKOS_INLINE_FUNCTION complex& operator/=( const std::complex& y) noexcept(noexcept(RealType{} / RealType{})) { + using Kokkos::Experimental::fabs; // Scale (by the "1-norm" of y) to avoid unwarranted overflow. // If the real part is +/-Inf and the imaginary part is -/+Inf, // this won't change the result. - const RealType s = std::fabs(y.real()) + std::fabs(y.imag()); + const RealType s = fabs(y.real()) + fabs(y.imag()); // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0. // In that case, the relation x/y == (x/s) / (y/s) doesn't hold, @@ -693,35 +694,96 @@ KOKKOS_INLINE_FUNCTION RealType real(const complex& x) noexcept { return x.real(); } +//! Constructs a complex number from magnitude and phase angle +template +KOKKOS_INLINE_FUNCTION complex polar(const T& r, const T& theta = T()) { + using Kokkos::Experimental::cos; + using Kokkos::Experimental::sin; + KOKKOS_EXPECTS(r >= 0); + return complex(r * cos(theta), r * sin(theta)); +} + //! Absolute value (magnitude) of a complex number. template KOKKOS_INLINE_FUNCTION RealType abs(const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::hypot; -#else - using std::hypot; -#endif + using Kokkos::Experimental::hypot; return hypot(x.real(), x.imag()); } //! 
Power of a complex number -template -KOKKOS_INLINE_FUNCTION Kokkos::complex pow(const complex& x, - const RealType& e) { - RealType r = abs(x); -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::atan; - using cl::sycl::cos; - using cl::sycl::pow; - using cl::sycl::sin; -#else - using std::atan; - using std::cos; - using std::pow; - using std::sin; -#endif - RealType phi = atan(x.imag() / x.real()); - return pow(r, e) * Kokkos::complex(cos(phi * e), sin(phi * e)); +template +KOKKOS_INLINE_FUNCTION complex pow(const complex& x, const T& y) { + using Kokkos::Experimental::atan2; + using Kokkos::Experimental::pow; + T r = abs(x); + T theta = atan2(x.imag(), x.real()); + return polar(pow(r, y), y * theta); +} + +template +KOKKOS_INLINE_FUNCTION complex pow(const T& x, const complex& y) { + return pow(complex(x), y); +} + +template +KOKKOS_INLINE_FUNCTION complex pow(const complex& x, + const complex& y) { + using Kokkos::Experimental::log; + + return x == T() ? T() : exp(y * log(x)); +} + +namespace Impl { +// NOTE promote would also be useful for math functions +template ::value> +struct promote { + using type = double; +}; +template +struct promote {}; +template <> +struct promote { + using type = long double; +}; +template <> +struct promote { + using type = double; +}; +template <> +struct promote { + using type = float; +}; +template +using promote_t = typename promote::type; +template +struct promote_2 { + using type = decltype(promote_t() + promote_t()); +}; +template +using promote_2_t = typename promote_2::type; +} // namespace Impl + +template ::value>> +KOKKOS_INLINE_FUNCTION complex> pow( + const T& x, const complex& y) { + using type = Impl::promote_2_t; + return pow(type(x), complex(y)); +} + +template ::value>> +KOKKOS_INLINE_FUNCTION complex> pow(const complex& x, + const U& y) { + using type = Impl::promote_2_t; + return pow(complex(x), type(y)); +} + +template +KOKKOS_INLINE_FUNCTION complex> pow( + const complex& x, const complex& y) 
{ + using type = Impl::promote_2_t; + return pow(complex(x), complex(y)); } //! Square root of a complex number. This is intended to match the stdc++ @@ -729,26 +791,21 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex pow(const complex& x, template KOKKOS_INLINE_FUNCTION Kokkos::complex sqrt( const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::abs; - using cl::sycl::sqrt; -#else - using std::abs; - using std::sqrt; -#endif + using Kokkos::Experimental::fabs; + using Kokkos::Experimental::sqrt; RealType r = x.real(); RealType i = x.imag(); if (r == RealType()) { - RealType t = sqrt(abs(i) / 2); + RealType t = sqrt(fabs(i) / 2); return Kokkos::complex(t, i < RealType() ? -t : t); } else { - RealType t = sqrt(2 * (abs(x) + abs(r))); + RealType t = sqrt(2 * (abs(x) + fabs(r))); RealType u = t / 2; - return r > RealType() - ? Kokkos::complex(u, i / t) - : Kokkos::complex(abs(i) / t, i < RealType() ? -u : u); + return r > RealType() ? Kokkos::complex(u, i / t) + : Kokkos::complex(fabs(i) / t, + i < RealType() ? -u : u); } } @@ -762,15 +819,9 @@ KOKKOS_INLINE_FUNCTION complex conj( //! Exponential of a complex number. 
template KOKKOS_INLINE_FUNCTION complex exp(const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::cos; - using cl::sycl::exp; - using cl::sycl::sin; -#else - using std::cos; - using std::exp; - using std::sin; -#endif + using Kokkos::Experimental::cos; + using Kokkos::Experimental::exp; + using Kokkos::Experimental::sin; return exp(x.real()) * complex(cos(x.imag()), sin(x.imag())); } @@ -778,14 +829,9 @@ KOKKOS_INLINE_FUNCTION complex exp(const complex& x) { template KOKKOS_INLINE_FUNCTION Kokkos::complex log( const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::atan; - using cl::sycl::log; -#else - using std::atan; - using std::log; -#endif - RealType phi = atan(x.imag() / x.real()); + using Kokkos::Experimental::atan2; + using Kokkos::Experimental::log; + RealType phi = atan2(x.imag(), x.real()); return Kokkos::complex(log(abs(x)), phi); } @@ -793,17 +839,10 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex log( template KOKKOS_INLINE_FUNCTION Kokkos::complex sin( const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::cos; - using cl::sycl::cosh; - using cl::sycl::sin; - using cl::sycl::sinh; -#else - using std::cos; - using std::cosh; - using std::sin; - using std::sinh; -#endif + using Kokkos::Experimental::cos; + using Kokkos::Experimental::cosh; + using Kokkos::Experimental::sin; + using Kokkos::Experimental::sinh; return Kokkos::complex(sin(x.real()) * cosh(x.imag()), cos(x.real()) * sinh(x.imag())); } @@ -812,17 +851,10 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex sin( template KOKKOS_INLINE_FUNCTION Kokkos::complex cos( const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::cos; - using cl::sycl::cosh; - using cl::sycl::sin; - using cl::sycl::sinh; -#else - using std::cos; - using std::cosh; - using std::sin; - using std::sinh; -#endif + using Kokkos::Experimental::cos; + using Kokkos::Experimental::cosh; + using 
Kokkos::Experimental::sin; + using Kokkos::Experimental::sinh; return Kokkos::complex(cos(x.real()) * cosh(x.imag()), -sin(x.real()) * sinh(x.imag())); } @@ -838,17 +870,10 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex tan( template KOKKOS_INLINE_FUNCTION Kokkos::complex sinh( const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::cos; - using cl::sycl::cosh; - using cl::sycl::sin; - using cl::sycl::sinh; -#else - using std::cos; - using std::cosh; - using std::sin; - using std::sinh; -#endif + using Kokkos::Experimental::cos; + using Kokkos::Experimental::cosh; + using Kokkos::Experimental::sin; + using Kokkos::Experimental::sinh; return Kokkos::complex(sinh(x.real()) * cos(x.imag()), cosh(x.real()) * sin(x.imag())); } @@ -857,17 +882,10 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex sinh( template KOKKOS_INLINE_FUNCTION Kokkos::complex cosh( const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::cos; - using cl::sycl::cosh; - using cl::sycl::sin; - using cl::sycl::sinh; -#else - using std::cos; - using std::cosh; - using std::sin; - using std::sinh; -#endif + using Kokkos::Experimental::cos; + using Kokkos::Experimental::cosh; + using Kokkos::Experimental::sin; + using Kokkos::Experimental::sinh; return Kokkos::complex(cosh(x.real()) * cos(x.imag()), sinh(x.real()) * sin(x.imag())); } @@ -898,13 +916,8 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex acosh( template KOKKOS_INLINE_FUNCTION Kokkos::complex atanh( const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::atan2; - using cl::sycl::log; -#else - using std::atan2; - using std::log; -#endif + using Kokkos::Experimental::atan2; + using Kokkos::Experimental::log; const RealType i2 = x.imag() * x.imag(); const RealType r = RealType(1.0) - i2 - x.real() * x.real(); @@ -933,12 +946,7 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex asin( template KOKKOS_INLINE_FUNCTION Kokkos::complex acos( const complex& x) { -#ifdef 
KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::acos; - -#else - using std::acos; -#endif + using Kokkos::Experimental::acos; Kokkos::complex t = asin(x); RealType pi_2 = acos(RealType(0.0)); return Kokkos::complex(pi_2 - t.real(), -t.imag()); @@ -948,13 +956,8 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex acos( template KOKKOS_INLINE_FUNCTION Kokkos::complex atan( const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::atan2; - using cl::sycl::log; -#else - using std::atan2; - using std::log; -#endif + using Kokkos::Experimental::atan2; + using Kokkos::Experimental::log; const RealType r2 = x.real() * x.real(); const RealType i = RealType(1.0) - r2 - x.imag() * x.imag(); @@ -996,12 +999,13 @@ KOKKOS_INLINE_FUNCTION operator/(const complex& x, const complex& y) noexcept(noexcept(RealType1{} / RealType2{})) { + using Kokkos::Experimental::fabs; // Scale (by the "1-norm" of y) to avoid unwarranted overflow. // If the real part is +/-Inf and the imaginary part is -/+Inf, // this won't change the result. using common_real_type = typename std::common_type::type; - const common_real_type s = std::fabs(real(y)) + std::fabs(imag(y)); + const common_real_type s = fabs(real(y)) + fabs(imag(y)); // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0. 
// In that case, the relation x/y == (x/s) / (y/s) doesn't hold, @@ -1046,7 +1050,7 @@ std::istream& operator>>(std::istream& is, complex& x) { } template -struct reduction_identity > { +struct reduction_identity> { using t_red_ident = reduction_identity; KOKKOS_FORCEINLINE_FUNCTION constexpr static Kokkos::complex sum() noexcept { diff --git a/packages/kokkos/core/src/Kokkos_Core.hpp b/packages/kokkos/core/src/Kokkos_Core.hpp index 4dac463a6671..c3771ab393f3 100644 --- a/packages/kokkos/core/src/Kokkos_Core.hpp +++ b/packages/kokkos/core/src/Kokkos_Core.hpp @@ -58,6 +58,7 @@ #include #include #include +#include #include #include #include @@ -86,6 +87,10 @@ struct InitArguments { int skip_device; bool disable_warnings; bool tune_internals; + bool tool_help = false; + std::string tool_lib = {}; + std::string tool_args = {}; + InitArguments(int nt = -1, int nn = -1, int dv = -1, bool dw = false, bool ti = false) : num_threads{nt}, @@ -139,6 +144,10 @@ void pre_initialize(const InitArguments& args); void post_initialize(const InitArguments& args); +void declare_configuration_metadata(const std::string& category, + const std::string& key, + const std::string& value); + } // namespace Impl bool is_initialized() noexcept; diff --git a/packages/kokkos/core/src/Kokkos_Core_fwd.hpp b/packages/kokkos/core/src/Kokkos_Core_fwd.hpp index 7502719c73d0..fe7eba3f6ef1 100644 --- a/packages/kokkos/core/src/Kokkos_Core_fwd.hpp +++ b/packages/kokkos/core/src/Kokkos_Core_fwd.hpp @@ -50,6 +50,7 @@ // and compiler environment then sets a collection of #define macros. #include +#include #include #include @@ -180,7 +181,6 @@ using DefaultHostExecutionSpace KOKKOS_IMPL_DEFAULT_HOST_EXEC_SPACE_ANNOTATION = // a given memory space. 
namespace Kokkos { - namespace Impl { #if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA) && \ @@ -196,16 +196,22 @@ using ActiveExecutionMemorySpace = Kokkos::HostSpace; using ActiveExecutionMemorySpace = void; #endif -template -struct VerifyExecutionCanAccessMemorySpace { - enum { value = 0 }; +template +struct MemorySpaceAccess; + +template ::accessible> +struct verify_space { + KOKKOS_FUNCTION static void check() {} }; -template -struct VerifyExecutionCanAccessMemorySpace { - enum { value = 1 }; - KOKKOS_INLINE_FUNCTION static void verify(void) {} - KOKKOS_INLINE_FUNCTION static void verify(const void *) {} +template +struct verify_space { + KOKKOS_FUNCTION static void check() { + Kokkos::abort( + "Kokkos::View ERROR: attempt to access inaccessible memory space"); + }; }; // Base class for exec space initializer factories @@ -220,13 +226,13 @@ class LogicalMemorySpace; } // namespace Kokkos -#define KOKKOS_RESTRICT_EXECUTION_TO_DATA(DATA_SPACE, DATA_PTR) \ - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \ - Kokkos::Impl::ActiveExecutionMemorySpace, DATA_SPACE>::verify(DATA_PTR) +#define KOKKOS_RESTRICT_EXECUTION_TO_DATA(DATA_SPACE, DATA_PTR) \ + Kokkos::Impl::verify_space::check(); -#define KOKKOS_RESTRICT_EXECUTION_TO_(DATA_SPACE) \ - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \ - Kokkos::Impl::ActiveExecutionMemorySpace, DATA_SPACE>::verify() +#define KOKKOS_RESTRICT_EXECUTION_TO_(DATA_SPACE) \ + Kokkos::Impl::verify_space::check(); //---------------------------------------------------------------------------- @@ -256,8 +262,7 @@ template struct ViewCopy; -template +template struct FunctorPolicyExecutionSpace; //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/Kokkos_Crs.hpp b/packages/kokkos/core/src/Kokkos_Crs.hpp index 4a573d82c044..1a10500b19a5 100644 --- a/packages/kokkos/core/src/Kokkos_Crs.hpp +++ b/packages/kokkos/core/src/Kokkos_Crs.hpp @@ -199,7 +199,7 @@ class 
CrsRowMapFromCounts { public: KOKKOS_INLINE_FUNCTION void operator()(index_type i, value_type& update, bool final_pass) const { - if (i < m_in.size()) { + if (i < static_cast(m_in.size())) { update += m_in(i); if (final_pass) m_out(i + 1) = update; } else if (final_pass) { diff --git a/packages/kokkos/core/src/Kokkos_Cuda.hpp b/packages/kokkos/core/src/Kokkos_Cuda.hpp index 81e11f3f1285..7a218120bb7b 100644 --- a/packages/kokkos/core/src/Kokkos_Cuda.hpp +++ b/packages/kokkos/core/src/Kokkos_Cuda.hpp @@ -63,6 +63,7 @@ #include #include #include +#include /*--------------------------------------------------------------------------*/ @@ -198,16 +199,6 @@ class Cuda { Cuda(); - KOKKOS_FUNCTION Cuda(Cuda&& other) noexcept; - - KOKKOS_FUNCTION Cuda(const Cuda& other); - - KOKKOS_FUNCTION Cuda& operator=(Cuda&& other) noexcept; - - KOKKOS_FUNCTION Cuda& operator=(const Cuda& other); - - KOKKOS_FUNCTION ~Cuda() noexcept; - Cuda(cudaStream_t stream); //-------------------------------------------------------------------------- @@ -253,13 +244,12 @@ class Cuda { static const char* name(); inline Impl::CudaInternal* impl_internal_space_instance() const { - return m_space_instance; + return m_space_instance.get(); } uint32_t impl_instance_id() const noexcept { return 0; } private: - Impl::CudaInternal* m_space_instance; - int* m_counter; + Kokkos::Impl::HostSharedPtr m_space_instance; }; namespace Tools { @@ -319,38 +309,8 @@ struct MemorySpaceAccess -struct VerifyExecutionCanAccessMemorySpace { - enum : bool { value = true }; - KOKKOS_INLINE_FUNCTION static void verify(void) {} - KOKKOS_INLINE_FUNCTION static void verify(const void*) {} -}; - -template <> -struct VerifyExecutionCanAccessMemorySpace { - enum : bool { value = false }; - inline static void verify(void) { CudaSpace::access_error(); } - inline static void verify(const void* p) { CudaSpace::access_error(p); } -}; - } // namespace Impl } // namespace Kokkos 
-/*--------------------------------------------------------------------------*/ -/*--------------------------------------------------------------------------*/ - -#include -#include -#include -#include -#include -#include -#include - -#include -//---------------------------------------------------------------------------- - #endif /* #if defined( KOKKOS_ENABLE_CUDA ) */ #endif /* #ifndef KOKKOS_CUDA_HPP */ diff --git a/packages/kokkos/core/src/Kokkos_CudaSpace.hpp b/packages/kokkos/core/src/Kokkos_CudaSpace.hpp index fc1c0e2f8a10..e10fae93c7ca 100644 --- a/packages/kokkos/core/src/Kokkos_CudaSpace.hpp +++ b/packages/kokkos/core/src/Kokkos_CudaSpace.hpp @@ -53,8 +53,10 @@ #include #include #include +#include #include +#include #include @@ -119,8 +121,8 @@ class CudaSpace { /*--------------------------------*/ /** \brief Error reporting for HostSpace attempt to access CudaSpace */ - static void access_error(); - static void access_error(const void* const); + KOKKOS_DEPRECATED static void access_error(); + KOKKOS_DEPRECATED static void access_error(const void* const); private: int m_device; ///< Which Cuda device @@ -128,42 +130,6 @@ class CudaSpace { static constexpr const char* m_name = "Cuda"; friend class Kokkos::Impl::SharedAllocationRecord; }; - -namespace Impl { -/// \brief Initialize lock array for arbitrary size atomics. -/// -/// Arbitrary atomics are implemented using a hash table of locks -/// where the hash value is derived from the address of the -/// object for which an atomic operation is performed. -/// This function initializes the locks to zero (unset). -void init_lock_arrays_cuda_space(); - -/// \brief Retrieve the pointer to the lock array for arbitrary size atomics. -/// -/// Arbitrary atomics are implemented using a hash table of locks -/// where the hash value is derived from the address of the -/// object for which an atomic operation is performed. -/// This function retrieves the lock array pointer. 
-/// If the array is not yet allocated it will do so. -int* atomic_lock_array_cuda_space_ptr(bool deallocate = false); - -/// \brief Retrieve the pointer to the scratch array for team and thread private -/// global memory. -/// -/// Team and Thread private scratch allocations in -/// global memory are acquired via locks. -/// This function retrieves the lock array pointer. -/// If the array is not yet allocated it will do so. -int* scratch_lock_array_cuda_space_ptr(bool deallocate = false); - -/// \brief Retrieve the pointer to the scratch array for unique identifiers. -/// -/// Unique identifiers in the range 0-Cuda::concurrency -/// are provided via locks. -/// This function retrieves the lock array pointer. -/// If the array is not yet allocated it will do so. -int* threadid_lock_array_cuda_space_ptr(bool deallocate = false); -} // namespace Impl } // namespace Kokkos /*--------------------------------------------------------------------------*/ @@ -313,6 +279,11 @@ class CudaHostPinnedSpace { namespace Kokkos { namespace Impl { +cudaStream_t cuda_get_deep_copy_stream(); + +const std::unique_ptr& cuda_get_deep_copy_space( + bool initialize = true); + static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, ""); @@ -784,104 +755,21 @@ struct DeepCopy { namespace Kokkos { namespace Impl { -/** Running in CudaSpace attempting to access HostSpace: error */ -template <> -struct VerifyExecutionCanAccessMemorySpace { - enum : bool { value = false }; - KOKKOS_INLINE_FUNCTION static void verify(void) { - Kokkos::abort("Cuda code attempted to access HostSpace memory"); - } - - KOKKOS_INLINE_FUNCTION static void verify(const void*) { - Kokkos::abort("Cuda code attempted to access HostSpace memory"); - } -}; - -/** Running in CudaSpace accessing CudaUVMSpace: ok */ -template <> -struct VerifyExecutionCanAccessMemorySpace { - enum : bool { value = true }; - KOKKOS_INLINE_FUNCTION static void verify(void) {} - KOKKOS_INLINE_FUNCTION static void verify(const void*) {} -}; - 
-/** Running in CudaSpace accessing CudaHostPinnedSpace: ok */ -template <> -struct VerifyExecutionCanAccessMemorySpace { - enum : bool { value = true }; - KOKKOS_INLINE_FUNCTION static void verify(void) {} - KOKKOS_INLINE_FUNCTION static void verify(const void*) {} -}; - -/** Running in CudaSpace attempting to access an unknown space: error */ -template -struct VerifyExecutionCanAccessMemorySpace< - typename std::enable_if::value, - Kokkos::CudaSpace>::type, - OtherSpace> { - enum : bool { value = false }; - KOKKOS_INLINE_FUNCTION static void verify(void) { - Kokkos::abort("Cuda code attempted to access unknown Space memory"); - } - - KOKKOS_INLINE_FUNCTION static void verify(const void*) { - Kokkos::abort("Cuda code attempted to access unknown Space memory"); - } -}; - -//---------------------------------------------------------------------------- -/** Running in HostSpace attempting to access CudaSpace */ -template <> -struct VerifyExecutionCanAccessMemorySpace { - enum : bool { value = false }; - inline static void verify(void) { CudaSpace::access_error(); } - inline static void verify(const void* p) { CudaSpace::access_error(p); } -}; - -/** Running in HostSpace accessing CudaUVMSpace is OK */ -template <> -struct VerifyExecutionCanAccessMemorySpace { - enum : bool { value = true }; - inline static void verify(void) {} - inline static void verify(const void*) {} -}; - -/** Running in HostSpace accessing CudaHostPinnedSpace is OK */ -template <> -struct VerifyExecutionCanAccessMemorySpace { - enum : bool { value = true }; - KOKKOS_INLINE_FUNCTION static void verify(void) {} - KOKKOS_INLINE_FUNCTION static void verify(const void*) {} -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - template <> class SharedAllocationRecord - : public SharedAllocationRecord { + : 
public HostInaccessibleSharedAllocationRecordCommon { private: friend class SharedAllocationRecord; + friend class SharedAllocationRecordCommon; + friend class HostInaccessibleSharedAllocationRecordCommon; using RecordBase = SharedAllocationRecord; + using base_t = + HostInaccessibleSharedAllocationRecordCommon; SharedAllocationRecord(const SharedAllocationRecord&) = delete; SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - static void deallocate(RecordBase*); - static ::cudaTextureObject_t attach_texture_object( const unsigned sizeof_alias, void* const alloc_ptr, const size_t alloc_size); @@ -890,39 +778,19 @@ class SharedAllocationRecord static RecordBase s_root_record; #endif - ::cudaTextureObject_t m_tex_obj; + ::cudaTextureObject_t m_tex_obj = 0; const Kokkos::CudaSpace m_space; protected: ~SharedAllocationRecord(); - SharedAllocationRecord() : RecordBase(), m_tex_obj(0), m_space() {} + SharedAllocationRecord() = default; SharedAllocationRecord( const Kokkos::CudaSpace& arg_space, const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); + const RecordBase::function_type arg_dealloc = &base_t::deallocate); public: - std::string get_label() const; - - static SharedAllocationRecord* allocate(const Kokkos::CudaSpace& arg_space, - const std::string& arg_label, - const size_t arg_alloc_size); - - /**\brief Allocate tracked memory in the space */ - static void* allocate_tracked(const Kokkos::CudaSpace& arg_space, - const std::string& arg_label, - const size_t arg_alloc_size); - - /**\brief Reallocate tracked memory in the space */ - static void* reallocate_tracked(void* const arg_alloc_ptr, - const size_t arg_alloc_size); - - /**\brief Deallocate tracked memory in the space */ - static void deallocate_tracked(void* const arg_alloc_ptr); - - static SharedAllocationRecord* get_record(void* arg_alloc_ptr); - template inline ::cudaTextureObject_t attach_texture_object() { 
static_assert((std::is_same::value || @@ -945,57 +813,35 @@ class SharedAllocationRecord // Texture object is attached to the entire allocation range return ptr - reinterpret_cast(RecordBase::m_alloc_ptr); } - - static void print_records(std::ostream&, const Kokkos::CudaSpace&, - bool detail = false); }; template <> class SharedAllocationRecord - : public SharedAllocationRecord { + : public SharedAllocationRecordCommon { private: + friend class SharedAllocationRecordCommon; + + using base_t = SharedAllocationRecordCommon; using RecordBase = SharedAllocationRecord; SharedAllocationRecord(const SharedAllocationRecord&) = delete; SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - static void deallocate(RecordBase*); - static RecordBase s_root_record; - ::cudaTextureObject_t m_tex_obj; + ::cudaTextureObject_t m_tex_obj = 0; const Kokkos::CudaUVMSpace m_space; protected: ~SharedAllocationRecord(); - SharedAllocationRecord() : RecordBase(), m_tex_obj(0), m_space() {} + SharedAllocationRecord() = default; SharedAllocationRecord( const Kokkos::CudaUVMSpace& arg_space, const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); + const RecordBase::function_type arg_dealloc = &base_t::deallocate); public: - std::string get_label() const; - - static SharedAllocationRecord* allocate(const Kokkos::CudaUVMSpace& arg_space, - const std::string& arg_label, - const size_t arg_alloc_size); - - /**\brief Allocate tracked memory in the space */ - static void* allocate_tracked(const Kokkos::CudaUVMSpace& arg_space, - const std::string& arg_label, - const size_t arg_alloc_size); - - /**\brief Reallocate tracked memory in the space */ - static void* reallocate_tracked(void* const arg_alloc_ptr, - const size_t arg_alloc_size); - - /**\brief Deallocate tracked memory in the space */ - static void deallocate_tracked(void* const arg_alloc_ptr); - - static SharedAllocationRecord* get_record(void* 
arg_alloc_ptr); - template inline ::cudaTextureObject_t attach_texture_object() { static_assert((std::is_same::value || @@ -1019,57 +865,32 @@ class SharedAllocationRecord // Texture object is attached to the entire allocation range return ptr - reinterpret_cast(RecordBase::m_alloc_ptr); } - - static void print_records(std::ostream&, const Kokkos::CudaUVMSpace&, - bool detail = false); }; template <> class SharedAllocationRecord - : public SharedAllocationRecord { + : public SharedAllocationRecordCommon { private: + friend class SharedAllocationRecordCommon; + using RecordBase = SharedAllocationRecord; + using base_t = SharedAllocationRecordCommon; SharedAllocationRecord(const SharedAllocationRecord&) = delete; SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - static void deallocate(RecordBase*); - static RecordBase s_root_record; const Kokkos::CudaHostPinnedSpace m_space; protected: ~SharedAllocationRecord(); - SharedAllocationRecord() : RecordBase(), m_space() {} + SharedAllocationRecord() = default; SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace& arg_space, const std::string& arg_label, const size_t arg_alloc_size, const RecordBase::function_type arg_dealloc = &deallocate); - - public: - std::string get_label() const; - - static SharedAllocationRecord* allocate( - const Kokkos::CudaHostPinnedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size); - /**\brief Allocate tracked memory in the space */ - static void* allocate_tracked(const Kokkos::CudaHostPinnedSpace& arg_space, - const std::string& arg_label, - const size_t arg_alloc_size); - - /**\brief Reallocate tracked memory in the space */ - static void* reallocate_tracked(void* const arg_alloc_ptr, - const size_t arg_alloc_size); - - /**\brief Deallocate tracked memory in the space */ - static void deallocate_tracked(void* const arg_alloc_ptr); - - static SharedAllocationRecord* get_record(void* arg_alloc_ptr); - - static void 
print_records(std::ostream&, const Kokkos::CudaHostPinnedSpace&, - bool detail = false); }; } // namespace Impl diff --git a/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp b/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp index 3afe08170134..55aed13670e6 100644 --- a/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp +++ b/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp @@ -856,11 +856,12 @@ KOKKOS_INLINE_FUNCTION_DELETED Impl::ThreadVectorRangeBoundariesStruct ThreadVectorRange(const TeamMemberType&, const iType& count) = delete; -template -KOKKOS_INLINE_FUNCTION_DELETED - Impl::ThreadVectorRangeBoundariesStruct - ThreadVectorRange(const TeamMemberType&, const iType& arg_begin, - const iType& arg_end) = delete; +template +KOKKOS_INLINE_FUNCTION_DELETED Impl::ThreadVectorRangeBoundariesStruct< + typename std::common_type::type, TeamMemberType> +ThreadVectorRange(const TeamMemberType&, const iType1& arg_begin, + const iType2& arg_end) = delete; namespace Impl { @@ -902,85 +903,6 @@ struct ParallelConstructName { } // namespace Kokkos namespace Kokkos { -namespace Experimental { - -namespace Impl { -template -struct PolicyPropertyAdaptor; - -template class Policy, - class... Properties> -struct PolicyPropertyAdaptor, - Policy> { - using policy_in_t = Policy; - static_assert(is_execution_policy::value, ""); - using policy_out_t = Policy, - typename policy_in_t::traits::occupancy_control>; -}; - -template